Passing Different Scoring Methods

By default, cross_validate and GridSearchCV fall back on the estimator's own .score method (R² for regressors, accuracy for classifiers). The scoring argument lets us swap in the metric we actually care about.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

housing_df = pd.read_csv("data/housing.csv")
train_df, test_df = train_test_split(housing_df, test_size=0.1, random_state=123)
X_train = train_df.drop(columns=["median_house_value"])
y_train = train_df["median_house_value"]
X_test = test_df.drop(columns=["median_house_value"])
y_test = test_df["median_house_value"]

numeric_features = [ "longitude", "latitude",
                     "housing_median_age",
                     "households", "median_income",
                     "rooms_per_household",
                     "bedrooms_per_household",
                     "population_per_household"]
                     
categorical_features = ["ocean_proximity"]

X_train.head(3)
longitude latitude housing_median_age households ... ocean_proximity rooms_per_household bedrooms_per_household population_per_household
6051 -117.75 34.04 22.0 602.0 ... INLAND 4.897010 1.056478 4.318937
20113 -119.57 37.94 17.0 20.0 ... INLAND 17.300000 6.500000 2.550000
14289 -117.13 32.74 46.0 708.0 ... NEAR OCEAN 4.738701 1.084746 2.057910

3 rows × 9 columns

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), 
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
           ("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="passthrough",
)

pipe_regression = make_pipeline(preprocessor, KNeighborsRegressor())

Cross-validation

The default R² score is hard to interpret in dollars. Passing scoring="neg_root_mean_squared_error" reports RMSE instead, negated because scikit-learn treats every score as "greater is better".

from sklearn.model_selection import cross_validate

pd.DataFrame(cross_validate(
    pipe_regression, X_train, y_train,
    return_train_score=True,
    scoring="neg_root_mean_squared_error",
))
fit_time score_time test_score train_score
0 0.035630 0.270113 -62462.584290 -51440.540539
1 0.032212 0.254512 -63437.715015 -51263.979666
2 0.031295 0.257896 -62613.202523 -51758.817852
3 0.031067 0.262459 -64204.295214 -51343.743586
4 0.030682 0.216734 -59217.838633 -47325.157312
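
The string above is one of many built-in scorers. If you are unsure which names the scoring argument accepts, scikit-learn can list them all (a quick check, available as of scikit-learn 1.0; not part of the original pipeline):

from sklearn.metrics import get_scorer_names

# All metric names accepted as `scoring` strings (scikit-learn >= 1.0)
get_scorer_names()[:10]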

For metrics that scikit-learn does not ship as a string, make_scorer turns any metric function into a scorer. Here we define mean absolute percentage error (MAPE):

from sklearn.metrics import make_scorer


def mape(true, pred):
    """Mean absolute percentage error (in percent)."""
    return 100.0 * np.mean(np.abs((pred - true) / true))


mape_scorer = make_scorer(mape)
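
Before trusting the scorer, it helps to sanity-check the function on a toy input (a made-up example, not from the housing data): two predictions that are each off by 10% should give a MAPE of exactly 10.

y_true_toy = np.array([100.0, 200.0])
y_pred_toy = np.array([110.0, 180.0])  # each prediction off by 10%
mape(y_true_toy, y_pred_toy)  # -> 10.0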


pd.DataFrame(cross_validate(
    pipe_regression, X_train, y_train, return_train_score=True, scoring=mape_scorer))
fit_time score_time test_score train_score
0 0.030838 0.265528 22.709732 18.420969
1 0.033222 0.250206 22.754570 18.469125
2 0.030187 0.257115 22.236869 18.674964
3 0.031173 0.262148 23.016666 18.510766
4 0.031193 0.217812 21.033519 16.951021

We can also compute several metrics in one call by passing a dictionary to scoring:

scoring = {
    "r2": "r2",
    "mape_score": mape_scorer,
    "neg_rmse": "neg_root_mean_squared_error",
    "neg_mse": "neg_mean_squared_error",
}

pd.DataFrame(cross_validate(
    pipe_regression, X_train, y_train,
    return_train_score=True, scoring=scoring))
fit_time score_time test_r2 train_r2 ... test_neg_rmse train_neg_rmse test_neg_mse train_neg_mse
0 0.040693 0.268868 0.695818 0.801659 ... -62462.584290 -51440.540539 -3.901574e+09 -2.646129e+09
1 0.031136 0.252659 0.707483 0.799575 ... -63437.715015 -51263.979666 -4.024344e+09 -2.627996e+09
2 0.030507 0.258609 0.713788 0.795944 ... -62613.202523 -51758.817852 -3.920413e+09 -2.678975e+09
3 0.030807 0.264246 0.686938 0.801232 ... -64204.295214 -51343.743586 -4.122192e+09 -2.636180e+09
4 0.031139 0.219227 0.724608 0.832498 ... -59217.838633 -47325.157312 -3.506752e+09 -2.239671e+09

5 rows × 10 columns
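
All the neg_* columns follow the same greater-is-better convention, so errors come out negated. To report an interpretable RMSE, flip the sign (a small sketch, assuming the results above are stored in a variable called cv_results):

cv_results = pd.DataFrame(cross_validate(
    pipe_regression, X_train, y_train,
    return_train_score=True, scoring=scoring))

# Undo the negation: mean cross-validation RMSE in dollars
-cv_results["test_neg_rmse"].mean()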

What about hyperparameter tuning?

pipe_regression = make_pipeline(preprocessor, KNeighborsRegressor())

param_grid = {"kneighborsregressor__n_neighbors": [2, 5, 50, 100]}


from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipe_regression, param_grid, cv=5,
                           return_train_score=True, n_jobs=-1,
                           scoring=mape_scorer)
grid_search.fit(X_train, y_train);


grid_search.best_params_
{'kneighborsregressor__n_neighbors': 100}


grid_search.best_score_
np.float64(24.63336199650092)
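
This result should raise an eyebrow: a MAPE of 24.6% is worse than the roughly 22% we saw with the default 5 neighbors. We can see what happened by inspecting the search results (a sketch using the cv_results_ attribute):

# Mean test MAPE per candidate: the search selected the LARGEST value,
# because our scorer claims greater is better.
pd.DataFrame(grid_search.cv_results_)[
    ["param_kneighborsregressor__n_neighbors", "mean_test_score"]]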

make_scorer assumes greater_is_better=True by default, so GridSearchCV maximized our error metric and crowned the model with the largest MAPE. Since MAPE is an error where lower is better, we negate it:

neg_mape_scorer = make_scorer(mape, greater_is_better=False)


param_grid = {"kneighborsregressor__n_neighbors": [2, 5, 50, 100]}

grid_search = GridSearchCV(pipe_regression, param_grid, cv=5,
                           return_train_score=True, verbose=1,
                           n_jobs=-1, scoring=neg_mape_scorer)
grid_search.fit(X_train, y_train);
Fitting 5 folds for each of 4 candidates, totalling 20 fits


grid_search.best_params_
{'kneighborsregressor__n_neighbors': 5}


grid_search.best_score_
np.float64(-22.350271196169718)
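
Since the scorer now negates MAPE, best_score_ comes back negative; flip the sign to report the error itself:

# Negated MAPE back to a percentage error
-grid_search.best_score_  # about 22.35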

Classification

cc_df = pd.read_csv('data/creditcard.csv.zip', encoding='latin-1')
train_df, test_df = train_test_split(cc_df, test_size=0.3, random_state=111)

X_train, y_train = train_df.drop(columns=["Class"]), train_df["Class"]
X_test, y_test = test_df.drop(columns=["Class"]), test_df["Class"]
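
This is a fraud-detection dataset, and such data is typically extremely imbalanced: almost every transaction is legitimate. That is why plain accuracy would be misleading here, and why we use class_weight="balanced" and f1 below. A quick check of the class balance:

# Fraction of each class in the training set (Class == 1 marks fraud);
# expect the positive class to be a tiny fraction of the data
y_train.value_counts(normalize=True)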

import scipy.stats
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=123, class_weight='balanced')

param_grid = {"max_depth": scipy.stats.randint(low=1, high=100)}


from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(dt_model, param_dist, cv=5, return_train_score=True,
                                   verbose=1, n_jobs=-1, scoring="f1", n_iter=6)
random_search.fit(X_train, y_train);
Fitting 5 folds for each of 6 candidates, totalling 30 fits


random_search.best_params_
{'max_depth': 36}


random_search.best_score_
np.float64(0.7243664069433575)
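
With tuning done, the search object has already been refit on the full training set (refit=True is the default), so we can score the best model on the held-out test set (a sketch; the exact number will depend on the split):

from sklearn.metrics import f1_score

# Predictions come from the refit best estimator
f1_score(y_test, random_search.predict(X_test))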

Let’s apply what we learned!