import pandas as pd
from sklearn.model_selection import train_test_split

housing_df = pd.read_csv("data/housing.csv")
train_df, test_df = train_test_split(housing_df, test_size=0.1, random_state=123)
X_train = train_df.drop(columns=["median_house_value"])
y_train = train_df["median_house_value"]
X_test = test_df.drop(columns=["median_house_value"])
y_test = test_df["median_house_value"]
numeric_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "households",
    "median_income",
    "rooms_per_household",
    "bedrooms_per_household",
    "population_per_household",
]
categorical_features = ["ocean_proximity"]
X_train.head(3)

|   | longitude | latitude | housing_median_age | households | ... | ocean_proximity | rooms_per_household | bedrooms_per_household | population_per_household |
|---|---|---|---|---|---|---|---|---|---|
| 6051 | -117.75 | 34.04 | 22.0 | 602.0 | ... | INLAND | 4.897010 | 1.056478 | 4.318937 |
| 20113 | -119.57 | 37.94 | 17.0 | 20.0 | ... | INLAND | 17.300000 | 6.500000 | 2.550000 |
| 14289 | -117.13 | 32.74 | 46.0 | 708.0 | ... | NEAR OCEAN | 4.738701 | 1.084746 | 2.057910 |
3 rows × 9 columns
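Before building the preprocessing pipeline, a quick look at missing values helps motivate the imputation step below. A minimal sketch (the exact counts depend on the split):

```python
# Count missing entries per column. In this dataset the bedroom-related
# columns typically contain a few NaNs, which is what SimpleImputer handles.
X_train.isnull().sum()
```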
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
           ("onehot", OneHotEncoder(handle_unknown="ignore"))]
)
preprocessor = make_column_transformer(
(numeric_transformer, numeric_features),
(categorical_transformer, categorical_features),
remainder='passthrough')
pipe_regression = make_pipeline(preprocessor, KNeighborsRegressor())
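Before cross-validating, it can help to sanity-check what the column transformer actually produces. A small sketch (assuming a recent scikit-learn where `get_feature_names_out` is supported for pipelines inside a `ColumnTransformer`):

```python
# Fit just the preprocessor to inspect the transformed design matrix.
transformed = preprocessor.fit_transform(X_train)
print(transformed.shape)  # one column per numeric feature plus one per one-hot category

# With recent scikit-learn versions, the generated column names can be listed too.
print(preprocessor.get_feature_names_out())
```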
from sklearn.model_selection import cross_validate

pd.DataFrame(cross_validate(pipe_regression, X_train, y_train,
                            return_train_score=True,
                            scoring="neg_root_mean_squared_error"))

|   | fit_time | score_time | test_score | train_score |
|---|---|---|---|---|
| 0 | 0.035630 | 0.270113 | -62462.584290 | -51440.540539 |
| 1 | 0.032212 | 0.254512 | -63437.715015 | -51263.979666 |
| 2 | 0.031295 | 0.257896 | -62613.202523 | -51758.817852 |
| 3 | 0.031067 | 0.262459 | -64204.295214 | -51343.743586 |
| 4 | 0.030682 | 0.216734 | -59217.838633 | -47325.157312 |
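The `mape_scorer` used next (and the `neg_mape_scorer` used later in the hyperparameter search) are not shown in this excerpt. A plausible definition with `sklearn.metrics.make_scorer` looks like the following sketch; the exact MAPE implementation in the original notes may differ:

```python
import numpy as np
from sklearn.metrics import make_scorer

def mape(true, pred):
    """Mean absolute percentage error, in percent (assumed definition)."""
    return 100.0 * np.mean(np.abs((pred - true) / true))

# Raw MAPE: handy for eyeballing errors, but note that larger values are worse.
mape_scorer = make_scorer(mape)

# Negated version: GridSearchCV/RandomizedSearchCV maximize the score,
# so greater_is_better=False flips the sign to make lower MAPE win.
neg_mape_scorer = make_scorer(mape, greater_is_better=False)
```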
pd.DataFrame(cross_validate(
    pipe_regression, X_train, y_train, return_train_score=True, scoring=mape_scorer))

|   | fit_time | score_time | test_score | train_score |
|---|---|---|---|---|
| 0 | 0.030838 | 0.265528 | 22.709732 | 18.420969 |
| 1 | 0.033222 | 0.250206 | 22.754570 | 18.469125 |
| 2 | 0.030187 | 0.257115 | 22.236869 | 18.674964 |
| 3 | 0.031173 | 0.262148 | 23.016666 | 18.510766 |
| 4 | 0.031193 | 0.217812 | 21.033519 | 16.951021 |
scoring = {
    "r2": "r2",
    "mape_score": mape_scorer,
    "neg_rmse": "neg_root_mean_squared_error",
    "neg_mse": "neg_mean_squared_error",
}
pd.DataFrame(cross_validate(pipe_regression, X_train, y_train,
                            return_train_score=True, scoring=scoring))

|   | fit_time | score_time | test_r2 | train_r2 | ... | test_neg_rmse | train_neg_rmse | test_neg_mse | train_neg_mse |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.040693 | 0.268868 | 0.695818 | 0.801659 | ... | -62462.584290 | -51440.540539 | -3.901574e+09 | -2.646129e+09 |
| 1 | 0.031136 | 0.252659 | 0.707483 | 0.799575 | ... | -63437.715015 | -51263.979666 | -4.024344e+09 | -2.627996e+09 |
| 2 | 0.030507 | 0.258609 | 0.713788 | 0.795944 | ... | -62613.202523 | -51758.817852 | -3.920413e+09 | -2.678975e+09 |
| 3 | 0.030807 | 0.264246 | 0.686938 | 0.801232 | ... | -64204.295214 | -51343.743586 | -4.122192e+09 | -2.636180e+09 |
| 4 | 0.031139 | 0.219227 | 0.724608 | 0.832498 | ... | -59217.838633 | -47325.157312 | -3.506752e+09 | -2.239671e+09 |
5 rows × 10 columns
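With multiple metrics it is often easier to look at per-metric averages across folds than at the raw table. A small sketch (the variable name `cv_results` is introduced here just for illustration):

```python
cv_results = pd.DataFrame(
    cross_validate(pipe_regression, X_train, y_train,
                   return_train_score=True, scoring=scoring)
)

# Mean and standard deviation of each metric across the 5 folds.
cv_results.agg(["mean", "std"]).T
```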
param_grid = {"kneighborsregressor__n_neighbors": [2, 5, 50, 100]}
grid_search = GridSearchCV(pipe_regression, param_grid, cv=5,
return_train_score=True, verbose=1,
n_jobs=-1, scoring= neg_mape_scorer)
grid_search.fit(X_train, y_train);Fitting 5 folds for each of 4 candidates, totalling 20 fits
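After the fit, the usual `GridSearchCV` attributes can be used to inspect the search; with a negated scorer, the best score is the least negative MAPE. A sketch:

```python
grid_search.best_params_   # winning n_neighbors value
grid_search.best_score_    # best mean cross-validation score (negated MAPE)

# Full per-candidate results, sorted by mean test score.
pd.DataFrame(grid_search.cv_results_)[
    ["param_kneighborsregressor__n_neighbors", "mean_test_score", "mean_train_score"]
].sort_values("mean_test_score", ascending=False)
```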
from sklearn.model_selection import RandomizedSearchCV

grid_search = RandomizedSearchCV(dt_model, param_grid, cv=5, return_train_score=True,
                                 verbose=1, n_jobs=-1, scoring="f1", n_iter=6)
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 6 candidates, totalling 30 fits
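Unlike `GridSearchCV`, `RandomizedSearchCV` can also draw hyperparameter values from distributions rather than a fixed list, with `n_iter` capping how many candidates are tried. A minimal sketch; the classifier and search space below are illustrative stand-ins, not taken from the example above:

```python
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative estimator; stands in for whatever dt_model refers to above.
clf = DecisionTreeClassifier(random_state=123)

# Sample integer hyperparameter values instead of enumerating them explicitly.
param_dist = {"max_depth": randint(2, 20),
              "min_samples_split": randint(2, 50)}

random_search = RandomizedSearchCV(clf, param_dist, n_iter=6, cv=5,
                                   scoring="f1", n_jobs=-1, random_state=123)
# random_search.fit(X, y) would then evaluate 6 sampled candidates.
```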