from sklearn.model_selection import train_test_split

# Load the cities data set; `pd` is assumed to be imported earlier in the
# notebook (the import is not visible in this section).
cities_df = pd.read_csv("data/canada_usa_cities.csv")

# "country" is the prediction target; every other column is a feature.
y = cities_df["country"]
X = cities_df.drop(columns=["country"])

# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
)
from sklearn.neighbors import KNeighborsClassifier

# Baseline: a k-NN classifier that looks only at the single closest
# training example.
model = KNeighborsClassifier(n_neighbors=1)

# The trailing semicolon suppresses the estimator repr in notebook output.
model.fit(X_train, y_train.to_numpy());
# Accuracy on the same data the model was fitted on (training accuracy).
model.score(X_train, y_train)
1.0
from sklearn.model_selection import cross_validate

# Cross-validate the k = 1 model on the training portion only, keeping the
# per-fold training scores so they can be compared with validation scores.
k = 1
knn1 = KNeighborsClassifier(n_neighbors=k)
scores = cross_validate(
    knn1,
    X_train,
    y_train,
    return_train_score=True,
)

# One row per fold: fit/score times plus test and train scores.
pd.DataFrame(scores)
# Same cross-validation experiment with a much larger neighbourhood
# (k = 100) for comparison against the k = 1 model.
k = 100
knn100 = KNeighborsClassifier(n_neighbors=k)
scores = cross_validate(
    knn100,
    X_train,
    y_train,
    return_train_score=True,
)

# One row per fold: fit/score times plus test and train scores.
pd.DataFrame(scores)
n_neighbors
# Sweep n_neighbors over 1, 6, 11, ..., 46 and record, for each value, the
# mean training score and mean cross-validation score across the folds.
results_dict = {
    "n_neighbors": [],
    "mean_train_score": [],
    "mean_cv_score": [],
}

for k in range(1, 50, 5):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_validate(knn, X_train, y_train, return_train_score=True)

    results_dict["n_neighbors"].append(k)
    # cross_validate reports validation-fold scores under "test_score".
    results_dict["mean_cv_score"].append(scores["test_score"].mean())
    results_dict["mean_train_score"].append(scores["train_score"].mean())

# One row per candidate k, columns in the order the dict keys were declared.
results_df = pd.DataFrame(results_dict)
results_df
10 rows × 3 columns
# Rank the candidate k values from best to worst mean cross-validation score.
sorted_results_df = results_df.sort_values(
    by="mean_cv_score",
    ascending=False,
)
sorted_results_df
# The first row of the sorted table has the highest mean CV score; take its
# n_neighbors value as the chosen hyperparameter.
best_k = sorted_results_df["n_neighbors"].iloc[0]
best_k
np.int64(11)
# Refit on the full training portion with the selected k, then evaluate on
# the held-out test set, which was not used while choosing k.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train);

test_accuracy = knn.score(X_test, y_test)
print("Test accuracy:", round(test_accuracy, 3))
Test accuracy: 0.905
KNeighborsClassifier
Attribution