Multi-class Classification

# Load the wine dataset (3 classes, 13 numeric features) and split it
# into train/test sets for multi-class classification.
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Bunch exposing 'data', 'target', and 'feature_names'.
data = datasets.load_wine()
X = pd.DataFrame(data['data'], columns=data["feature_names"])
y = data['target']
# Fixed random_state makes the split (and all results below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021)

# Preview the first five rows of the training features.
X_train.head()
alcohol malic_acid ash alcalinity_of_ash ... color_intensity hue od280/od315_of_diluted_wines proline
36 13.28 1.64 2.84 15.5 ... 4.60 1.09 2.78 880.0
77 11.84 2.89 2.23 18.0 ... 2.65 0.96 2.52 500.0
131 12.88 2.99 2.40 20.0 ... 5.40 0.74 1.42 530.0
159 13.48 1.67 2.64 22.5 ... 11.75 0.57 1.78 620.0
4 13.24 2.59 2.87 21.0 ... 4.32 1.04 2.93 735.0

5 rows × 13 columns


# First five training labels — integer class ids 0, 1, 2.
y_train[:5]
array([0, 1, 2, 2, 0])
from sklearn.linear_model import LogisticRegression

# max_iter raised from the default (100) so the lbfgs solver converges
# on this unscaled data; the trailing semicolon suppresses the notebook repr.
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train);
# Predicted class ids for the first five test rows.
lr.predict(X_test[:5])
array([0, 1, 0, 2, 1])


# Learned coefficients: one row per class (one-vs-rest view), one column per feature.
lr.coef_
array([[ 0.52312902,  0.43154281,  0.38043735, -0.05113148, -0.02634286,  0.54553557,  0.87292828, -0.00570999, -0.08782667,  0.21548057,  0.03508494,  0.61503178,  0.01096161],
       [-0.70841151, -0.80382857, -0.45573989, -0.09464126, -0.03558673, -0.06221383,  0.26973587,  0.13538498,  0.78750331, -0.93636273,  0.24858085, -0.02530429, -0.00913317],
       [ 0.18528249,  0.37228576,  0.07530254,  0.14577274,  0.06192959, -0.48332174, -1.14266415, -0.129675  , -0.69967665,  0.72088216, -0.28366579, -0.58972749, -0.00182844]])


# (n_classes, n_features) = (3, 13).
lr.coef_.shape
(3, 13)
# Transpose so features are the index and each column is a class;
# makes per-feature comparison across classes easy to read.
lr_coefs = pd.DataFrame(data=lr.coef_.T, index=X_train.columns, columns=lr.classes_)
lr_coefs
0 1 2
alcohol 0.523129 -0.708412 0.185282
malic_acid 0.431543 -0.803829 0.372286
ash 0.380437 -0.455740 0.075303
... ... ... ...
hue 0.035085 0.248581 -0.283666
od280/od315_of_diluted_wines 0.615032 -0.025304 -0.589727
proline 0.010962 -0.009133 -0.001828

13 rows × 3 columns

# Per-class probabilities for the first five test rows (columns follow lr.classes_).
lr.predict_proba(X_test)[:5]
array([[9.95258798e-01, 4.01471952e-03, 7.26482370e-04],
       [1.66633971e-04, 9.98348580e-01, 1.48478572e-03],
       [9.99720581e-01, 6.44816976e-05, 2.14937661e-04],
       [2.63093392e-05, 9.10246419e-06, 9.99964588e-01],
       [6.20027345e-06, 9.99218629e-01, 7.75171017e-04]])


# Sanity check: probabilities across classes sum to 1 for every row.
lr.predict_proba(X_test[:5]).sum(axis=1)
array([1., 1., 1., 1., 1.])
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_test, lr.predict(X_test))
array([[19,  0,  0],
       [ 1, 16,  0],
       [ 0,  1,  8]])
# ConfusionMatrixDisplay was never imported above — without this import the
# call raises NameError.
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix straight from the fitted estimator;
# 'd' formats counts as integers, semicolon suppresses the notebook repr.
ConfusionMatrixDisplay.from_estimator(lr, X_test, y_test, display_labels=lr.classes_, cmap='Blues', values_format='d');
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 plus accuracy and macro/weighted averages.
print(classification_report(y_test, lr.predict(X_test)))
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.94      0.94      0.94        17
           2       1.00      0.89      0.94         9

    accuracy                           0.96        45
   macro avg       0.96      0.94      0.95        45
weighted avg       0.96      0.96      0.96        45
# Keep only two features so the decision regions can be plotted in 2-D.
x_train_2d = X_train[['alcohol', 'malic_acid']]
x_train_2d.head(3)
alcohol malic_acid
36 13.28 1.64
77 11.84 2.89
131 12.88 2.99

Let’s apply what we learned!