{ "cells": [ { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "# Lectures 5: Class demo" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports, Announcements, LOs" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "### Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# import the libraries\n", "import os\n", "import sys\n", "sys.path.append(os.path.join(os.path.abspath(\"../\"), \"code\"))\n", "from plotting_functions import *\n", "from utils import *\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from sklearn.compose import make_column_transformer\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", "\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.svm import SVC\n", "\n", "%matplotlib inline\n", "\n", "pd.set_option(\"display.max_colwidth\", 200)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Incorporating text features in the Spotify dataset" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Recall that we had dropped `song_title` feature when we worked with the Spotify dataset in Lab 1. \n", "\n", "Let's try to include it in our pipeline and examine whether we get better results. 
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "spotify_df = pd.read_csv(\"../data/spotify.csv\", index_col=0)\n", "X_spotify = spotify_df.drop(columns=[\"target\"])\n", "y_spotify = spotify_df[\"target\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "slideshow": { "slide_type": "slide" }, "tags": [] }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(\n", " X_spotify, y_spotify, test_size=0.2, random_state=123\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(1613, 15)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acousticnessdanceabilityduration_msenergyinstrumentalnesskeylivenessloudnessmodespeechinesstempotime_signaturevalencesong_titleartist
15050.0047700.5852147400.6140.000155100.0762-5.59400.0370114.0594.00.2730Cool for the SummerDemi Lovato
8130.1140000.6652167280.5130.30300000.1220-7.31410.3310100.3443.00.0373Damn Son Where'd You Find This? (feat. Kelly Holiday) - Markus Maximus RemixMarkus Maximus
6150.0302000.7982165850.4810.00000070.1280-10.48810.3140127.1364.00.6400Trill HoeWestern Tink
3190.1060000.9121940400.3170.00020860.0723-12.71900.037899.3464.00.9490Who Is He (And What Is He to You?)Bill Withers
3200.0211000.6972364560.9050.89300060.1190-7.78700.0339119.9774.00.3110AcamarFrankey
................................................
20120.0010600.5842744040.9320.00269010.1290-3.50110.333074.9764.00.2110Like A Bitch - Kill The Noise RemixKill The Noise
13460.0000210.5352035000.9740.000149100.2630-3.56600.1720116.9564.00.4310Flag of the BeastEmmure
14060.5030000.4102563330.6480.00000070.2190-4.46910.036260.3914.00.3420Don't You Cry For MeCobi
13890.7050000.8942223070.1610.00330040.3120-14.31110.0880104.9684.00.8180장가갈 수 있을까 Can I Get Married?Coffeeboy
15340.6230000.4703949200.1560.18700020.1040-17.03610.0399118.1764.00.0591Blue BalladPhil Woods
\n", "

1613 rows × 15 columns

\n", "
" ], "text/plain": [ " acousticness danceability duration_ms energy instrumentalness key \\\n", "1505 0.004770 0.585 214740 0.614 0.000155 10 \n", "813 0.114000 0.665 216728 0.513 0.303000 0 \n", "615 0.030200 0.798 216585 0.481 0.000000 7 \n", "319 0.106000 0.912 194040 0.317 0.000208 6 \n", "320 0.021100 0.697 236456 0.905 0.893000 6 \n", "... ... ... ... ... ... ... \n", "2012 0.001060 0.584 274404 0.932 0.002690 1 \n", "1346 0.000021 0.535 203500 0.974 0.000149 10 \n", "1406 0.503000 0.410 256333 0.648 0.000000 7 \n", "1389 0.705000 0.894 222307 0.161 0.003300 4 \n", "1534 0.623000 0.470 394920 0.156 0.187000 2 \n", "\n", " liveness loudness mode speechiness tempo time_signature valence \\\n", "1505 0.0762 -5.594 0 0.0370 114.059 4.0 0.2730 \n", "813 0.1220 -7.314 1 0.3310 100.344 3.0 0.0373 \n", "615 0.1280 -10.488 1 0.3140 127.136 4.0 0.6400 \n", "319 0.0723 -12.719 0 0.0378 99.346 4.0 0.9490 \n", "320 0.1190 -7.787 0 0.0339 119.977 4.0 0.3110 \n", "... ... ... ... ... ... ... ... \n", "2012 0.1290 -3.501 1 0.3330 74.976 4.0 0.2110 \n", "1346 0.2630 -3.566 0 0.1720 116.956 4.0 0.4310 \n", "1406 0.2190 -4.469 1 0.0362 60.391 4.0 0.3420 \n", "1389 0.3120 -14.311 1 0.0880 104.968 4.0 0.8180 \n", "1534 0.1040 -17.036 1 0.0399 118.176 4.0 0.0591 \n", "\n", " song_title \\\n", "1505 Cool for the Summer \n", "813 Damn Son Where'd You Find This? (feat. Kelly Holiday) - Markus Maximus Remix \n", "615 Trill Hoe \n", "319 Who Is He (And What Is He to You?) \n", "320 Acamar \n", "... ... \n", "2012 Like A Bitch - Kill The Noise Remix \n", "1346 Flag of the Beast \n", "1406 Don't You Cry For Me \n", "1389 장가갈 수 있을까 Can I Get Married? \n", "1534 Blue Ballad \n", "\n", " artist \n", "1505 Demi Lovato \n", "813 Markus Maximus \n", "615 Western Tink \n", "319 Bill Withers \n", "320 Frankey \n", "... ... 
\n", "2012 Kill The Noise \n", "1346 Emmure \n", "1406 Cobi \n", "1389 Coffeeboy \n", "1534 Phil Woods \n", "\n", "[1613 rows x 15 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "slideshow": { "slide_type": "slide" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['acousticness', 'danceability', 'duration_ms', 'energy',\n", " 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\n", " 'speechiness', 'tempo', 'time_signature', 'valence', 'song_title',\n", " 'artist'],\n", " dtype='object')" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dummy model " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dummy
fit_time0.000 (+/- 0.000)
score_time0.000 (+/- 0.000)
test_score0.508 (+/- 0.001)
train_score0.508 (+/- 0.000)
\n", "
" ], "text/plain": [ " dummy\n", "fit_time 0.000 (+/- 0.000)\n", "score_time 0.000 (+/- 0.000)\n", "test_score 0.508 (+/- 0.001)\n", "train_score 0.508 (+/- 0.000)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", "results = {}\n", "dummy_model = DummyClassifier()\n", "# mean_std_cross_val_scores is defined in ../code/utils.py\n", "results['dummy'] = mean_std_cross_val_scores(dummy_model, X_train, y_train, return_train_score = True) \n", "pd.DataFrame(results)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Feature categorization" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['acousticness', 'danceability', 'duration_ms', 'energy',\n", " 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',\n", " 'speechiness', 'tempo', 'time_signature', 'valence', 'song_title',\n", " 'artist'],\n", " dtype='object')" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.columns" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "key\n", "1 200\n", "7 169\n", "0 166\n", "9 152\n", "2 145\n", "11 143\n", "5 141\n", "6 127\n", "10 122\n", "8 110\n", "4 88\n", "3 50\n", "Name: count, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"key\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "time_signature\n", "4.0 1514\n", "3.0 76\n", "5.0 22\n", "1.0 1\n", "Name: count, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"time_signature\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "mode\n", "1 1002\n", "0 611\n", 
"Name: count, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"mode\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the distribution of values in the `song_title` column. " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "song_title\n", "Pyramids 2\n", "Look At Wrist 2\n", "Baby 2\n", "The One 2\n", "Best Friend 2\n", " ..\n", "City Of Dreams - Radio Edit 1\n", "Face It 1\n", "The Winner Is - from Little Miss Sunshine 1\n", "History 1\n", "Blue Ballad 1\n", "Name: count, Length: 1579, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"song_title\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "- Most of the song titles are unique, which makes sense. \n", "- What would happen if we apply one-hot encoding to this feature? \n", "- How about encoding this as a text feature? 
" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "artist\n", "Drake 14\n", "Disclosure 12\n", "Rick Ross 11\n", "WALK THE MOON 10\n", "Crystal Castles 8\n", " ..\n", "Classixx 1\n", "Jordan Feliz 1\n", "Travis Hayes 1\n", "The Silvertones 1\n", "Phil Woods 1\n", "Name: count, Length: 1131, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"artist\"].value_counts()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "numeric_feats = ['acousticness', 'danceability', 'energy',\n", " 'instrumentalness', 'liveness', 'loudness',\n", " 'speechiness', 'tempo', 'valence']\n", "categorical_feats = ['time_signature', 'key']\n", "passthrough_feats = ['mode']\n", "artist_cat_feat = ['artist']\n", "text_feat = 'song_title' # Define the text feature" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "skip" } }, "source": [ "```{important}\n", "Note that, unlike the other feature types, we are defining `text_feat` as a string and not as a list: `CountVectorizer` expects a 1-D collection of documents, and passing a single column name as a string makes the column transformer hand it a 1-D column instead of a 2-D array. 
\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Column transformer without `song_title` and `artist` features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "preprocessor_no_text = make_column_transformer(\n", " (StandardScaler(), numeric_feats), \n", " (\"passthrough\", passthrough_feats), \n", " (OneHotEncoder(handle_unknown = \"ignore\"), categorical_feats), \n", ")" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Visualizing the transformed data " ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1613, 26)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed_no_text = preprocessor_no_text.fit_transform(X_train)\n", "transformed_no_text.shape" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ColumnTransformer(transformers=[('standardscaler', StandardScaler(),\n",
       "                                 ['acousticness', 'danceability', 'energy',\n",
       "                                  'instrumentalness', 'liveness', 'loudness',\n",
       "                                  'speechiness', 'tempo', 'valence']),\n",
       "                                ('passthrough', 'passthrough', ['mode']),\n",
       "                                ('onehotencoder',\n",
       "                                 OneHotEncoder(handle_unknown='ignore'),\n",
       "                                 ['time_signature', 'key'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "ColumnTransformer(transformers=[('standardscaler', StandardScaler(),\n", " ['acousticness', 'danceability', 'energy',\n", " 'instrumentalness', 'liveness', 'loudness',\n", " 'speechiness', 'tempo', 'valence']),\n", " ('passthrough', 'passthrough', ['mode']),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'),\n", " ['time_signature', 'key'])])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor_no_text" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['time_signature_1.0',\n", " 'time_signature_3.0',\n", " 'time_signature_4.0',\n", " 'time_signature_5.0',\n", " 'key_0',\n", " 'key_1',\n", " 'key_2',\n", " 'key_3',\n", " 'key_4',\n", " 'key_5',\n", " 'key_6',\n", " 'key_7',\n", " 'key_8',\n", " 'key_9',\n", " 'key_10',\n", " 'key_11']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ohe_feat_names = preprocessor_no_text.named_transformers_[\"onehotencoder\"].get_feature_names_out().tolist()\n", "ohe_feat_names" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "feat_names = numeric_feats + passthrough_feats + ohe_feat_names" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acousticnessdanceabilityenergyinstrumentalnesslivenessloudnessspeechinesstempovalencemode...key_2key_3key_4key_5key_6key_7key_8key_9key_10key_11
0-0.697633-0.194548-0.318116-0.492359-0.7378980.395794-0.617752-0.293827-0.9081490.0...0.00.00.00.00.00.00.00.01.00.0
1-0.2762910.295726-0.7955520.598355-0.438792-0.0523942.728394-0.802595-1.8612381.0...0.00.00.00.00.00.00.00.00.00.0
2-0.5995401.110806-0.946819-0.492917-0.399607-0.8794572.5349090.1912740.5758701.0...0.00.00.00.00.01.00.00.00.00.0
3-0.3071501.809445-1.722063-0.492168-0.763368-1.460798-0.608647-0.8396161.8253580.0...0.00.00.00.01.00.00.00.00.00.0
4-0.6346420.4918351.0574682.723273-0.458384-0.175645-0.653035-0.074294-0.7544910.0...0.00.00.00.01.00.00.00.00.00.0
..................................................................
1608-0.711944-0.2006761.185100-0.483229-0.3930770.9411762.751157-1.743639-1.1588561.0...0.00.00.00.00.00.00.00.00.00.0
1609-0.715953-0.5009691.383637-0.4923800.4820380.9242390.918743-0.186361-0.2692530.0...0.00.00.00.00.00.00.00.01.00.0
16101.224228-1.267021-0.157395-0.4929170.1946870.688940-0.626857-2.284681-0.6291381.0...0.00.00.00.00.01.00.00.00.00.0
16112.0034191.699134-2.459489-0.4810320.802042-1.875632-0.037298-0.6310641.2956401.0...0.00.01.00.00.00.00.00.00.00.0
16121.687114-0.899316-2.4831250.180574-0.556344-2.585697-0.584746-0.141104-1.7730861.0...1.00.00.00.00.00.00.00.00.00.0
\n", "

1613 rows × 26 columns

\n", "
" ], "text/plain": [ " acousticness danceability energy instrumentalness liveness \\\n", "0 -0.697633 -0.194548 -0.318116 -0.492359 -0.737898 \n", "1 -0.276291 0.295726 -0.795552 0.598355 -0.438792 \n", "2 -0.599540 1.110806 -0.946819 -0.492917 -0.399607 \n", "3 -0.307150 1.809445 -1.722063 -0.492168 -0.763368 \n", "4 -0.634642 0.491835 1.057468 2.723273 -0.458384 \n", "... ... ... ... ... ... \n", "1608 -0.711944 -0.200676 1.185100 -0.483229 -0.393077 \n", "1609 -0.715953 -0.500969 1.383637 -0.492380 0.482038 \n", "1610 1.224228 -1.267021 -0.157395 -0.492917 0.194687 \n", "1611 2.003419 1.699134 -2.459489 -0.481032 0.802042 \n", "1612 1.687114 -0.899316 -2.483125 0.180574 -0.556344 \n", "\n", " loudness speechiness tempo valence mode ... key_2 key_3 \\\n", "0 0.395794 -0.617752 -0.293827 -0.908149 0.0 ... 0.0 0.0 \n", "1 -0.052394 2.728394 -0.802595 -1.861238 1.0 ... 0.0 0.0 \n", "2 -0.879457 2.534909 0.191274 0.575870 1.0 ... 0.0 0.0 \n", "3 -1.460798 -0.608647 -0.839616 1.825358 0.0 ... 0.0 0.0 \n", "4 -0.175645 -0.653035 -0.074294 -0.754491 0.0 ... 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... \n", "1608 0.941176 2.751157 -1.743639 -1.158856 1.0 ... 0.0 0.0 \n", "1609 0.924239 0.918743 -0.186361 -0.269253 0.0 ... 0.0 0.0 \n", "1610 0.688940 -0.626857 -2.284681 -0.629138 1.0 ... 0.0 0.0 \n", "1611 -1.875632 -0.037298 -0.631064 1.295640 1.0 ... 0.0 0.0 \n", "1612 -2.585697 -0.584746 -0.141104 -1.773086 1.0 ... 1.0 0.0 \n", "\n", " key_4 key_5 key_6 key_7 key_8 key_9 key_10 key_11 \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... 
\n", "1608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1609 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 \n", "1610 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 \n", "1611 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1612 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[1613 rows x 26 columns]" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(transformed_no_text, columns=feat_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Building models" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.000 (+/- 0.000)0.000 (+/- 0.000)0.508 (+/- 0.001)0.508 (+/- 0.000)
Decision Tree (no_text)0.016 (+/- 0.000)0.003 (+/- 0.000)0.688 (+/- 0.023)1.000 (+/- 0.000)
KNN (no_text)0.005 (+/- 0.001)0.015 (+/- 0.020)0.676 (+/- 0.028)0.788 (+/- 0.009)
SVM (no_text)0.054 (+/- 0.004)0.021 (+/- 0.001)0.737 (+/- 0.017)0.806 (+/- 0.011)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.000 (+/- 0.000) 0.000 (+/- 0.000) \n", "Decision Tree (no_text) 0.016 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN (no_text) 0.005 (+/- 0.001) 0.015 (+/- 0.020) \n", "SVM (no_text) 0.054 (+/- 0.004) 0.021 (+/- 0.001) \n", "\n", " test_score train_score \n", "dummy 0.508 (+/- 0.001) 0.508 (+/- 0.000) \n", "Decision Tree (no_text) 0.688 (+/- 0.023) 1.000 (+/- 0.000) \n", "KNN (no_text) 0.676 (+/- 0.028) 0.788 (+/- 0.009) \n", "SVM (no_text) 0.737 (+/- 0.017) 0.806 (+/- 0.011) " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "models = {\n", " \"Decision Tree\": DecisionTreeClassifier(),\n", " \"KNN\": KNeighborsClassifier(),\n", " \"SVM\": SVC() \n", "}\n", "\n", "for (name, model) in models.items():\n", " pipe_model = make_pipeline(preprocessor_no_text, model)\n", " results[name + \" (no_text)\"] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Incorporating \"song_title\" feature" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's incorporate bag-of-words representation of \"song_title\" feature in our column transformer. " ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['acousticness',\n", " 'danceability',\n", " 'energy',\n", " 'instrumentalness',\n", " 'liveness',\n", " 'loudness',\n", " 'speechiness',\n", " 'tempo',\n", " 'valence']" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "numeric_feats" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'song_title'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_feat" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "preprocessor = make_column_transformer(\n", " (StandardScaler(), numeric_feats), \n", " (\"passthrough\", passthrough_feats), \n", " (OneHotEncoder(handle_unknown = \"ignore\"), categorical_feats), \n", " (CountVectorizer(stop_words=\"english\"), text_feat)\n", ")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# Transform the data\n", "transformed = preprocessor.fit_transform(X_train)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ColumnTransformer(transformers=[('standardscaler', StandardScaler(),\n",
       "                                 ['acousticness', 'danceability', 'energy',\n",
       "                                  'instrumentalness', 'liveness', 'loudness',\n",
       "                                  'speechiness', 'tempo', 'valence']),\n",
       "                                ('passthrough', 'passthrough', ['mode']),\n",
       "                                ('onehotencoder',\n",
       "                                 OneHotEncoder(handle_unknown='ignore'),\n",
       "                                 ['time_signature', 'key']),\n",
       "                                ('countvectorizer',\n",
       "                                 CountVectorizer(stop_words='english'),\n",
       "                                 'song_title')])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "ColumnTransformer(transformers=[('standardscaler', StandardScaler(),\n", " ['acousticness', 'danceability', 'energy',\n", " 'instrumentalness', 'liveness', 'loudness',\n", " 'speechiness', 'tempo', 'valence']),\n", " ('passthrough', 'passthrough', ['mode']),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore'),\n", " ['time_signature', 'key']),\n", " ('countvectorizer',\n", " CountVectorizer(stop_words='english'),\n", " 'song_title')])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Get the vocabulary learned by the CountVectorizer fitted inside `preprocessor`\n", "vocab = preprocessor.named_transformers_['countvectorizer'].get_feature_names_out()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/plain": [ "1910" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take the one-hot feature names from the encoder fitted inside `preprocessor` itself,\n", "# not from the separately fitted `preprocessor_no_text`, so the labels always match `transformed`.\n", "ohe_names = preprocessor.named_transformers_['onehotencoder'].get_feature_names_out().tolist()\n", "# Order must follow the transformer order in `preprocessor`: numeric, passthrough, one-hot, text.\n", "column_names = numeric_feats + passthrough_feats + ohe_names + vocab.tolist()\n", "assert len(column_names) == transformed.shape[1], \"column names must match the transformed matrix width\"\n", "len(column_names)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
acousticnessdanceabilityenergyinstrumentalnesslivenessloudnessspeechinesstempovalencemode...너와의루시아변명여기이곳에서이대로있어줘요있을까장가갈지금
0-0.697633-0.194548-0.318116-0.492359-0.7378980.395794-0.617752-0.293827-0.9081490.0...0.00.00.00.00.00.00.00.00.00.0
1-0.2762910.295726-0.7955520.598355-0.438792-0.0523942.728394-0.802595-1.8612381.0...0.00.00.00.00.00.00.00.00.00.0
2-0.5995401.110806-0.946819-0.492917-0.399607-0.8794572.5349090.1912740.5758701.0...0.00.00.00.00.00.00.00.00.00.0
3-0.3071501.809445-1.722063-0.492168-0.763368-1.460798-0.608647-0.8396161.8253580.0...0.00.00.00.00.00.00.00.00.00.0
4-0.6346420.4918351.0574682.723273-0.458384-0.175645-0.653035-0.074294-0.7544910.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
1608-0.711944-0.2006761.185100-0.483229-0.3930770.9411762.751157-1.743639-1.1588561.0...0.00.00.00.00.00.00.00.00.00.0
1609-0.715953-0.5009691.383637-0.4923800.4820380.9242390.918743-0.186361-0.2692530.0...0.00.00.00.00.00.00.00.00.00.0
16101.224228-1.267021-0.157395-0.4929170.1946870.688940-0.626857-2.284681-0.6291381.0...0.00.00.00.00.00.00.00.00.00.0
16112.0034191.699134-2.459489-0.4810320.802042-1.875632-0.037298-0.6310641.2956401.0...0.00.00.00.00.00.00.01.01.00.0
16121.687114-0.899316-2.4831250.180574-0.556344-2.585697-0.584746-0.141104-1.7730861.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

1613 rows × 1910 columns

\n", "
" ], "text/plain": [ " acousticness danceability energy instrumentalness liveness \\\n", "0 -0.697633 -0.194548 -0.318116 -0.492359 -0.737898 \n", "1 -0.276291 0.295726 -0.795552 0.598355 -0.438792 \n", "2 -0.599540 1.110806 -0.946819 -0.492917 -0.399607 \n", "3 -0.307150 1.809445 -1.722063 -0.492168 -0.763368 \n", "4 -0.634642 0.491835 1.057468 2.723273 -0.458384 \n", "... ... ... ... ... ... \n", "1608 -0.711944 -0.200676 1.185100 -0.483229 -0.393077 \n", "1609 -0.715953 -0.500969 1.383637 -0.492380 0.482038 \n", "1610 1.224228 -1.267021 -0.157395 -0.492917 0.194687 \n", "1611 2.003419 1.699134 -2.459489 -0.481032 0.802042 \n", "1612 1.687114 -0.899316 -2.483125 0.180574 -0.556344 \n", "\n", " loudness speechiness tempo valence mode ... 너와의 루시아 변명 \\\n", "0 0.395794 -0.617752 -0.293827 -0.908149 0.0 ... 0.0 0.0 0.0 \n", "1 -0.052394 2.728394 -0.802595 -1.861238 1.0 ... 0.0 0.0 0.0 \n", "2 -0.879457 2.534909 0.191274 0.575870 1.0 ... 0.0 0.0 0.0 \n", "3 -1.460798 -0.608647 -0.839616 1.825358 0.0 ... 0.0 0.0 0.0 \n", "4 -0.175645 -0.653035 -0.074294 -0.754491 0.0 ... 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... \n", "1608 0.941176 2.751157 -1.743639 -1.158856 1.0 ... 0.0 0.0 0.0 \n", "1609 0.924239 0.918743 -0.186361 -0.269253 0.0 ... 0.0 0.0 0.0 \n", "1610 0.688940 -0.626857 -2.284681 -0.629138 1.0 ... 0.0 0.0 0.0 \n", "1611 -1.875632 -0.037298 -0.631064 1.295640 1.0 ... 0.0 0.0 0.0 \n", "1612 -2.585697 -0.584746 -0.141104 -1.773086 1.0 ... 0.0 0.0 0.0 \n", "\n", " 여기 이곳에서 이대로 있어줘요 있을까 장가갈 지금 \n", "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... 
\n", "1608 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1609 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1610 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1611 0.0 0.0 0.0 0.0 1.0 1.0 0.0 \n", "1612 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "[1613 rows x 1910 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(transformed.toarray(), columns=column_names)\n", "df" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Visualizing the vocabulary " ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['000', '10', '100', '10cm', '11', '112', '12', '1208', '144', '18'],\n", " dtype=object)" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[0:10]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['duele', 'duet', 'duke', 'dustland', 'dutchie', 'dynamite',\n", " 'earth', 'easy', 'eazy', 'echelon'], dtype=object)" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[500:510]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['wide', 'wifey', 'wild', 'wildcard', 'wildfire', 'wiley',\n", " 'willing', 'win', 'wind', 'window'], dtype=object)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[1800:1810]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['000', 'ap', 'blind', 'cha', 'dallask', 'duele', 'flashlight',\n", " 'grace', 'icarus', 'lafa', 'making', 'neck', 'pharaohs', 'redeem',\n", " 'seeb', 'soundtrack', 'talons', 'unanswered', 'wide'], dtype=object)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[0::100]" ] }, { "cell_type": 
"markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Let's find the songs whose titles contain the word _earth_. " ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [ { "data": { "text/plain": [ "506" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "earth_index_vocab = np.where(vocab == \"earth\")[0][0]\n", "earth_index_vocab" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "532" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "earth_index_in_df = len(numeric_feats) + len(passthrough_feats) + len(ohe_feat_names) + earth_index_vocab\n", "earth_index_in_df" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dutchiedynamiteeartheasy
3800.00.01.00.0
6390.00.01.00.0
\n", "
" ], "text/plain": [ " dutchie dynamite earth easy\n", "380 0.0 0.0 1.0 0.0\n", "639 0.0 0.0 1.0 0.0" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "earth_songs = df[df.iloc[:, earth_index_in_df] == 1]\n", "earth_songs.iloc[:, earth_index_in_df - 2 : earth_index_in_df + 2]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index([380, 639], dtype='int64')" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "earth_songs.index" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1851 Softest Place On Earth\n", "1948 Earth Song - Remastered Version\n", "Name: song_title, dtype: object" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.iloc[earth_songs.index][\"song_title\"]" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### Model building " ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.000 (+/- 0.000)0.000 (+/- 0.000)0.508 (+/- 0.001)0.508 (+/- 0.000)
Decision Tree (no_text)0.016 (+/- 0.000)0.003 (+/- 0.000)0.688 (+/- 0.023)1.000 (+/- 0.000)
KNN (no_text)0.005 (+/- 0.001)0.015 (+/- 0.020)0.676 (+/- 0.028)0.788 (+/- 0.009)
SVM (no_text)0.054 (+/- 0.004)0.021 (+/- 0.001)0.737 (+/- 0.017)0.806 (+/- 0.011)
Decision Tree (text)0.035 (+/- 0.002)0.005 (+/- 0.001)0.700 (+/- 0.027)1.000 (+/- 0.000)
KNN (text)0.012 (+/- 0.002)0.031 (+/- 0.004)0.682 (+/- 0.028)0.786 (+/- 0.010)
SVM (text)0.059 (+/- 0.003)0.014 (+/- 0.001)0.733 (+/- 0.027)0.866 (+/- 0.004)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.000 (+/- 0.000) 0.000 (+/- 0.000) \n", "Decision Tree (no_text) 0.016 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN (no_text) 0.005 (+/- 0.001) 0.015 (+/- 0.020) \n", "SVM (no_text) 0.054 (+/- 0.004) 0.021 (+/- 0.001) \n", "Decision Tree (text) 0.035 (+/- 0.002) 0.005 (+/- 0.001) \n", "KNN (text) 0.012 (+/- 0.002) 0.031 (+/- 0.004) \n", "SVM (text) 0.059 (+/- 0.003) 0.014 (+/- 0.001) \n", "\n", " test_score train_score \n", "dummy 0.508 (+/- 0.001) 0.508 (+/- 0.000) \n", "Decision Tree (no_text) 0.688 (+/- 0.023) 1.000 (+/- 0.000) \n", "KNN (no_text) 0.676 (+/- 0.028) 0.788 (+/- 0.009) \n", "SVM (no_text) 0.737 (+/- 0.017) 0.806 (+/- 0.011) \n", "Decision Tree (text) 0.700 (+/- 0.027) 1.000 (+/- 0.000) \n", "KNN (text) 0.682 (+/- 0.028) 0.786 (+/- 0.010) \n", "SVM (text) 0.733 (+/- 0.027) 0.866 (+/- 0.004) " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "models = {\n", " \"Decision Tree\": DecisionTreeClassifier(),\n", " \"KNN\": KNeighborsClassifier(),\n", " \"SVM\": SVC() \n", "}\n", "\n", "for (name, model) in models.items():\n", " pipe_model = make_pipeline(preprocessor, model)\n", " results[name + \" (text)\"] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results).T" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "- Not a big difference in the results. \n", "- Seems like there is more overfitting when we included the `song_title` feature.\n", "- The training score of SVC is much higher when we include all features. Hyperparameter optimization of `C` and `gamma` may help. " ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "- What about the `artist` column?\n", "- Does it make sense to apply BOW encoding to it? \n", "- Let's look at the distribution of values in the `artist` column. 
" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "artist\n", "Drake 14\n", "Disclosure 12\n", "Rick Ross 11\n", "WALK THE MOON 10\n", "Crystal Castles 8\n", " ..\n", "Classixx 1\n", "Jordan Feliz 1\n", "Travis Hayes 1\n", "The Silvertones 1\n", "Phil Woods 1\n", "Name: count, Length: 1131, dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train['artist'].value_counts()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "artist\n", "Drake 14\n", "Disclosure 12\n", "Rick Ross 11\n", "WALK THE MOON 10\n", "Crystal Castles 8\n", "Big Time Rush 8\n", "FIDLAR 8\n", "Fall Out Boy 8\n", "Demi Lovato 7\n", "Kanye West 7\n", "Kina Grannis 7\n", "Backstreet Boys 7\n", "Beach House 6\n", "Young Thug 6\n", "*NSYNC 6\n", "Name: count, dtype: int64" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "most_frequent = X_train[\"artist\"].value_counts().iloc[:15]\n", "most_frequent" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "preprocessor_artist = make_column_transformer(\n", " (StandardScaler(), numeric_feats), \n", " (\"passthrough\", passthrough_feats), \n", " (OneHotEncoder(handle_unknown = \"ignore\"), categorical_feats),\n", " (OneHotEncoder(dtype=int, handle_unknown=\"ignore\", categories=[most_frequent.index.values]), artist_cat_feat),\n", " (CountVectorizer(max_features = 100, stop_words=\"english\"), text_feat)\n", ")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.000 (+/- 0.000)0.000 (+/- 0.000)0.508 (+/- 0.001)0.508 (+/- 0.000)
Decision Tree (no_text)0.016 (+/- 0.000)0.003 (+/- 0.000)0.688 (+/- 0.023)1.000 (+/- 0.000)
KNN (no_text)0.005 (+/- 0.001)0.015 (+/- 0.020)0.676 (+/- 0.028)0.788 (+/- 0.009)
SVM (no_text)0.054 (+/- 0.004)0.021 (+/- 0.001)0.737 (+/- 0.017)0.806 (+/- 0.011)
Decision Tree (text)0.035 (+/- 0.002)0.005 (+/- 0.001)0.700 (+/- 0.027)1.000 (+/- 0.000)
KNN (text)0.012 (+/- 0.002)0.031 (+/- 0.004)0.682 (+/- 0.028)0.786 (+/- 0.010)
SVM (text)0.059 (+/- 0.003)0.014 (+/- 0.001)0.733 (+/- 0.027)0.866 (+/- 0.004)
Decision Tree (all)0.028 (+/- 0.001)0.005 (+/- 0.001)0.684 (+/- 0.035)1.000 (+/- 0.000)
KNN (all)0.012 (+/- 0.000)0.026 (+/- 0.001)0.681 (+/- 0.032)0.792 (+/- 0.008)
SVM (all)0.052 (+/- 0.004)0.013 (+/- 0.000)0.741 (+/- 0.027)0.833 (+/- 0.006)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.000 (+/- 0.000) 0.000 (+/- 0.000) \n", "Decision Tree (no_text) 0.016 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN (no_text) 0.005 (+/- 0.001) 0.015 (+/- 0.020) \n", "SVM (no_text) 0.054 (+/- 0.004) 0.021 (+/- 0.001) \n", "Decision Tree (text) 0.035 (+/- 0.002) 0.005 (+/- 0.001) \n", "KNN (text) 0.012 (+/- 0.002) 0.031 (+/- 0.004) \n", "SVM (text) 0.059 (+/- 0.003) 0.014 (+/- 0.001) \n", "Decision Tree (all) 0.028 (+/- 0.001) 0.005 (+/- 0.001) \n", "KNN (all) 0.012 (+/- 0.000) 0.026 (+/- 0.001) \n", "SVM (all) 0.052 (+/- 0.004) 0.013 (+/- 0.000) \n", "\n", " test_score train_score \n", "dummy 0.508 (+/- 0.001) 0.508 (+/- 0.000) \n", "Decision Tree (no_text) 0.688 (+/- 0.023) 1.000 (+/- 0.000) \n", "KNN (no_text) 0.676 (+/- 0.028) 0.788 (+/- 0.009) \n", "SVM (no_text) 0.737 (+/- 0.017) 0.806 (+/- 0.011) \n", "Decision Tree (text) 0.700 (+/- 0.027) 1.000 (+/- 0.000) \n", "KNN (text) 0.682 (+/- 0.028) 0.786 (+/- 0.010) \n", "SVM (text) 0.733 (+/- 0.027) 0.866 (+/- 0.004) \n", "Decision Tree (all) 0.684 (+/- 0.035) 1.000 (+/- 0.000) \n", "KNN (all) 0.681 (+/- 0.032) 0.792 (+/- 0.008) \n", "SVM (all) 0.741 (+/- 0.027) 0.833 (+/- 0.006) " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "models = {\n", " \"Decision Tree\": DecisionTreeClassifier(),\n", " \"KNN\": KNeighborsClassifier(),\n", " \"SVM\": SVC() \n", "}\n", "\n", "for (name, model) in models.items():\n", " pipe_model = make_pipeline(preprocessor_artist, model)\n", " results[name + \" (all)\"] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tiny bit improvement in the mean CV scores but we are still overfitting. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "



" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## (Optional) Incorporating text features in the restaurant survey dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do you recall [the restaurants survey](https://ubc.ca1.qualtrics.com/jfe/form/SV_73VuZiuwM1eDVrw) you completed at the start of the course?\n", "\n", "Let's use that data for this demo. You'll find a [wrangled version](https://github.ubc.ca/MDS-2023-24/DSCI_571_sup-learn-1_students/blob/master/lectures/data/cleaned_restaurant_data.csv) in the course repository." ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/cleaned_restaurant_data.csv')" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
north_americaeat_out_freqagen_peoplepricefood_typenoise_levelgood_servercommentsrestaurant_nametarget
0Yes3.02910.0120.0ItalianmediumYesAmbienceNaNdislike
1Yes2.0233.020.0Canadian/Americanno musicNofood tastes badNaNdislike
2Yes2.02120.015.0ChinesemediumYesbad foodNaNdislike
3No2.02414.018.0OthermediumNoOverall vibe on the restaurantNaNdislike
4Yes5.02330.020.0ChinesemediumYesA bad dayNaNdislike
....................................
959No10.022NaNNaNNaNNaNNaNNaNNaNlike
960Yes1.020NaNNaNNaNNaNNaNNaNNaNlike
961No1.02240.050.0ChinesemediumYesThe self service sauce table is very clean and the sauces were always filled up.Haidilaolike
962Yes3.021NaNNaNNaNNaNNaNNaNNaNlike
963Yes3.02720.022.0OthermediumYesLots of meat that was very soft and tasty. Hearty and amazing broth. Good noodle thickness and consistencyUno Beef Noodlelike
\n", "

964 rows × 11 columns

\n", "
" ], "text/plain": [ " north_america eat_out_freq age n_people price food_type \\\n", "0 Yes 3.0 29 10.0 120.0 Italian \n", "1 Yes 2.0 23 3.0 20.0 Canadian/American \n", "2 Yes 2.0 21 20.0 15.0 Chinese \n", "3 No 2.0 24 14.0 18.0 Other \n", "4 Yes 5.0 23 30.0 20.0 Chinese \n", ".. ... ... ... ... ... ... \n", "959 No 10.0 22 NaN NaN NaN \n", "960 Yes 1.0 20 NaN NaN NaN \n", "961 No 1.0 22 40.0 50.0 Chinese \n", "962 Yes 3.0 21 NaN NaN NaN \n", "963 Yes 3.0 27 20.0 22.0 Other \n", "\n", " noise_level good_server \\\n", "0 medium Yes \n", "1 no music No \n", "2 medium Yes \n", "3 medium No \n", "4 medium Yes \n", ".. ... ... \n", "959 NaN NaN \n", "960 NaN NaN \n", "961 medium Yes \n", "962 NaN NaN \n", "963 medium Yes \n", "\n", " comments \\\n", "0 Ambience \n", "1 food tastes bad \n", "2 bad food \n", "3 Overall vibe on the restaurant \n", "4 A bad day \n", ".. ... \n", "959 NaN \n", "960 NaN \n", "961 The self service sauce table is very clean and the sauces were always filled up. \n", "962 NaN \n", "963 Lots of meat that was very soft and tasty. Hearty and amazing broth. Good noodle thickness and consistency \n", "\n", " restaurant_name target \n", "0 NaN dislike \n", "1 NaN dislike \n", "2 NaN dislike \n", "3 NaN dislike \n", "4 NaN dislike \n", ".. ... ... \n", "959 NaN like \n", "960 NaN like \n", "961 Haidilao like \n", "962 NaN like \n", "963 Uno Beef Noodle like \n", "\n", "[964 rows x 11 columns]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eat_out_freqagen_peopleprice
count964.000000964.0000006.960000e+02696.000000
mean2.58518723.9751041.439254e+041472.179152
std2.2464864.5567163.790481e+0537903.575636
min0.00000010.000000-2.000000e+000.000000
25%1.00000021.0000001.000000e+0118.000000
50%2.00000022.0000002.000000e+0125.000000
75%3.00000026.0000003.000000e+0140.000000
max15.00000046.0000001.000000e+071000000.000000
\n", "
" ], "text/plain": [ " eat_out_freq age n_people price\n", "count 964.000000 964.000000 6.960000e+02 696.000000\n", "mean 2.585187 23.975104 1.439254e+04 1472.179152\n", "std 2.246486 4.556716 3.790481e+05 37903.575636\n", "min 0.000000 10.000000 -2.000000e+00 0.000000\n", "25% 1.000000 21.000000 1.000000e+01 18.000000\n", "50% 2.000000 22.000000 2.000000e+01 25.000000\n", "75% 3.000000 26.000000 3.000000e+01 40.000000\n", "max 15.000000 46.000000 1.000000e+07 1000000.000000" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Are there any unusual values in this data that you notice?\n", "Let's get rid of these outliers. " ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(942, 11)" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "upperbound_price = 200\n", "lowerbound_people = 1\n", "# Negated comparisons (instead of <= / >=) keep the rows where the value is missing (NaN).\n", "df = df[~(df['price'] > upperbound_price)]\n", "restaurant_df = df[~(df['n_people'] < lowerbound_people)]\n", "restaurant_df.shape" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eat_out_freqagen_peopleprice
count942.000000942.000000674.000000674.000000
mean2.59805723.99256924.97329434.023279
std2.2577874.58257022.01666029.018622
min0.00000010.0000001.0000000.000000
25%1.00000021.00000010.00000018.000000
50%2.00000022.00000020.00000025.000000
75%3.00000026.00000030.00000040.000000
max15.00000046.000000200.000000200.000000
\n", "
" ], "text/plain": [ " eat_out_freq age n_people price\n", "count 942.000000 942.000000 674.000000 674.000000\n", "mean 2.598057 23.992569 24.973294 34.023279\n", "std 2.257787 4.582570 22.016660 29.018622\n", "min 0.000000 10.000000 1.000000 0.000000\n", "25% 1.000000 21.000000 10.000000 18.000000\n", "50% 2.000000 22.000000 20.000000 25.000000\n", "75% 3.000000 26.000000 30.000000 40.000000\n", "max 15.000000 46.000000 200.000000 200.000000" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "restaurant_df.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Data splitting \n", "\n", "We aim to predict whether a restaurant is liked or disliked." ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "# Separate `X` and `y`. \n", "\n", "X = restaurant_df.drop(columns=['target'])\n", "y = restaurant_df['target']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Below I'm perturbing this data just to demonstrate a few concepts. Don't do it in real life. " ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "X.at[459, 'food_type'] = 'Quebecois'\n", "X['price'] = X['price'] * 100" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "# Split the data\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### EDA " ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "X_train.hist(bins=20, figsize=(12, 8));" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do you see anything interesting in these plots? " ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "food_type\n", "Other 189\n", "Canadian/American 131\n", "Chinese 102\n", "Indian 36\n", "Italian 32\n", "Thai 20\n", "Fusion 18\n", "Mexican 17\n", "fusion 3\n", "Quebecois 1\n", "Name: count, dtype: int64" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train['food_type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Error in data collection? The \"Fusion\" and \"fusion\" categories should probably be combined." ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "X_train['food_type'] = X_train['food_type'].replace(\"fusion\", \"Fusion\")\n", "X_test['food_type'] = X_test['food_type'].replace(\"fusion\", \"Fusion\")" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "food_type\n", "Other 189\n", "Canadian/American 131\n", "Chinese 102\n", "Indian 36\n", "Italian 32\n", "Fusion 21\n", "Thai 20\n", "Mexican 17\n", "Quebecois 1\n", "Name: count, dtype: int64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train['food_type'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again, usually we should spend lots of time in EDA, but let's stop here so that we have time to learn about transformers and pipelines. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dummy Classifier" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
dummy
fit_time0.001 (+/- 0.000)
score_time0.001 (+/- 0.000)
test_score0.515 (+/- 0.002)
train_score0.515 (+/- 0.000)
\n", "
" ], "text/plain": [ " dummy\n", "fit_time 0.001 (+/- 0.000)\n", "score_time 0.001 (+/- 0.000)\n", "test_score 0.515 (+/- 0.002)\n", "train_score 0.515 (+/- 0.000)" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.dummy import DummyClassifier\n", "\n", "results_df = {}\n", "dummy = DummyClassifier()\n", "results_df['dummy'] = mean_std_cross_val_scores(dummy, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have a relatively balanced distribution of both 'like' and 'dislike' classes." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How can we horizontally stack \n", "- preprocessed numeric features, \n", "- preprocessed binary features, \n", "- preprocessed ordinal features, and \n", "- preprocessed categorical features?\n", "\n", "Let's define a column transformer. " ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "numeric_feats = ['age', 'n_people', 'price'] # Continuous and quantitative features\n", "categorical_feats = ['north_america', 'food_type'] # Discrete and qualitative features\n", "binary_feats = ['good_server'] # Categorical features with only two possible values \n", "ordinal_feats = ['noise_level'] # Some natural ordering in the categories \n", "noise_cats = ['no music', 'low', 'medium', 'high', 'crazy loud']\n", "drop_feats = ['comments', 'restaurant_name', 'eat_out_freq'] # Dropping text feats and `eat_out_freq` because it's not that useful" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "noise_level\n", "medium 232\n", "low 186\n", "high 75\n", "no music 37\n", "crazy loud 18\n", "Name: count, dtype: int64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train['noise_level'].value_counts()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "noise_levels = [\"no music\", \"low\", \"medium\", \"high\", \"crazy loud\"]" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "from sklearn.impute import SimpleImputer\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.preprocessing import OrdinalEncoder\n", "\n", "from sklearn.compose import make_column_transformer\n", "\n", "numeric_transformer = 
make_pipeline(SimpleImputer(strategy=\"median\"),\n", " StandardScaler()) \n", "binary_transformer = make_pipeline(SimpleImputer(strategy=\"most_frequent\"), \n", " OneHotEncoder(drop=\"if_binary\"))\n", "ordinal_transformer = make_pipeline(SimpleImputer(strategy=\"most_frequent\"), \n", " OrdinalEncoder(categories=[noise_levels]))\n", "categorical_transformer = make_pipeline(SimpleImputer(strategy=\"most_frequent\"), \n", " OneHotEncoder(sparse_output=False, handle_unknown=\"ignore\"))\n", "\n", "preprocessor = make_column_transformer(\n", " (numeric_transformer, numeric_feats), \n", " (binary_transformer, binary_feats), \n", " (ordinal_transformer, ordinal_feats),\n", " (categorical_transformer, categorical_feats),\n", " (\"drop\", drop_feats)\n", ")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What does the transformed data look like? " ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(753, 17)" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed = preprocessor.fit_transform(X_train)\n", "transformed.shape" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
ColumnTransformer(transformers=[('pipeline-1',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='median')),\n",
       "                                                 ('standardscaler',\n",
       "                                                  StandardScaler())]),\n",
       "                                 ['age', 'n_people', 'price']),\n",
       "                                ('pipeline-2',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='most_frequent')),\n",
       "                                                 ('onehotencoder',\n",
       "                                                  OneHotEncoder(drop='if_binary'))]),\n",
       "                                 ['good_server']),\n",
       "                                ('pipeline-3',...\n",
       "                                                  OrdinalEncoder(categories=[['no '\n",
       "                                                                              'music',\n",
       "                                                                              'low',\n",
       "                                                                              'medium',\n",
       "                                                                              'high',\n",
       "                                                                              'crazy '\n",
       "                                                                              'loud']]))]),\n",
       "                                 ['noise_level']),\n",
       "                                ('pipeline-4',\n",
       "                                 Pipeline(steps=[('simpleimputer',\n",
       "                                                  SimpleImputer(strategy='most_frequent')),\n",
       "                                                 ('onehotencoder',\n",
       "                                                  OneHotEncoder(handle_unknown='ignore',\n",
       "                                                                sparse_output=False))]),\n",
       "                                 ['north_america', 'food_type']),\n",
       "                                ('drop', 'drop',\n",
       "                                 ['comments', 'restaurant_name',\n",
       "                                  'eat_out_freq'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "ColumnTransformer(transformers=[('pipeline-1',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='median')),\n", " ('standardscaler',\n", " StandardScaler())]),\n", " ['age', 'n_people', 'price']),\n", " ('pipeline-2',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(drop='if_binary'))]),\n", " ['good_server']),\n", " ('pipeline-3',...\n", " OrdinalEncoder(categories=[['no '\n", " 'music',\n", " 'low',\n", " 'medium',\n", " 'high',\n", " 'crazy '\n", " 'loud']]))]),\n", " ['noise_level']),\n", " ('pipeline-4',\n", " Pipeline(steps=[('simpleimputer',\n", " SimpleImputer(strategy='most_frequent')),\n", " ('onehotencoder',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['north_america', 'food_type']),\n", " ('drop', 'drop',\n", " ['comments', 'restaurant_name',\n", " 'eat_out_freq'])])" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"north_america_Don't want to share\",\n", " 'north_america_No',\n", " 'north_america_Yes',\n", " 'food_type_Canadian/American',\n", " 'food_type_Chinese',\n", " 'food_type_Fusion',\n", " 'food_type_Indian',\n", " 'food_type_Italian',\n", " 'food_type_Mexican',\n", " 'food_type_Other',\n", " 'food_type_Quebecois',\n", " 'food_type_Thai']" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Getting feature names from a column transformer\n", "ohe_feat_names = preprocessor.named_transformers_['pipeline-4']['onehotencoder'].get_feature_names_out(categorical_feats).tolist()\n", "ohe_feat_names" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['age', 'n_people', 'price']" ] }, "execution_count": 64, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "numeric_feats" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "feat_names = numeric_feats + binary_feats + ordinal_feats + ohe_feat_names" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.66941678, 0.31029469, -0.36840629, ..., 0. ,\n", " 0. , 0. ],\n", " [-0.66941678, 0.31029469, -0.05422496, ..., 0. ,\n", " 0. , 0. ],\n", " [-0.89515383, 0.82336432, -0.25058829, ..., 0. ,\n", " 0. , 0. ],\n", " ...,\n", " [-0.89515383, -0.97237936, -0.64331495, ..., 0. ,\n", " 0. , 0. ],\n", " [-0.89515383, -0.20277493, -0.25058829, ..., 1. ,\n", " 0. , 0. ],\n", " [-0.89515383, 1.33643394, -0.05422496, ..., 0. ,\n", " 0. , 0. ]])" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agen_peoplepricegood_servernoise_levelnorth_america_Don't want to sharenorth_america_Nonorth_america_Yesfood_type_Canadian/Americanfood_type_Chinesefood_type_Fusionfood_type_Indianfood_type_Italianfood_type_Mexicanfood_type_Otherfood_type_Quebecoisfood_type_Thai
0-0.6694170.310295-0.3684060.03.00.01.00.00.01.00.00.00.00.00.00.00.0
1-0.6694170.310295-0.0542251.01.00.00.01.01.00.00.00.00.00.00.00.00.0
2-0.8951540.823364-0.2505881.02.00.01.00.01.00.00.00.00.00.00.00.00.0
3-0.669417-0.202775-0.2505881.02.00.00.01.00.00.00.00.00.00.00.01.00.0
40.007794-0.202775-0.0542251.03.00.00.01.00.00.00.01.00.00.00.00.00.0
......................................................
7480.685006-0.715845-0.6433151.02.00.01.00.00.01.00.00.00.00.00.00.00.0
7490.007794-0.613231-0.9182241.02.00.01.00.00.00.00.00.00.00.01.00.00.0
750-0.895154-0.972379-0.6433150.01.00.00.01.01.00.00.00.00.00.00.00.00.0
751-0.895154-0.202775-0.2505881.02.00.00.01.00.00.00.00.00.00.01.00.00.0
752-0.8951541.336434-0.0542251.03.01.00.00.00.01.00.00.00.00.00.00.00.0
\n", "

753 rows × 17 columns

\n", "
" ], "text/plain": [ " age n_people price good_server noise_level \\\n", "0 -0.669417 0.310295 -0.368406 0.0 3.0 \n", "1 -0.669417 0.310295 -0.054225 1.0 1.0 \n", "2 -0.895154 0.823364 -0.250588 1.0 2.0 \n", "3 -0.669417 -0.202775 -0.250588 1.0 2.0 \n", "4 0.007794 -0.202775 -0.054225 1.0 3.0 \n", ".. ... ... ... ... ... \n", "748 0.685006 -0.715845 -0.643315 1.0 2.0 \n", "749 0.007794 -0.613231 -0.918224 1.0 2.0 \n", "750 -0.895154 -0.972379 -0.643315 0.0 1.0 \n", "751 -0.895154 -0.202775 -0.250588 1.0 2.0 \n", "752 -0.895154 1.336434 -0.054225 1.0 3.0 \n", "\n", " north_america_Don't want to share north_america_No north_america_Yes \\\n", "0 0.0 1.0 0.0 \n", "1 0.0 0.0 1.0 \n", "2 0.0 1.0 0.0 \n", "3 0.0 0.0 1.0 \n", "4 0.0 0.0 1.0 \n", ".. ... ... ... \n", "748 0.0 1.0 0.0 \n", "749 0.0 1.0 0.0 \n", "750 0.0 0.0 1.0 \n", "751 0.0 0.0 1.0 \n", "752 1.0 0.0 0.0 \n", "\n", " food_type_Canadian/American food_type_Chinese food_type_Fusion \\\n", "0 0.0 1.0 0.0 \n", "1 1.0 0.0 0.0 \n", "2 1.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 \n", ".. ... ... ... \n", "748 0.0 1.0 0.0 \n", "749 0.0 0.0 0.0 \n", "750 1.0 0.0 0.0 \n", "751 0.0 0.0 0.0 \n", "752 0.0 1.0 0.0 \n", "\n", " food_type_Indian food_type_Italian food_type_Mexican food_type_Other \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 1.0 0.0 0.0 0.0 \n", ".. ... ... ... ... \n", "748 0.0 0.0 0.0 0.0 \n", "749 0.0 0.0 0.0 1.0 \n", "750 0.0 0.0 0.0 0.0 \n", "751 0.0 0.0 0.0 1.0 \n", "752 0.0 0.0 0.0 0.0 \n", "\n", " food_type_Quebecois food_type_Thai \n", "0 0.0 0.0 \n", "1 0.0 0.0 \n", "2 0.0 0.0 \n", "3 1.0 0.0 \n", "4 0.0 0.0 \n", ".. ... ... 
\n", "748 0.0 0.0 \n", "749 0.0 0.0 \n", "750 0.0 0.0 \n", "751 0.0 0.0 \n", "752 0.0 0.0 \n", "\n", "[753 rows x 17 columns]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(transformed, columns = feat_names)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have new columns for the categorical features. Let's create a pipeline with the preprocessor and SVC. " ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.001 (+/- 0.000)0.515 (+/- 0.002)0.515 (+/- 0.000)
Decision Tree (numeric-only)0.003 (+/- 0.000)0.001 (+/- 0.000)0.497 (+/- 0.038)0.833 (+/- 0.010)
KNN (numeric-only)0.003 (+/- 0.001)0.004 (+/- 0.000)0.525 (+/- 0.034)0.674 (+/- 0.015)
SVM (numeric-only)0.012 (+/- 0.000)0.005 (+/- 0.000)0.587 (+/- 0.033)0.623 (+/- 0.006)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.001 (+/- 0.000) 0.001 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.003 (+/- 0.000) 0.001 (+/- 0.000) \n", "KNN (numeric-only) 0.003 (+/- 0.001) 0.004 (+/- 0.000) \n", "SVM (numeric-only) 0.012 (+/- 0.000) 0.005 (+/- 0.000) \n", "\n", " test_score train_score \n", "dummy 0.515 (+/- 0.002) 0.515 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.497 (+/- 0.038) 0.833 (+/- 0.010) \n", "KNN (numeric-only) 0.525 (+/- 0.034) 0.674 (+/- 0.015) \n", "SVM (numeric-only) 0.587 (+/- 0.033) 0.623 (+/- 0.006) " ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.svm import SVC\n", "models = {\n", " \"Decision Tree\": DecisionTreeClassifier(),\n", " \"KNN\": KNeighborsClassifier(),\n", " \"SVM\": SVC() \n", "}\n", "\n", "for (name, model) in models.items():\n", " pipe_num_model = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler(), model)\n", " results_df[name +' (numeric-only)'] = mean_std_cross_val_scores(pipe_num_model, X_train[numeric_feats], y_train, return_train_score=True)\n", "pd.DataFrame(results_df).T" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.001 (+/- 0.000)0.515 (+/- 0.002)0.515 (+/- 0.000)
Decision Tree (numeric-only)0.003 (+/- 0.000)0.001 (+/- 0.000)0.497 (+/- 0.038)0.833 (+/- 0.010)
KNN (numeric-only)0.003 (+/- 0.001)0.004 (+/- 0.000)0.525 (+/- 0.034)0.674 (+/- 0.015)
SVM (numeric-only)0.012 (+/- 0.000)0.005 (+/- 0.000)0.587 (+/- 0.033)0.623 (+/- 0.006)
Decision Tree(non-text feats)0.009 (+/- 0.000)0.003 (+/- 0.000)0.590 (+/- 0.039)0.889 (+/- 0.008)
KNN(non-text feats)0.008 (+/- 0.000)0.004 (+/- 0.000)0.598 (+/- 0.023)0.737 (+/- 0.008)
SVM(non-text feats)0.019 (+/- 0.000)0.008 (+/- 0.000)0.687 (+/- 0.011)0.733 (+/- 0.008)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.001 (+/- 0.000) 0.001 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.003 (+/- 0.000) 0.001 (+/- 0.000) \n", "KNN (numeric-only) 0.003 (+/- 0.001) 0.004 (+/- 0.000) \n", "SVM (numeric-only) 0.012 (+/- 0.000) 0.005 (+/- 0.000) \n", "Decision Tree(non-text feats) 0.009 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN(non-text feats) 0.008 (+/- 0.000) 0.004 (+/- 0.000) \n", "SVM(non-text feats) 0.019 (+/- 0.000) 0.008 (+/- 0.000) \n", "\n", " test_score train_score \n", "dummy 0.515 (+/- 0.002) 0.515 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.497 (+/- 0.038) 0.833 (+/- 0.010) \n", "KNN (numeric-only) 0.525 (+/- 0.034) 0.674 (+/- 0.015) \n", "SVM (numeric-only) 0.587 (+/- 0.033) 0.623 (+/- 0.006) \n", "Decision Tree(non-text feats) 0.590 (+/- 0.039) 0.889 (+/- 0.008) \n", "KNN(non-text feats) 0.598 (+/- 0.023) 0.737 (+/- 0.008) \n", "SVM(non-text feats) 0.687 (+/- 0.011) 0.733 (+/- 0.008) " ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for (name, model) in models.items():\n", " pipe_model = make_pipeline(preprocessor, model)\n", " results_df[name + '(non-text feats)'] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results_df).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are getting better results when we include numeric, categorical, binary, ordinal features. \n", "


" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Incorporating text features \n", "\n", "We haven't incorporated the comments feature into our pipeline yet, even though it holds significant value in indicating whether the restaurant was liked or not." ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
north_americaeat_out_freqagen_peoplepricefood_typenoise_levelgood_servercommentsrestaurant_name
80No2.02130.02200.0ChinesehighNoThe environment was very not clean. The food tasted awful.NaN
934Yes4.02130.03000.0Canadian/AmericanlowYesThe building and the room gave a very comfy feeling. Immediately after sitting down it felt like we were right at home.NaN
911No4.02040.02500.0Canadian/AmericanmediumYesI was hungryChambar
459Yes5.021NaNNaNQuebecoisNaNNaNNaNNaN
62Yes2.02420.03000.0IndianhighYesbad tasteeast is east
.................................
106No3.02710.01500.0ChinesemediumYesFood wasn't great.NaN
333No1.02412.0800.0OthermediumYesNaNNaN
393Yes4.0205.01500.0Canadian/AmericanlowNoNaNNaN
376Yes5.020NaNNaNNaNNaNNaNNaNNaN
525Don't want to share4.02050.03000.0ChinesehighYesNaNHaidilao
\n", "

753 rows × 10 columns

\n", "
" ], "text/plain": [ " north_america eat_out_freq age n_people price \\\n", "80 No 2.0 21 30.0 2200.0 \n", "934 Yes 4.0 21 30.0 3000.0 \n", "911 No 4.0 20 40.0 2500.0 \n", "459 Yes 5.0 21 NaN NaN \n", "62 Yes 2.0 24 20.0 3000.0 \n", ".. ... ... ... ... ... \n", "106 No 3.0 27 10.0 1500.0 \n", "333 No 1.0 24 12.0 800.0 \n", "393 Yes 4.0 20 5.0 1500.0 \n", "376 Yes 5.0 20 NaN NaN \n", "525 Don't want to share 4.0 20 50.0 3000.0 \n", "\n", " food_type noise_level good_server \\\n", "80 Chinese high No \n", "934 Canadian/American low Yes \n", "911 Canadian/American medium Yes \n", "459 Quebecois NaN NaN \n", "62 Indian high Yes \n", ".. ... ... ... \n", "106 Chinese medium Yes \n", "333 Other medium Yes \n", "393 Canadian/American low No \n", "376 NaN NaN NaN \n", "525 Chinese high Yes \n", "\n", " comments \\\n", "80 The environment was very not clean. The food tasted awful. \n", "934 The building and the room gave a very comfy feeling. Immediately after sitting down it felt like we were right at home. \n", "911 I was hungry \n", "459 NaN \n", "62 bad taste \n", ".. ... \n", "106 Food wasn't great. \n", "333 NaN \n", "393 NaN \n", "376 NaN \n", "525 NaN \n", "\n", " restaurant_name \n", "80 NaN \n", "934 NaN \n", "911 Chambar \n", "459 NaN \n", "62 east is east \n", ".. ... \n", "106 NaN \n", "333 NaN \n", "393 NaN \n", "376 NaN \n", "525 Haidilao \n", "\n", "[753 rows x 10 columns]" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's create a bag-of-words representation of the `comments` feature. But first we need to impute the rows where there are no comments. There is a small complication if we want to put `SimpleImputer` and `CountVectorizer` in a pipeline. \n", "- `SimpleImputer` takes a 2D array as input and produces a 2D array as output. \n", "- `CountVectorizer` takes a 1D array as input. 
\n", "\n", "To deal with this, we will use sklearn's `FunctionTransformer` to convert the 2D output of `SimpleImputer` into a 1D array which can be passed to `CountVectorizer` as input. " ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.6493951434878588" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import FunctionTransformer\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "reshape_for_countvectorizer = FunctionTransformer(lambda X: X.squeeze(), validate=False)\n", "text_transformer = make_pipeline(SimpleImputer(strategy=\"constant\", fill_value=\"missing\"), \n", " reshape_for_countvectorizer, \n", " CountVectorizer(stop_words=\"english\"))\n", "text_pipe = make_pipeline(text_transformer, SVC())\n", "cross_val_score(text_pipe, X_train[['comments']], y_train).mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pretty good scores just with text features! Let's examine the transformed data. " ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "transformed = text_transformer.fit_transform(X_train[['comments']], y_train)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<753x548 sparse matrix of type ''\n", "\twith 1841 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transformed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It's a sparse matrix. Let's explore the vocabulary. 
" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['18', '30', '40mins', '65', 'actually', 'addition', 'affordable',\n", " 'alcohol', 'ale', 'allergic'], dtype=object)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab = text_transformer.named_steps[\"countvectorizer\"].get_feature_names_out()\n", "vocab[:10]" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['18', '30', '40mins', '65', 'actually', 'addition', 'affordable',\n", " 'alcohol', 'ale', 'allergic'], dtype=object)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[0:10]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['fusion', 'games', 'gave', 'general', 'genuinely', 'getting',\n", " 'ginger', 'girlfriends', 'gluten', 'going'], dtype=object)" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[200:210]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['undressed', 'unfresh', 'uni', 'unique', 'unreasonable', 'upset',\n", " 'usual', 'uwu', 'value', 'vancouver', 'variety', 'vds', 've',\n", " 'vegan', 'vibe', 'vibes', 'vietnamese', 'view', 'visit', 'wait',\n", " 'waited', 'waiter', 'waiters', 'waiting', 'waitress', 'walking',\n", " 'want', 'warm', 'washrooms', 'wasn', 'water', 'watery', 'way',\n", " 'weekend', 'went', 'wet', 'wife', 'wind', 'window', 'wine',\n", " 'wings', 'winter', 'work', 'worst', 'wrong', 'yelling', 'yield',\n", " 'yummy'], dtype=object)" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[500:600]" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['18', 'ask', 
'better', 'cash', 'closing', 'country', 'dessert',\n", " 'drunk', 'expecting', 'figuring', 'fusion', 'having', 'impeccable',\n", " 'knowledgeable', 'love', 'nice', 'pain', 'played', 'quality',\n", " 'removed', 'sauces', 'sitting', 'spoke', 'tacky', 'time',\n", " 'undressed', 'waited', 'wings'], dtype=object)" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab[0::20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "

" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Do we get better scores if we combine all features? Let's define a column transformer which carries out: \n", "- imputation and scaling on numeric features\n", "- imputation and one-hot encoding with `drop=\"if_binary\"` on binary features\n", "- ordinal encoding on the ordinal feature\n", "- imputation and one-hot encoding with `handle_unknown=\"ignore\"` on categorical features\n", "- imputation, reshaping, and bag-of-words transformation on the text feature" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "text_feat = ['comments']\n", "\n", "preprocessor_all = make_column_transformer(\n", " (numeric_transformer, numeric_feats), \n", " (binary_transformer, binary_feats), \n", " (ordinal_transformer, ordinal_feats),\n", " (categorical_transformer, categorical_feats),\n", " (text_transformer, text_feat), \n", " (\"drop\", drop_feats)\n", ")" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<753x565 sparse matrix of type ''\n", "\twith 6927 stored elements in Compressed Sparse Row format>" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor_all.fit_transform(X_train)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.001 (+/- 0.000)0.515 (+/- 0.002)0.515 (+/- 0.000)
Decision Tree (numeric-only)0.003 (+/- 0.000)0.001 (+/- 0.000)0.497 (+/- 0.038)0.833 (+/- 0.010)
KNN (numeric-only)0.003 (+/- 0.001)0.004 (+/- 0.000)0.525 (+/- 0.034)0.674 (+/- 0.015)
SVM (numeric-only)0.012 (+/- 0.000)0.005 (+/- 0.000)0.587 (+/- 0.033)0.623 (+/- 0.006)
Decision Tree(non-text feats)0.009 (+/- 0.000)0.003 (+/- 0.000)0.590 (+/- 0.039)0.889 (+/- 0.008)
KNN(non-text feats)0.008 (+/- 0.000)0.004 (+/- 0.000)0.598 (+/- 0.023)0.737 (+/- 0.008)
SVM(non-text feats)0.019 (+/- 0.000)0.008 (+/- 0.000)0.687 (+/- 0.011)0.733 (+/- 0.008)
Decision Tree(text)0.008 (+/- 0.001)0.001 (+/- 0.000)0.618 (+/- 0.036)0.735 (+/- 0.004)
KNN(text)0.004 (+/- 0.000)0.006 (+/- 0.002)0.572 (+/- 0.023)0.646 (+/- 0.026)
SVM(text)0.010 (+/- 0.000)0.003 (+/- 0.000)0.649 (+/- 0.022)0.728 (+/- 0.005)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.001 (+/- 0.000) 0.001 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.003 (+/- 0.000) 0.001 (+/- 0.000) \n", "KNN (numeric-only) 0.003 (+/- 0.001) 0.004 (+/- 0.000) \n", "SVM (numeric-only) 0.012 (+/- 0.000) 0.005 (+/- 0.000) \n", "Decision Tree(non-text feats) 0.009 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN(non-text feats) 0.008 (+/- 0.000) 0.004 (+/- 0.000) \n", "SVM(non-text feats) 0.019 (+/- 0.000) 0.008 (+/- 0.000) \n", "Decision Tree(text) 0.008 (+/- 0.001) 0.001 (+/- 0.000) \n", "KNN(text) 0.004 (+/- 0.000) 0.006 (+/- 0.002) \n", "SVM(text) 0.010 (+/- 0.000) 0.003 (+/- 0.000) \n", "\n", " test_score train_score \n", "dummy 0.515 (+/- 0.002) 0.515 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.497 (+/- 0.038) 0.833 (+/- 0.010) \n", "KNN (numeric-only) 0.525 (+/- 0.034) 0.674 (+/- 0.015) \n", "SVM (numeric-only) 0.587 (+/- 0.033) 0.623 (+/- 0.006) \n", "Decision Tree(non-text feats) 0.590 (+/- 0.039) 0.889 (+/- 0.008) \n", "KNN(non-text feats) 0.598 (+/- 0.023) 0.737 (+/- 0.008) \n", "SVM(non-text feats) 0.687 (+/- 0.011) 0.733 (+/- 0.008) \n", "Decision Tree(text) 0.618 (+/- 0.036) 0.735 (+/- 0.004) \n", "KNN(text) 0.572 (+/- 0.023) 0.646 (+/- 0.026) \n", "SVM(text) 0.649 (+/- 0.022) 0.728 (+/- 0.005) " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for (name, model) in models.items():\n", " pipe_model = make_pipeline(text_transformer, model)\n", " results_df[name + '(text)'] = mean_std_cross_val_scores(pipe_model, X_train[['comments']], y_train, return_train_score=True)\n", "pd.DataFrame(results_df).T" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.001 (+/- 0.000)0.515 (+/- 0.002)0.515 (+/- 0.000)
Decision Tree (numeric-only)0.003 (+/- 0.000)0.001 (+/- 0.000)0.497 (+/- 0.038)0.833 (+/- 0.010)
KNN (numeric-only)0.003 (+/- 0.001)0.004 (+/- 0.000)0.525 (+/- 0.034)0.674 (+/- 0.015)
SVM (numeric-only)0.012 (+/- 0.000)0.005 (+/- 0.000)0.587 (+/- 0.033)0.623 (+/- 0.006)
Decision Tree(non-text feats)0.009 (+/- 0.000)0.003 (+/- 0.000)0.590 (+/- 0.039)0.889 (+/- 0.008)
KNN(non-text feats)0.008 (+/- 0.000)0.004 (+/- 0.000)0.598 (+/- 0.023)0.737 (+/- 0.008)
SVM(non-text feats)0.019 (+/- 0.000)0.008 (+/- 0.000)0.687 (+/- 0.011)0.733 (+/- 0.008)
Decision Tree(text)0.008 (+/- 0.001)0.001 (+/- 0.000)0.618 (+/- 0.036)0.735 (+/- 0.004)
KNN(text)0.004 (+/- 0.000)0.006 (+/- 0.002)0.572 (+/- 0.023)0.646 (+/- 0.026)
SVM(text)0.010 (+/- 0.000)0.003 (+/- 0.000)0.649 (+/- 0.022)0.728 (+/- 0.005)
Decision Tree(all)0.016 (+/- 0.001)0.005 (+/- 0.001)0.624 (+/- 0.022)0.893 (+/- 0.006)
KNN(all)0.013 (+/- 0.000)0.012 (+/- 0.001)0.625 (+/- 0.027)0.748 (+/- 0.015)
SVM(all)0.023 (+/- 0.000)0.008 (+/- 0.001)0.699 (+/- 0.017)0.786 (+/- 0.008)
\n", "
" ], "text/plain": [ " fit_time score_time \\\n", "dummy 0.001 (+/- 0.000) 0.001 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.003 (+/- 0.000) 0.001 (+/- 0.000) \n", "KNN (numeric-only) 0.003 (+/- 0.001) 0.004 (+/- 0.000) \n", "SVM (numeric-only) 0.012 (+/- 0.000) 0.005 (+/- 0.000) \n", "Decision Tree(non-text feats) 0.009 (+/- 0.000) 0.003 (+/- 0.000) \n", "KNN(non-text feats) 0.008 (+/- 0.000) 0.004 (+/- 0.000) \n", "SVM(non-text feats) 0.019 (+/- 0.000) 0.008 (+/- 0.000) \n", "Decision Tree(text) 0.008 (+/- 0.001) 0.001 (+/- 0.000) \n", "KNN(text) 0.004 (+/- 0.000) 0.006 (+/- 0.002) \n", "SVM(text) 0.010 (+/- 0.000) 0.003 (+/- 0.000) \n", "Decision Tree(all) 0.016 (+/- 0.001) 0.005 (+/- 0.001) \n", "KNN(all) 0.013 (+/- 0.000) 0.012 (+/- 0.001) \n", "SVM(all) 0.023 (+/- 0.000) 0.008 (+/- 0.001) \n", "\n", " test_score train_score \n", "dummy 0.515 (+/- 0.002) 0.515 (+/- 0.000) \n", "Decision Tree (numeric-only) 0.497 (+/- 0.038) 0.833 (+/- 0.010) \n", "KNN (numeric-only) 0.525 (+/- 0.034) 0.674 (+/- 0.015) \n", "SVM (numeric-only) 0.587 (+/- 0.033) 0.623 (+/- 0.006) \n", "Decision Tree(non-text feats) 0.590 (+/- 0.039) 0.889 (+/- 0.008) \n", "KNN(non-text feats) 0.598 (+/- 0.023) 0.737 (+/- 0.008) \n", "SVM(non-text feats) 0.687 (+/- 0.011) 0.733 (+/- 0.008) \n", "Decision Tree(text) 0.618 (+/- 0.036) 0.735 (+/- 0.004) \n", "KNN(text) 0.572 (+/- 0.023) 0.646 (+/- 0.026) \n", "SVM(text) 0.649 (+/- 0.022) 0.728 (+/- 0.005) \n", "Decision Tree(all) 0.624 (+/- 0.022) 0.893 (+/- 0.006) \n", "KNN(all) 0.625 (+/- 0.027) 0.748 (+/- 0.015) \n", "SVM(all) 0.699 (+/- 0.017) 0.786 (+/- 0.008) " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for (name, model) in models.items():\n", " pipe_model = make_pipeline(preprocessor_all, model)\n", " results_df[name + '(all)'] = mean_std_cross_val_scores(pipe_model, X_train, y_train, return_train_score=True)\n", "pd.DataFrame(results_df).T" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "Some improvement when we combine all features! " ] } ], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "571", "language": "python", "name": "571" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 4 }