Authors: Yuanzhe Marco Ma, Arash Shamseddini, Kaicheng Tan, Zhenrui Yu
To predict the IMDB scores (ranging from 0 to 10), we choose the best model among three scikit-learn (Pedregosa et al. 2011) candidates: `Ridge` from `sklearn.linear_model`, `SVR` with an RBF kernel from `sklearn.svm`, and `RandomForestRegressor` from `sklearn.ensemble`. The scoring metric we use is $R^2$.
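For reference, $R^2$ measures the fraction of variance in the ratings that the model explains:

$$R^{2} = 1 - \frac{\sum_{i}(y_{i} - \hat{y}_{i})^{2}}{\sum_{i}(y_{i} - \bar{y})^{2}}$$

where $y_{i}$ are the true ratings, $\hat{y}_{i}$ the predictions, and $\bar{y}$ the mean rating. A score of 1 is a perfect fit, 0 is no better than always predicting the mean, and negative values are worse than the mean.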
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
numeric_features = ['n_words']
text_feature = 'Text'
ordinal_features = ['sentiment']
drop_features = ['Id', 'Author']
target = 'Rating'
train_df = pd.read_csv("../Data/processed/train.csv")
X_train, y_train = train_df.drop(columns=[target] + drop_features), train_df[target]
preprocessor = ColumnTransformer(
    transformers=[
        ('text', CountVectorizer(max_features=20_000, max_df=0.6), text_feature),
        ('num', StandardScaler(), numeric_features),
        ('ord', OrdinalEncoder(categories=[['neg', 'compound', 'neu', 'pos']]), ordinal_features),
    ]
)
ridge_pipe = Pipeline(
    steps=[
        ("prepro", preprocessor),
        ("Ridge", Ridge()),
    ]
)
param_grid = {
    'Ridge__alpha': np.arange(800, 1200, 50)
}
hyper_parameters_search = GridSearchCV(ridge_pipe, param_grid=param_grid, n_jobs=-1, scoring='r2', verbose=1)
hyper_parameters_search.fit(X_train, y_train)
Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 40 out of 40 | elapsed: 44.9s finished
GridSearchCV(estimator=Pipeline(steps=[('prepro', ColumnTransformer(transformers=[('text', CountVectorizer(max_df=0.6, max_features=20000), 'Text'), ('num', StandardScaler(), ['n_words']), ('ord', OrdinalEncoder(categories=[['neg', 'compound', 'neu', 'pos']]), ['sentiment'])])), ('Ridge', Ridge())]), n_jobs=-1, param_grid={'Ridge__alpha': array([ 800, 850, 900, 950, 1000, 1050, 1100, 1150])}, scoring='r2', verbose=1)
results_df = {}
scores = cross_validate(
    hyper_parameters_search.best_estimator_,
    X_train,
    y_train,
    scoring='r2',
    return_train_score=True,
)
df = pd.DataFrame(scores)
results_df["Ridge"] = df.mean()
pd.DataFrame(results_df).T
| | fit_time | score_time | test_score | train_score |
|---|---|---|---|---|
| Ridge | 2.198422 | 0.511951 | 0.532790 | 0.799778 |
We tuned the `alpha` hyperparameter of `Ridge` using sklearn's `GridSearchCV`. `alpha` controls the strength of the L2 regularization and hence the complexity of our model: larger values shrink the coefficients more aggressively. We want to pick the best `alpha` so that our model predicts well while avoiding over-fitting. The tuning found `alpha=800` to give the best cross-validation score; note that this is the lower boundary of our search grid, so an even smaller `alpha` might score higher. The result is shown above.
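As a sanity check on the tuning, the per-candidate validation scores can be read off the fitted search object (a minimal sketch; it assumes the fitted `hyper_parameters_search` from the cell above is still in scope):

cv_results = pd.DataFrame(hyper_parameters_search.cv_results_)
# Mean validation R^2 for each candidate alpha, best first
print(cv_results[['param_Ridge__alpha', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False))
print(hyper_parameters_search.best_params_)  # {'Ridge__alpha': 800}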
svr_pipe = Pipeline(
    steps=[
        ("prepro", preprocessor),
        ("svr", SVR()),
    ]
)
param_grid = {
    'svr__gamma': np.arange(0.0001, 0.0015, 0.0001)
}
hyper_parameters_search = GridSearchCV(svr_pipe, param_grid=param_grid, n_jobs=-1, scoring='r2', verbose=1)
hyper_parameters_search.fit(X_train, y_train)
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 7.0min
[Parallel(n_jobs=-1)]: Done 70 out of 70 | elapsed: 12.9min finished
GridSearchCV(estimator=Pipeline(steps=[('prepro', ColumnTransformer(transformers=[('text', CountVectorizer(max_df=0.6, max_features=20000), 'Text'), ('num', StandardScaler(), ['n_words']), ('ord', OrdinalEncoder(categories=[['neg', 'compound', 'neu', 'pos']]), ['sentiment'])])), ('svr', SVR())]), n_jobs=-1, param_grid={'svr__gamma': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001 , 0.0011, 0.0012, 0.0013, 0.0014])}, scoring='r2', verbose=1)
scores = cross_validate(
    hyper_parameters_search.best_estimator_,
    X_train,
    y_train,
    scoring='r2',
    return_train_score=True,
)
df = pd.DataFrame(scores)
results_df["SVR"] = df.mean()
pd.DataFrame(results_df).T
| | fit_time | score_time | test_score | train_score |
|---|---|---|---|---|
| Ridge | 2.198422 | 0.511951 | 0.532790 | 0.799778 |
| SVR | 28.988741 | 8.086322 | 0.469965 | 0.723018 |
We tuned the `gamma` hyperparameter of `SVR` using sklearn's `GridSearchCV`. `gamma` sets the width of the RBF kernel and hence controls the complexity of our model: larger values let the model fit more local structure. We want to pick the best `gamma` so that our model predicts well while avoiding over-fitting.
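For reference, the RBF kernel computes the similarity between two feature vectors $x$ and $x'$ as

$$K(x, x') = \exp\left(-\gamma \lVert x - x' \rVert^{2}\right)$$

so larger $\gamma$ makes the similarity decay faster with distance, giving a more flexible (and more overfitting-prone) model, while smaller $\gamma$ gives a smoother, simpler one.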
Below is our tuned, best performing model based on cross-validation score.
| Model Name | Hyperparameter - Gamma | Mean Fit Time | Mean Scoring Time | Mean CV Score |
|---|---|---|---|---|
| SVR | 0.0007 | 43.80s | 11.40s | 0.4700 |
For a more detailed GridSearchCV result, see this file.
rf_pipe = Pipeline(
    steps=[
        ("prepro", preprocessor),
        ("randomforestregressor", RandomForestRegressor(random_state=26)),
    ]
)
param_grid1 = {
    'randomforestregressor__max_depth': [int(x) for x in np.linspace(20, 30, num=3)],
    'randomforestregressor__n_estimators': [1, 10, 100]
}
hyper_parameters_search = RandomizedSearchCV(rf_pipe, param_distributions=param_grid1, n_jobs=-1, scoring='r2', n_iter=1000)
hyper_parameters_search.fit(X_train, y_train)
C:\Users\yuzhe\miniconda3\envs\573\lib\site-packages\sklearn\model_selection\_search.py:278: UserWarning: The total space of parameters 9 is smaller than n_iter=1000. Running 9 iterations. For exhaustive searches, use GridSearchCV. warnings.warn(
RandomizedSearchCV(estimator=Pipeline(steps=[('prepro', ColumnTransformer(transformers=[('text', CountVectorizer(max_df=0.6, max_features=20000), 'Text'), ('num', StandardScaler(), ['n_words']), ('ord', OrdinalEncoder(categories=[['neg', 'compound', 'neu', 'pos']]), ['sentiment'])])), ('randomforestregressor', RandomForestRegressor(random_state=26))]), n_iter=1000, n_jobs=-1, param_distributions={'randomforestregressor__max_depth': [20, 25, 30], 'randomforestregressor__n_estimators': [1, 10, 100]}, scoring='r2')
hyper_parameters_search.best_params_
{'randomforestregressor__n_estimators': 100, 'randomforestregressor__max_depth': 30}
scores = cross_validate(
    hyper_parameters_search.best_estimator_,
    X_train,
    y_train,
    scoring='r2',
    return_train_score=True,
)
df = pd.DataFrame(scores)
results_df["RandomForestRegressor"] = df.mean()
pd.DataFrame(results_df).T
| | fit_time | score_time | test_score | train_score |
|---|---|---|---|---|
| Ridge | 2.198422 | 0.511951 | 0.532790 | 0.799778 |
| SVR | 28.988741 | 8.086322 | 0.469965 | 0.723018 |
| RandomForestRegressor | 287.661862 | 0.499031 | 0.359660 | 0.906179 |
We tuned two major hyperparameters of the `RandomForestRegressor`, `max_depth` and `n_estimators`, simultaneously. (With only 9 combinations in the grid and `n_iter=1000`, `RandomizedSearchCV` evaluated all of them exhaustively, as the warning above notes.) The results of hyperparameter optimization indicate that increasing both `max_depth` and `n_estimators` continuously improves the model's performance on the training set, while the validation scores barely improve (changes on the order of $10^{-4}$). According to the above results, the model overfits our data, as the gap between the train and validation scores is large.
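To make the comparison concrete, the train/validation gap of each model can be computed from the scores collected above (a minimal sketch, reusing the `results_df` dict built earlier):

summary = pd.DataFrame(results_df).T
# Gap between mean train and validation R^2; a larger gap means more overfitting
summary['gap'] = summary['train_score'] - summary['test_score']
print(summary[['test_score', 'train_score', 'gap']])

With the numbers above, the gap is roughly 0.27 for Ridge, 0.25 for SVR, and 0.55 for RandomForestRegressor.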
Clearly, Random Forest is not a suitable model for our project, due to its low validation score and severe overfitting. Between `Ridge` and `SVR`, `Ridge` has the higher validation score. Both `Ridge` and `SVR` appear to overfit, but to a similar degree. As a result, we choose `Ridge` as our best model among the three.
Pedregosa et al. (2011). "Scikit-learn: Machine Learning in Python." JMLR 12, pp. 2825–2830. https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html