Introduction to Machine Learning – Make - Pipelines & Column Transformers

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train_df, test_df = train_test_split(
    adult,
    test_size=0.2,
    random_state=42,
)
train_df.head()

	age	workclass	fnlwgt	education	...	hours.per.week	native.country	income
5514	26	Private	256263	HS-grad	...	25	United-States	<=50K
19777	24	Private	170277	HS-grad	...	35	United-States	<=50K
10781	36	Private	75826	Bachelors	...	40	United-States	<=50K
32240	22	State-gov	24395	Some-college	...	20	United-States	<=50K
9876	31	Local-gov	356689	Bachelors	...	40	United-States	<=50K

5 rows × 15 columns

# Separate the feature columns from the `income` target for each split.
X_train, y_train = train_df.drop(columns=["income"]), train_df["income"]
X_test, y_test = test_df.drop(columns=["income"]), test_df["income"]

# Names of the columns routed to the numeric preprocessing branch.
numeric_features = [
    "age", "fnlwgt", "education.num",
    "capital.gain", "capital.loss", "hours.per.week",
]

# Names of the columns routed to the categorical preprocessing branch.
categorical_features = [
    "workclass", "education", "marital.status", "occupation",
    "relationship", "sex", "native.country",
]

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the constant "missing",
# then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder()),
    ]
)

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by the SVC classifier.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", SVC()),
    ]
)

make_pipeline syntax

# Explicit form: every step is paired with a hand-written name.
model_pipeline = Pipeline(
    steps=[
        ("scaling", StandardScaler()),
        ("clf", SVC()),
    ]
)

# make_pipeline has to be imported before its first use, otherwise this cell
# fails with a NameError.
from sklearn.pipeline import make_pipeline

# Shorthand form: only the estimators are passed; the step names are generated
# automatically (see the 'standardscaler' / 'svc' names in the repr below).
model_pipeline = make_pipeline(StandardScaler(), SVC())

model_pipeline

Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])

from sklearn.pipeline import make_pipeline

# Numeric branch: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: constant-value imputation followed by one-hot encoding.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(),
)

# The column transformer is still written in its explicit, named form here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Preprocessing followed by the classifier, with auto-generated step names.
pipe = make_pipeline(preprocessor, SVC())

make_column_transformer syntax

from sklearn.compose import make_column_transformer

So instead of this:

# Explicit form: each entry is a (name, transformer, columns) triple.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

We can do this:

# Shorthand form: each entry is just a (transformer, columns) pair.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

So our whole preprocessing-and-model pipeline becomes:

# Numeric branch: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: constant-value imputation followed by one-hot encoding.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(),
)

# Route each column group to its branch.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Preprocessing followed by the classifier.
pipe = make_pipeline(preprocessor, SVC())

# cross_validate has to be imported before its first use, otherwise this cell
# fails with a NameError instead of the error being demonstrated.
from sklearn.model_selection import cross_validate

# NOTE: with the one-hot encoder configured as above, this call raises the
# ValueError shown below: a category that appears in a validation fold was
# never seen when the encoder was fitted on the corresponding training folds.
scores = cross_validate(pipe, X_train, y_train, cv=5, return_train_score=True)

ValueError: Found unknown categories ['Holand-Netherlands'] in column 6 during transform

Detailed traceback: 
  File "<string>", line 1, in <module>
  File "/usr/local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)

ValueError: Found unknown categories ['Holand-Netherlands'] in column 6 during transform

# Show the five least frequent countries in the training split.
(
    X_train["native.country"]
    .value_counts()
    .tail(5)
)

native.country
Trinadad&Tobago       12
Hungary               11
Scotland              10
Honduras               7
Holand-Netherlands     1
Name: count, dtype: int64

How do we fix it?

# Numeric branch: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical branch: handle_unknown="ignore" is the change that stops the
# encoder from raising on categories it did not see during fit.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column group to its branch.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Preprocessing followed by the classifier.
pipe = make_pipeline(preprocessor, SVC())

from sklearn.model_selection import cross_validate

# 5-fold cross-validation on the training split, also recording the scores
# obtained on the training folds.
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
# Average every reported quantity across the folds.
pd.DataFrame(scores).mean()

fit_time       9.053475
score_time     1.455909
test_score     0.855459
train_score    0.867974
dtype: float64

Cases where it’s OK to break the golden rule

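When a feature has a fixed, known set of categories (for example, the list of countries), it is acceptable to tell the encoder about every category up front.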

# Collect every distinct native.country value from the full `adult` dataset
# (the frame that was split into train and test earlier), not just from the
# training split.
all_countries = adult["native.country"].unique()
# Display the collected values.
all_countries

array(['United-States', nan, 'Mexico', 'Greece', 'Vietnam', 'China', 'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada', 'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran', 'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba', 'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic', 'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
       'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland', 'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'], dtype=object)

# `categories` expects one array of categories per encoded column, so the
# known country values are wrapped in an outer list (a single column here).
ohe_cat = OneHotEncoder(categories=[all_countries])

Make - Pipelines & Column Transformers

make_pipeline syntax

make_column_transformer syntax

How do we fix it?

Cases where it’s OK to break the golden rule

Let’s apply what we learned!