Make - Pipelines & Column Transformers

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
train_df, test_df = train_test_split(
    adult,
    test_size=0.2,
    random_state=42,
)
train_df.head()
age workclass fnlwgt education ... capital.loss hours.per.week native.country income
5514 26 Private 256263 HS-grad ... 0 25 United-States <=50K
19777 24 Private 170277 HS-grad ... 0 35 United-States <=50K
10781 36 Private 75826 Bachelors ... 0 40 United-States <=50K
32240 22 State-gov 24395 Some-college ... 0 20 United-States <=50K
9876 31 Local-gov 356689 Bachelors ... 0 40 United-States <=50K

5 rows × 15 columns

# Separate the predictors from the `income` target, for both splits.
X_train, y_train = train_df.drop(columns=['income']), train_df['income']
X_test, y_test = test_df.drop(columns=['income']), test_df['income']
# Columns routed to the numeric transformer.
numeric_features = [
    "age", "fnlwgt", "education.num",
    "capital.gain", "capital.loss", "hours.per.week",
]

# Columns routed to the categorical transformer.
categorical_features = [
    "workclass", "education", "marital.status", "occupation",
    "relationship", "sex", "native.country",
]
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: replace missing values with the literal "missing",
# then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder()),
    ]
)

# Send each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by a support vector classifier.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("clf", SVC()),
    ]
)

make_pipeline syntax

# Explicit Pipeline syntax: every step is a hand-named (name, estimator) pair.
model_pipeline = Pipeline(
    steps=[
        ("scaling", StandardScaler()),
        ("clf", SVC()),
    ]
)


# make_pipeline has to be imported before it is called. It names each step
# automatically after the lowercased estimator class, so no (name, estimator)
# tuples are needed.
from sklearn.pipeline import make_pipeline

model_pipeline = make_pipeline(StandardScaler(), SVC())

model_pipeline
Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])
from sklearn.pipeline import make_pipeline

# Same preprocessing as with the explicit Pipeline syntax, but the step
# names are generated by make_pipeline instead of being written out.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(),
)

# ColumnTransformer still needs a name for each (transformer, columns) entry.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

pipe = make_pipeline(
    preprocessor,
    SVC(),
)

make_column_transformer syntax

from sklearn.compose import make_column_transformer


So instead of this:

# Explicit ColumnTransformer syntax: each entry is (name, transformer, columns).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


We can do this:

# make_column_transformer takes (transformer, columns) pairs and generates
# the entry names itself.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

So the whole preprocessing-and-model pipeline becomes:

# Numeric columns: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: constant "missing" imputation followed by one-hot
# encoding with the encoder's default settings.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(),
)

# Route each column group to its transformer.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Preprocessing chained with the classifier.
pipe = make_pipeline(
    preprocessor,
    SVC(),
)
# cross_validate has to be imported before its first use; without this the
# cell stops with a NameError instead of the encoder error it is meant to show.
from sklearn.model_selection import cross_validate

# Expected to fail with this pipeline: scoring raises a ValueError when a
# validation fold contains a category the encoder did not see while fitting.
scores = cross_validate(pipe, X_train, y_train, cv=5, return_train_score=True)
ValueError: Found unknown categories ['Holand-Netherlands'] in column 6 during transform

Detailed traceback: 
  File "<string>", line 1, in <module>
  File "/usr/local/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
ValueError: Found unknown categories ['Holand-Netherlands'] in column 6 during transform


# Show the five least frequent countries in the training set.
country_counts = X_train["native.country"].value_counts()
country_counts.tail(5)
native.country
Trinadad&Tobago       12
Hungary               11
Scotland              10
Honduras               7
Holand-Netherlands     1
Name: count, dtype: int64

How do we fix it?

# Numeric columns: median imputation followed by standardization.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# handle_unknown="ignore" is the fix: a category not seen during fit no
# longer raises at transform time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column group to its transformer.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

# Preprocessing chained with the classifier.
pipe = make_pipeline(
    preprocessor,
    SVC(),
)


from sklearn.model_selection import cross_validate

# Run 5-fold cross-validation, keeping train scores as well, then average
# every timing and score column over the folds.
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    return_train_score=True,
)
pd.DataFrame(scores).mean()
fit_time       9.053475
score_time     1.455909
test_score     0.855459
train_score    0.867974
dtype: float64

Cases where it’s OK to break the golden rule

  • If the feature has a fixed, known set of categories (e.g. the list of countries), it is OK to give the encoder all of them up front.
# Collect every distinct country label from the full dataset (train and
# test rows alike), not just from the training split.
all_countries = adult.loc[:, "native.country"].unique()
all_countries
array(['United-States', nan, 'Mexico', 'Greece', 'Vietnam', 'China', 'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada', 'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran', 'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba', 'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic', 'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
       'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland', 'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'], dtype=object)


# `categories` expects a list holding one array of categories per encoded
# column, so the country labels are wrapped in a list rather than passed as
# a flat array. NaN is filtered out because it is not a country label and
# OneHotEncoder rejects user-supplied categories with NaN anywhere but last.
# NOTE(review): if missing values are imputed with a constant such as
# "missing" before this encoder, add that value to the list or set
# handle_unknown="ignore".
known_countries = [country for country in all_countries if isinstance(country, str)]
ohe_cat = OneHotEncoder(categories=[known_countries])

Let’s apply what we learned!