#!/usr/bin/env python
"""
Information Criterion
=====================
"""
# NOTE: `aic()` and `bic()` have docstrings formatted
# for sphinx's auto-documentation engine.
from numpy import log, ndarray, pi
from pypunisher._checks import model_check


def _get_coeffs(model, X_train, y_train):
    """Compute the sample size, feature count, and log likelihood.

    Args:
        model : sklearn model object
            A fitted sklearn model.
        X_train : 2d ndarray
            The data used to train `model`.
        y_train : 1d ndarray
            The response variable.

    Returns:
        n : int
            Number of samples.
        k : int
            Number of features.
        llf : float
            Maximized value of the log likelihood function.
    """
    model_check(model)
    n = X_train.shape[0]
    k = X_train.shape[1]
    y_pred = model.predict(X_train)
    # Residual sum of squares of the fitted model on its training data.
    rss = ((y_train - y_pred) ** 2).sum()
    # Gaussian log likelihood evaluated at the MLE of the error variance,
    # sigma^2 = RSS / n.
    llf = -(n / 2) * log(2 * pi) - (n / 2) * log(rss / n) - n / 2
    return n, k, llf
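

# A cross-check sketch for the closed-form `llf` above: with the MLE error
# variance sigma^2 = RSS / n, summing per-observation Gaussian log densities
# gives the same value. The helper name and the use of scipy are assumptions
# for illustration only; scipy is not otherwise required by this module.
def _llf_cross_check(y_true, y_pred):
    """Recompute the log likelihood via scipy.stats.norm (illustrative)."""
    from numpy import sqrt
    from scipy.stats import norm

    n = y_true.shape[0]
    rss = ((y_true - y_pred) ** 2).sum()
    sigma = sqrt(rss / n)  # MLE of the error standard deviation
    return norm.logpdf(y_true, loc=y_pred, scale=sigma).sum()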


def aic(model, X_train, y_train):
    r"""Compute the Akaike Information Criterion (AIC).

    AIC's objective is to prevent model overfitting by adding a penalty
    term which penalizes more complex models. Its formal definition is:

    .. math::

        -2\ln(L) + 2k

    where :math:`L` is the maximized value of the likelihood function
    and :math:`k` is the number of parameters. A smaller AIC value
    suggests that the model is a better fit for the data, relative to
    competing models.

    Args:
        model (fitted sklearn model object): A fitted sklearn model.
        X_train (2d ndarray): The data used to train `model`.
        y_train (1d ndarray): The response variable.

    Returns:
        aic (float):
            AIC value if the sample size is sufficient. If n / k < 40,
            where n is the number of observations and k is the number of
            features, the small-sample correction AICc is returned instead.

    References:
        * https://en.wikipedia.org/wiki/Akaike_information_criterion
    """
    if not isinstance(X_train, ndarray):
        raise TypeError("`X_train` must be an ndarray.")
    if not isinstance(y_train, ndarray):
        raise TypeError("`y_train` must be an ndarray.")
    n, k, llf = _get_coeffs(model, X_train=X_train, y_train=y_train)
    aic = -2 * llf + 2 * k
    if n / k < 40:
        # Small-sample correction (AICc).
        return aic + 2 * k * (k + 1) / (n - k - 1)
    else:
        return aic
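

# A minimal usage sketch for `aic`. The estimator and toy data below are
# illustrative assumptions, not part of this module; any fitted sklearn
# regressor works.
def _aic_usage_sketch():
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(0)
    X = rng.normal(size=(30, 3))  # n = 30 samples, k = 3 features
    y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=30)
    model = LinearRegression().fit(X, y)
    # n / k = 10 < 40, so the small-sample corrected AICc is returned.
    return aic(model, X, y)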


def bic(model, X_train, y_train):
    r"""Compute the Bayesian Information Criterion (BIC).

    BIC's objective is to prevent model overfitting by adding a penalty
    term which penalizes more complex models. Its formal definition is:

    .. math::

        -2\ln(L) + \ln(n)k

    where :math:`L` is the maximized value of the likelihood function
    and :math:`k` is the number of parameters. A smaller BIC value
    suggests that the model is a better fit for the data, relative to
    competing models.

    Args:
        model (fitted sklearn model object): A fitted sklearn model.
        X_train (2d ndarray): The data used to train `model`.
        y_train (1d ndarray): The response variable.

    Returns:
        bic (float):
            Bayesian Information Criterion value.

    References:
        * https://en.wikipedia.org/wiki/Bayesian_information_criterion
    """
    if not isinstance(X_train, ndarray):
        raise TypeError("`X_train` must be an ndarray.")
    if not isinstance(y_train, ndarray):
        raise TypeError("`y_train` must be an ndarray.")
    n, k, llf = _get_coeffs(model, X_train=X_train, y_train=y_train)
    bic = -2 * llf + log(n) * k
    return bic
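

# A hedged end-to-end sketch comparing the two criteria on the same fit,
# guarded so that importing the module stays side-effect free. The estimator
# and data below are illustrative assumptions.
if __name__ == "__main__":
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 4))
    y = X @ np.array([2.0, 0.0, -1.0, 0.5]) + rng.normal(scale=0.5, size=200)
    model = LinearRegression().fit(X, y)
    # n / k = 50 >= 40, so plain AIC is returned; BIC's ln(200) * k penalty
    # (~21.2) is heavier than AIC's 2 * k (= 8).
    print("AIC:", aic(model, X, y))
    print("BIC:", bic(model, X, y))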