#!/usr/bin/env python

"""
 Information Criterion
 =====================
"""
# NOTE: `aic()` and `bic()` have docstrings formatted
# for Sphinx's auto-documentation engine.

from numpy import log, ndarray, pi
from pypunisher._checks import model_check


def _get_coeffs(model, X_train, y_train):
    """

    Args:
        model : sklearn model object
            A fitted sklearn model.
        X_train : ndarray
            The data used to train `model`.
        y_train : 1d numpy array
            The response variable.

    Returns:
        n : int
            Number of samples
        k : int
            Number of features
        llf : float
            Maximized value of log likelihood function
    """
    model_check(model)
    n = X_train.shape[0]
    k = X_train.shape[1]
    y_pred = model.predict(X_train)
    rss = sum((y_train - y_pred) ** 2)
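    # Log-likelihood of a Gaussian model evaluated at the MLE variance (rss / n).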
    llf = -(n / 2) * log(2 * pi) - (n / 2) * log(rss / n) - n / 2
    return n, k, llf
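

# Illustrative sanity check (not called anywhere in the library): the
# closed-form `llf` above is the Gaussian log-likelihood evaluated at the
# MLE variance sigma^2 = rss / n. This sketch assumes SciPy is available.
def _llf_sanity_check():  # pragma: no cover -- illustrative only
    import numpy as np
    from scipy.stats import norm

    rng = np.random.default_rng(0)
    residuals = rng.normal(size=50)  # stand-in for y_train - model.predict(X_train)
    n = residuals.shape[0]
    rss = np.sum(residuals ** 2)

    closed_form = -(n / 2) * log(2 * pi) - (n / 2) * log(rss / n) - n / 2
    direct = norm.logpdf(residuals, loc=0.0, scale=np.sqrt(rss / n)).sum()
    assert np.isclose(closed_form, direct)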


def aic(model, X_train, y_train):
    """Compute the Akaike Information Criterion (AIC).

    AIC's objective is to prevent model overfitting by adding a penalty
    term which penalizes more complex models. Its formal definition is:

    .. math::

        -2ln(L) + 2k

    where :math:`L` is the maximized value of the likelihood function
    and :math:`k` is the number of parameters. A smaller AIC value
    suggests that the model is a better fit for the data, relative to
    competing models.

    Args:
        model (fitted sklearn model object): A fitted sklearn model.
        X_train (2d ndarray): The data used to train `model`.
        y_train (1d numpy array): The response variable.

    Returns:
        aic (float): The AIC value if the sample size is sufficient.
            If n/k < 40, where n is the number of observations and k is
            the number of features, AICc is returned instead to adjust
            for the small sample size.

    References:
        * https://en.wikipedia.org/wiki/Akaike_information_criterion
    """
    if not isinstance(X_train, ndarray):
        raise TypeError("`X_train` must be an ndarray.")
    if not isinstance(y_train, ndarray):
        raise TypeError("`y_train` must be an ndarray.")
    n, k, llf = _get_coeffs(model, X_train=X_train, y_train=y_train)
    aic = -2 * llf + 2 * k
    if n / k < 40:
        # Small-sample correction (AICc).
        return aic + 2 * k * (k + 1) / (n - k - 1)
    else:
        return aic
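

# Minimal usage sketch (illustrative only): computing AIC for a fitted
# scikit-learn linear model. The toy data and feature count are hypothetical;
# with n = 30 and k = 3, n/k < 40, so the small-sample AICc value is returned.
def _aic_usage_sketch():  # pragma: no cover -- illustrative only
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(1)
    X = rng.normal(size=(30, 3))
    y = X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.1, size=30)

    model = LinearRegression().fit(X, y)
    return aic(model, X_train=X, y_train=y)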


def bic(model, X_train, y_train):
    """Compute the Bayesian Information Criterion (BIC).

    BIC's objective is to prevent model over-fitting by adding a penalty
    term which penalizes more complex models. Its formal definition is:

    .. math::

        -2ln(L) + ln(n)k

    where :math:`L` is the maximized value of the likelihood function
    and :math:`k` is the number of parameters. A smaller BIC value
    suggests that the model is a better fit for the data, relative to
    competing models.

    Args:
        model (fitted sklearn model object): A fitted sklearn model.
        X_train (2d ndarray): The data used to train `model`.
        y_train (1d numpy array): The response variable.

    Returns:
        bic (float): The Bayesian Information Criterion value.

    References:
        * https://en.wikipedia.org/wiki/Bayesian_information_criterion
    """
    if not isinstance(X_train, ndarray):
        raise TypeError("`X_train` must be an ndarray.")
    if not isinstance(y_train, ndarray):
        raise TypeError("`y_train` must be an ndarray.")
    n, k, llf = _get_coeffs(model, X_train=X_train, y_train=y_train)
    bic = -2 * llf + log(n) * k
    return bic
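

# Minimal sketch of the intended use case (illustrative only): comparing two
# candidate feature subsets with BIC, where the lower value is preferred. The
# data, coefficients, and feature split below are hypothetical; scikit-learn
# is assumed to be installed.
if __name__ == "__main__":
    import numpy as np
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(2)
    X = rng.normal(size=(200, 5))
    # Only the first two columns actually drive the response.
    y = X[:, :2] @ np.array([2.0, -1.0]) + rng.normal(scale=0.5, size=200)

    full = LinearRegression().fit(X, y)
    reduced = LinearRegression().fit(X[:, :2], y)

    print("BIC, full model:   ", bic(full, X_train=X, y_train=y))
    print("BIC, reduced model:", bic(reduced, X_train=X[:, :2], y_train=y))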