Creates a class object, similar in structure to scikit-learn's objects, for inferential purposes. Given a data frame, a response, and certain specifications, it returns a generalized regression model interface for count data (using either a Poisson or a negative binomial distribution) with fit, predict_count, and score functions, as well as attributes obtained from the statistical analysis.

arid_countreg(
  X,
  y,
  alpha = 0.05,
  fit_intercept = TRUE,
  verbose = FALSE,
  model = "additive",
  family = "poisson"
)

Arguments

X	(data_frame): the input data frame with the explanatory variables to fit the model.
y	(integer): an integer vector with the response to be fitted (only natural numbers).
alpha	(double): a double vector of length 1 indicating the significance level. (default: 0.05)
fit_intercept	(logical): if the model should include the intercept (TRUE or FALSE). (default: TRUE)
verbose	(logical): if results should include a written explanation (TRUE or FALSE). (default: FALSE)
model	(character): type of model to be fitted, either "additive" or "interactive". (default: "additive")
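family	(character): distributional family to be used in generalized linear model. If "poisson" is used and the overdispersion test is significant at level alpha, a negative binomial model is fitted instead. (default: "poisson")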

Value

An object of class "arid_countreg" with three methods (fit, predict_count, and score) and statistical attributes.

Examples

X <- as.data.frame(matrix(rnorm(40 * 3), 40, 3))
y <- sample(c(1:60), 40, replace = TRUE)
arid_countreg(X,y,0.1)
#> $Env
#> <environment: 0x000000003bf984e8>
#> 
#> $fit
#> function (X, y) 
#> {
#>     model_df <- X
#>     model_df$response <- y
#>     if (model == "additive") {
#>         columns <- stringr::str_c(colnames(X), collapse = " + ")
#>         formula <- stringr::str_c(c("response", columns), collapse = " ~ ")
#>     }
#>     else {
#>         columns <- stringr::str_c(colnames(X), collapse = " * ")
#>         formula <- stringr::str_c(c("response", columns), collapse = " ~ ")
#>     }
#>     if (fit_intercept == FALSE) {
#>         formula <- formula <- stringr::str_c(c(formula, "1"), 
#>             collapse = " - ")
#>     }
#>     count_model <- glm(formula, data = model_df, family = family)
#>     if (family == "poisson") {
#>         if (AER::dispersiontest(count_model)[[2]] < alpha) {
#>             assign("family_", "negative binomial", Env)
#>             count_model <- MASS::glm.nb(formula, data = model_df)
#>             if (verbose == TRUE) {
#>                 print("The Poisson model has overdispersion and it is underestimating the\n            variance of the model, hence the negative binomial model will be used")
#>                 print(" ")
#>             }
#>         }
#>     }
#>     if (fit_intercept == TRUE) {
#>         initial_value = 2
#>     }
#>     else {
#>         initial_value = 1
#>     }
#>     if (verbose == TRUE) {
#>         for (i in seq(initial_value, nrow(broom::tidy(count_model)))) {
#>             if (broom::tidy(count_model)$p.value[i] < alpha) {
#>                 print(" ")
#>                 print(paste("The variable", broom::tidy(count_model)$term[i], 
#>                   "has a statistically\n                      significant association over the response"))
#>             }
#>         }
#>     }
#>     set_attributes(count_model)
#>     return(count_model)
#> }
#> <bytecode: 0x000000003bf88670>
#> <environment: 0x000000003bf984e8>
#> 
#> $predict_count
#> function (model, new_X) 
#> {
#>     return(exp(predict(model, new_X)))
#> }
#> <bytecode: 0x000000003bf954f0>
#> <environment: 0x000000003bf984e8>
#> 
#> $score
#> function (model) 
#> {
#>     return(tibble::tibble(In_Sample_Metric = c("AIC", "Deviance"), 
#>         Value = c(model$aic, model$deviance)))
#> }
#> <bytecode: 0x000000003bf95170>
#> <environment: 0x000000003bf984e8>
#> 
#> $intercept_
#> [1] 29.87914
#> 
#> $coef_
#> [1] 0.9227020 0.9976305 0.9799423
#> 
#> $p_values_
#> [1] 0.4827375 0.9841780 0.8648655
#> 
#> $count_model_
#> 
#> Call:  MASS::glm.nb(formula = formula, data = model_df, init.theta = 1.852692242, 
#>     link = log)
#> 
#> Coefficients:
#> (Intercept)           V1           V2           V3  
#>    3.397161    -0.080449    -0.002372    -0.020262  
#> 
#> Degrees of Freedom: 39 Total (i.e. Null);  36 Residual
#> Null Deviance:	    44.55 
#> Residual Deviance: 44.03 	AIC: 357.4
#> 
#> $alpha_
#> [1] 0.1
#> 
#> $fit_intercept_
#> [1] TRUE
#> 
#> $type_
#> [1] "additive"
#> 
#> $family_
#> [1] "negative binomial"
#> 
#> attr(,"class")
#> [1] "arid_countreg"