VariableSelection Vignette

1 Performing variable selection

1.1 Metrics

1.2 Stepwise methods

1.2.1 Forward selection example

# Loading BranchGLM package
library(BranchGLM)

# Fitting gamma regression model
cars <- mtcars

# Fitting gamma regression with inverse link
GammaFit <- BranchGLM(mpg ~ ., data = cars, family = "gamma", link = "inverse")

# Forward selection with mtcars
forwardVS <- VariableSelection(GammaFit, type = "forward")
forwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using forward selection with AIC
#> Found the top model with AIC = 142.2
#> Number of models fit: 28
#> Variables that were kept in each model:  (Intercept)
#> Order the variables were added to the model:
#> 
#> 1). wt
#> 2). hp

## Getting final coefficients
coef(forwardVS, which = 1)
#>                   Model1
#> (Intercept) 8.922600e-03
#> cyl         0.000000e+00
#> disp        0.000000e+00
#> hp          8.887336e-05
#> drat        0.000000e+00
#> wt          9.826436e-03
#> qsec        0.000000e+00
#> vs          0.000000e+00
#> am          0.000000e+00
#> gear        0.000000e+00
#> carb        0.000000e+00

1.2.2 Backward elimination example

# Backward elimination with mtcars
backwardVS <- VariableSelection(GammaFit, type = "backward")
backwardVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using backward elimination with AIC
#> Found the top model with AIC = 141.9
#> Number of models fit: 50
#> Variables that were kept in each model:  (Intercept)
#> Order the variables were removed from the model:
#> 
#> 1). vs
#> 2). drat
#> 3). am
#> 4). disp
#> 5). carb
#> 6). cyl

## Getting final coefficients
coef(backwardVS, which = 1)
#>                    Model1
#> (Intercept)  4.691154e-02
#> cyl          0.000000e+00
#> disp         0.000000e+00
#> hp           6.284129e-05
#> drat         0.000000e+00
#> wt           9.484597e-03
#> qsec        -1.298959e-03
#> vs           0.000000e+00
#> am           0.000000e+00
#> gear        -2.662008e-03
#> carb         0.000000e+00

1.3 Branch and bound

1.3.1 Branch and bound example

  • If showprogress is true, then progress of the branch and bound algorithm will be reported occasionally.
  • Parallel computation can be used with these methods and can lead to very large speedups.
# Branch and bound with mtcars
VS <- VariableSelection(GammaFit, type = "branch and bound", showprogress = FALSE)
VS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 1 model within 0 AIC of the best AIC(141.9)
#> Number of models fit: 50
#> Variables that were kept in each model:  (Intercept)

## Getting final coefficients
coef(VS, which = 1)
#>                    Model1
#> (Intercept)  4.691154e-02
#> cyl          0.000000e+00
#> disp         0.000000e+00
#> hp           6.284129e-05
#> drat         0.000000e+00
#> wt           9.484597e-03
#> qsec        -1.298959e-03
#> vs           0.000000e+00
#> am           0.000000e+00
#> gear        -2.662008e-03
#> carb         0.000000e+00
  • A formula with the data and the necessary BranchGLM fitting information can also be used instead of supplying a BranchGLM object.
# Can also use a formula and data
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma", 
                               link = "inverse", type = "branch and bound",
                               showprogress = FALSE, metric = "AIC")
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 1 model within 0 AIC of the best AIC(141.9)
#> Number of models fit: 50
#> Variables that were kept in each model:  (Intercept)

## Getting final coefficients
coef(formulaVS, which = 1)
#>                    Model1
#> (Intercept)  4.691154e-02
#> cyl          0.000000e+00
#> disp         0.000000e+00
#> hp           6.284129e-05
#> drat         0.000000e+00
#> wt           9.484597e-03
#> qsec        -1.298959e-03
#> vs           0.000000e+00
#> am           0.000000e+00
#> gear        -2.662008e-03
#> carb         0.000000e+00

1.3.2 Using bestmodels

  • The bestmodels argument can be used to find the top k models according to the metric.
# Finding top 10 models
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma", 
                               link = "inverse", type = "branch and bound",
                               showprogress = FALSE, metric = "AIC", 
                               bestmodels = 10)
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (141.9, 143.59)
#> Number of models fit: 98
#> Variables that were kept in each model:  (Intercept)

## Plotting results
plot(formulaVS, type = "b")


## Getting all coefficients
coef(formulaVS, which = "all")
#>                    Model1       Model2        Model3        Model4
#> (Intercept)  4.691154e-02 8.922600e-03  2.896032e-02  1.972585e-02
#> cyl          0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#> disp         0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#> hp           6.284129e-05 8.887336e-05  5.590216e-05  9.987066e-05
#> drat         0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#> wt           9.484597e-03 9.826436e-03  1.105126e-02  8.373894e-03
#> qsec        -1.298959e-03 0.000000e+00 -1.071038e-03  0.000000e+00
#> vs           0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#> am           0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#> gear        -2.662008e-03 0.000000e+00  0.000000e+00 -2.092484e-03
#> carb         0.000000e+00 0.000000e+00  0.000000e+00  0.000000e+00
#>                    Model5       Model6        Model7       Model8        Model9
#> (Intercept)  6.463732e-02 7.472237e-03  4.763986e-02  0.048676830  2.578977e-02
#> cyl         -1.412185e-03 1.087312e-03  0.000000e+00  0.000000000  0.000000e+00
#> disp         0.000000e+00 0.000000e+00  0.000000e+00  0.000000000  0.000000e+00
#> hp           7.523221e-05 7.196928e-05  5.802906e-05  0.000000000  8.564089e-05
#> drat         0.000000e+00 0.000000e+00  0.000000e+00  0.000000000  0.000000e+00
#> wt           1.037319e-02 8.958479e-03  8.858489e-03  0.013362765  7.595784e-03
#> qsec        -1.815606e-03 0.000000e+00 -1.147336e-03 -0.002136415  0.000000e+00
#> vs           0.000000e+00 0.000000e+00  0.000000e+00  0.000000000  0.000000e+00
#> am           0.000000e+00 0.000000e+00  0.000000e+00  0.000000000  0.000000e+00
#> gear        -3.860851e-03 0.000000e+00 -3.408010e-03  0.000000000 -3.358384e-03
#> carb         0.000000e+00 0.000000e+00  7.249656e-04  0.000000000  1.140652e-03
#>                   Model10
#> (Intercept)  4.213154e-02
#> cyl          0.000000e+00
#> disp         0.000000e+00
#> hp           6.607566e-05
#> drat         1.326266e-03
#> wt           9.761600e-03
#> qsec        -1.298089e-03
#> vs           0.000000e+00
#> am           0.000000e+00
#> gear        -3.029601e-03
#> carb         0.000000e+00

1.3.3 Using cutoff

  • The cutoff argument can be used to find all models that have a metric value that is within cutoff of the minimum metric value found.
# Finding all models with an AIC within 2 of the best model
formulaVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma", 
                               link = "inverse", type = "branch and bound",
                               showprogress = FALSE, metric = "AIC", 
                               cutoff = 2)
formulaVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> Found 16 models within 2 AIC of the best AIC(141.9)
#> Number of models fit: 93
#> Variables that were kept in each model:  (Intercept)

## Plotting results
plot(formulaVS, type = "b")

1.4 Using keep

# Example of using keep
keepVS <- VariableSelection(mpg ~ . ,data = cars, family = "gamma", 
                               link = "inverse", type = "branch and bound",
                               keep = c("hp", "cyl"), metric = "AIC",
                               showprogress = FALSE, bestmodels = 10)
keepVS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (143.17, 145.24)
#> Number of models fit: 45
#> Variables that were kept in each model:  (Intercept), hp, cyl

## Getting summary and plotting results
plot(keepVS, type = "b")


## Getting coefficients for top 10 models
coef(keepVS, which = "all")
#>                    Model1       Model2        Model3       Model4        Model5
#> (Intercept)  6.463732e-02 7.472237e-03  0.0256340835  0.068289331  1.716490e-02
#> cyl         -1.412185e-03 1.087312e-03  0.0004708918 -0.001632036  5.216391e-04
#> disp         0.000000e+00 0.000000e+00  0.0000000000  0.000000000  0.000000e+00
#> hp           7.523221e-05 7.196928e-05  0.0000530224  0.000071603  8.984711e-05
#> drat         0.000000e+00 0.000000e+00  0.0000000000  0.000000000  0.000000e+00
#> wt           1.037319e-02 8.958479e-03  0.0105114095  0.009728921  8.209926e-03
#> qsec        -1.815606e-03 0.000000e+00 -0.0009269894 -0.001707464  0.000000e+00
#> vs           0.000000e+00 0.000000e+00  0.0000000000  0.000000000  0.000000e+00
#> am           0.000000e+00 0.000000e+00  0.0000000000  0.000000000  0.000000e+00
#> gear        -3.860851e-03 0.000000e+00  0.0000000000 -0.004973355 -1.732000e-03
#> carb         0.000000e+00 0.000000e+00  0.0000000000  0.000886622  0.000000e+00
#>                    Model6        Model7        Model8        Model9
#> (Intercept)  5.958530e-02  6.575356e-02  6.604896e-02  0.0648442871
#> cyl         -1.320709e-03 -1.277298e-03 -1.444133e-03 -0.0013699334
#> disp         0.000000e+00 -8.059797e-06  0.000000e+00  0.0000000000
#> hp           7.707172e-05  7.872558e-05  7.501653e-05  0.0000745678
#> drat         1.089999e-03  0.000000e+00  0.000000e+00  0.0000000000
#> wt           1.054618e-02  1.074571e-02  1.025472e-02  0.0104217585
#> qsec        -1.782401e-03 -1.870329e-03 -1.867512e-03 -0.0018571564
#> vs           0.000000e+00  0.000000e+00  0.000000e+00  0.0002736861
#> am           0.000000e+00  0.000000e+00 -4.739251e-04  0.0000000000
#> gear        -4.088866e-03 -4.086160e-03 -3.775475e-03 -0.0038351284
#> carb         0.000000e+00  0.000000e+00  0.000000e+00  0.0000000000
#>                   Model10
#> (Intercept)  0.0092037639
#> cyl          0.0008526030
#> disp         0.0000000000
#> hp           0.0000703095
#> drat         0.0000000000
#> wt           0.0090861507
#> qsec         0.0000000000
#> vs          -0.0010182533
#> am           0.0000000000
#> gear         0.0000000000
#> carb         0.0000000000

1.5 Categorical variables

# Variable selection with grouped beta parameters for species
Data <- iris
VS <- VariableSelection(Sepal.Length ~ ., data = Data, family = "gaussian", 
                           link = "identity", metric = "AIC", bestmodels = 10, 
                           showprogress = FALSE)
VS
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using switch branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (79.12, 183.94)
#> Number of models fit: 16
#> Variables that were kept in each model:  (Intercept)

## Plotting results
plot(VS, cex.names = 0.75, type = "b")

# Treating categorical variable beta parameters separately
## This function automatically groups together parameters from a categorical variable
## to avoid this, you need to create the indicator variables yourself
x <- model.matrix(Sepal.Length ~ ., data = iris)
Sepal.Length <- iris$Sepal.Length
Data <- cbind.data.frame(Sepal.Length, x[, -1])
VSCat <- VariableSelection(Sepal.Length ~ ., data = Data, family = "gaussian", 
                           link = "identity", metric = "AIC", bestmodels = 10, 
                           showprogress = FALSE)
VSCat
#> Variable Selection Info:
#> ------------------------
#> Variables were selected using switch branch and bound selection with AIC
#> The range of AIC values for the top 10 models is (79.12, 108.23)
#> Number of models fit: 21
#> Variables that were kept in each model:  (Intercept)

## Plotting results
plot(VSCat, cex.names = 0.75, type = "b")

1.6 Convergence issues