This notebook illustrates how to implement regularization methods using R.
# install packages from CRAN
p_needed <- c('caret', 'elasticnet', 'glmnet', 'devtools',
'MASS', 'grplasso')
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
install.packages(p_to_install)
}
lapply(p_needed, require, character.only = TRUE)
# install packages from GitHub
p_needed_gh <- c('NetlifyDS')
if (! p_needed_gh %in% packages) {
devtools::install_github("netlify/NetlifyDS")
}
library(NetlifyDS)
dat <- read.csv("http://bit.ly/2P5gTw4")
# data cleaning: delete wrong observations
# expense can't be negative
dat <- subset(dat, store_exp > 0 & online_exp > 0)
# get predictors
trainx <- dat[ , grep("Q", names(dat))]
# get response
trainy <- dat$store_exp + dat$online_exp
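Before tuning any model, it is worth a quick check that the cleaned data have the expected shape (a small sketch; the printed dimensions should match the sample counts reported by caret below).
# number of observations and predictors, and a summary of the response
dim(trainx)
summary(trainy)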
In ridge regression, there is a tuning parameter \(\lambda\) that needs to be set to achieve the right level of regularization. Its value can be determined by looking at performance on the cross-validation data.
# set cross validation
ctrl <- trainControl(method = "cv", number = 10)
# set the parameter range
ridgeGrid <- data.frame(.lambda = seq(0, .1, length = 20))
set.seed(100)
ridgeRegTune <- train(trainx, trainy,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl,
## center and scale predictors
preProc = c("center", "scale"))
ridgeRegTune
## Ridge Regression
##
## 999 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 899, 899, 899, 899, 900, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.000000000 1620.980 0.8271792 756.8867
## 0.005263158 1620.079 0.8273987 757.8206
## 0.010526316 1619.773 0.8275476 758.9396
## 0.015789474 1619.901 0.8276512 760.1890
## 0.021052632 1620.366 0.8277243 761.5209
## 0.026315789 1621.107 0.8277760 763.1556
## 0.031578947 1622.083 0.8278122 764.8858
## 0.036842105 1623.267 0.8278370 766.7312
## 0.042105263 1624.639 0.8278529 768.6860
## 0.047368421 1626.185 0.8278620 770.7971
## 0.052631579 1627.892 0.8278657 772.9889
## 0.057894737 1629.753 0.8278649 775.2529
## 0.063157895 1631.760 0.8278605 777.6964
## 0.068421053 1633.908 0.8278530 780.2181
## 0.073684211 1636.190 0.8278430 782.8435
## 0.078947368 1638.604 0.8278307 785.5305
## 0.084210526 1641.144 0.8278165 788.2773
## 0.089473684 1643.808 0.8278006 791.2583
## 0.094736842 1646.592 0.8277832 794.3876
## 0.100000000 1649.494 0.8277644 797.5885
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.01052632.
plot(ridgeRegTune)
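Rather than reading the optimal \(\lambda\) off the printed table, we can also extract it directly from the fitted caret object (a quick sketch using ridgeRegTune from above).
# cross-validation-selected tuning parameter
ridgeRegTune$bestTune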
Once we have determined the value of the tuning parameter \(\lambda\), we can fit the ridge regression model in several ways, for example with the enet() function from the elasticnet package.
ridgefit = enet(x = as.matrix(trainx), y = trainy, lambda = 0.01,
# center and scale predictors
normalize = TRUE)
Once the model is fit, we can use the predict() function to predict on a new dataset. Setting mode = "fraction" and s = 1 requests the full solution (no additional L1 shrinkage), so the result is the ridge fit with \(\lambda = 0.01\).
ridgePred <- predict(ridgefit, newx = as.matrix(trainx),
s = 1, mode = "fraction", type = "fit")
names(ridgePred)
## [1] "s" "fraction" "mode" "fit"
head(ridgePred$fit)
## 1 2 3 4 5 6
## 1290.4697 224.1595 591.4406 1220.6384 853.3572 908.2040
ridgeCoef <- predict(ridgefit, newx = as.matrix(trainx),
                     s = 1, mode = "fraction", type = "coefficients")
ridgeCoef$coefficients
## Q1 Q2 Q3 Q4 Q5 Q6
## -316.763403 684.044587 649.176319 246.932116 576.078371 692.149705
## Q7 Q8 Q9 Q10
## 6.181062 -116.500992 382.265663 -314.739623
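As a quick sanity check, we can compare the ridge predictions with the observed response on the training set (a minimal sketch; RMSE() and R2() are helper functions from the caret package). Because these are in-sample metrics they are optimistic, and the cross-validated results above remain the better guide.
# in-sample fit statistics for the ridge model
RMSE(ridgePred$fit, trainy)
R2(ridgePred$fit, trainy)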
The LASSO method shrinks some coefficient estimates exactly to zero, so it also serves as a variable selection tool. In LASSO there is likewise a tuning parameter, the fraction, which can be determined by performance on the cross-validation data.
ctrl <- trainControl(method = "cv", number = 10)
lassoGrid <- data.frame(fraction = seq(.8, 1, length = 20))
set.seed(100)
lassoTune <- train(trainx, trainy,
## set the method to be lasso
method = "lars",
tuneGrid = lassoGrid,
trControl = ctrl,
## standardize the predictors
preProc = c("center", "scale"))
lassoTune
## Least Angle Regression
##
## 999 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 899, 899, 899, 899, 900, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.8000000 1633.994 0.8250185 788.8895
## 0.8105263 1631.956 0.8253421 785.7150
## 0.8210526 1630.196 0.8256225 782.6543
## 0.8315789 1628.441 0.8258906 779.6467
## 0.8421053 1626.539 0.8261791 776.6017
## 0.8526316 1624.745 0.8264526 773.6821
## 0.8631579 1623.107 0.8267066 771.0365
## 0.8736842 1621.643 0.8269375 768.4821
## 0.8842105 1620.390 0.8271378 766.0623
## 0.8947368 1619.349 0.8273079 763.8790
## 0.9052632 1618.520 0.8274481 761.9152
## 0.9157895 1617.903 0.8275585 760.2604
## 0.9263158 1617.516 0.8276338 758.9376
## 0.9368421 1617.382 0.8276544 757.8210
## 0.9473684 1617.452 0.8276476 757.0097
## 0.9578947 1617.802 0.8276001 756.4950
## 0.9684211 1618.367 0.8275239 756.1559
## 0.9789474 1619.138 0.8274210 756.0236
## 0.9894737 1620.048 0.8273066 756.3794
## 1.0000000 1620.980 0.8271792 756.8867
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9368421.
plot(lassoTune)
Once the tuning parameter fraction is set, we can use different functions to fit the LASSO model, such as lars() in the lars package, enet() in the elasticnet package, or glmnet() in the glmnet package.
lassoModel <- enet(x = as.matrix(trainx), y = trainy,
lambda = 0, normalize = TRUE)
lassoFit <- predict(lassoModel, newx = as.matrix(trainx),
s = 0.957, mode = "fraction", type = "fit")
head(lassoFit$fit)
## 1 2 3 4 5 6
## 1357.2815 300.5298 690.2390 1228.1558 838.4467 1010.0713
lassoCoef <- predict(lassoModel,
newx = as.matrix(trainx),
s = 0.95, mode = "fraction",
type = "coefficients")
lassoCoef$coefficients
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8
## -326.7759 720.2801 722.7518 180.7107 542.1603 603.0865 0.0000 -75.9610
## Q9 Q10
## 342.6375 -281.7818
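As mentioned above, glmnet() can fit the same type of model; note that it parameterizes the penalty by \(\lambda\) rather than by the fraction of the full solution, so the amount of shrinkage is specified differently (a minimal sketch with an arbitrary, illustrative penalty value).
# LASSO via glmnet: alpha = 1 gives the pure L1 penalty
lasso_glmnet <- glmnet(as.matrix(trainx), trainy, alpha = 1)
# coefficients at an illustrative penalty value, e.g. lambda = 100
coef(lasso_glmnet, s = 100)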
Elastic net is a generalization of LASSO and ridge regression: it combines the two penalties. Accordingly, it has two tuning parameters, lambda and fraction.
enetGrid <- expand.grid(.lambda = seq(0,0.2,length=20),
.fraction = seq(.8, 1, length = 20))
set.seed(100)
enetTune <- train(trainx, trainy,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl,
preProc = c("center", "scale"))
enetTune
## Elasticnet
##
## 999 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 899, 899, 899, 899, 900, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00000000 0.8000000 1633.994 0.8250185 788.8895
## 0.00000000 0.8105263 1631.956 0.8253421 785.7150
## 0.00000000 0.8210526 1630.196 0.8256225 782.6543
## 0.00000000 0.8315789 1628.441 0.8258906 779.6467
## 0.00000000 0.8421053 1626.539 0.8261791 776.6017
## 0.00000000 0.8526316 1624.745 0.8264526 773.6821
## 0.00000000 0.8631579 1623.107 0.8267066 771.0365
## 0.00000000 0.8736842 1621.643 0.8269375 768.4821
## 0.00000000 0.8842105 1620.390 0.8271378 766.0623
## 0.00000000 0.8947368 1619.349 0.8273079 763.8790
## 0.00000000 0.9052632 1618.520 0.8274481 761.9152
## 0.00000000 0.9157895 1617.903 0.8275585 760.2604
## 0.00000000 0.9263158 1617.516 0.8276338 758.9376
## 0.00000000 0.9368421 1617.382 0.8276544 757.8210
## 0.00000000 0.9473684 1617.452 0.8276476 757.0097
## 0.00000000 0.9578947 1617.802 0.8276001 756.4950
## 0.00000000 0.9684211 1618.367 0.8275239 756.1559
## 0.00000000 0.9789474 1619.138 0.8274210 756.0236
## 0.00000000 0.9894737 1620.048 0.8273066 756.3794
## 0.00000000 1.0000000 1620.980 0.8271792 756.8867
## 0.01052632 0.8000000 1635.826 0.8246536 792.3834
## 0.01052632 0.8105263 1633.554 0.8250088 789.1700
## 0.01052632 0.8210526 1631.502 0.8253351 785.9805
## 0.01052632 0.8315789 1629.488 0.8256497 782.7859
## 0.01052632 0.8421053 1627.400 0.8259707 779.6851
## 0.01052632 0.8526316 1625.519 0.8262635 776.6770
## 0.01052632 0.8631579 1623.789 0.8265383 773.9974
## 0.01052632 0.8736842 1622.231 0.8267914 771.4169
## 0.01052632 0.8842105 1620.827 0.8270245 768.9600
## 0.01052632 0.8947368 1619.623 0.8272307 766.6214
## 0.01052632 0.9052632 1618.627 0.8274092 764.5374
## 0.01052632 0.9157895 1617.839 0.8275603 762.6204
## 0.01052632 0.9263158 1617.259 0.8276842 761.0392
## 0.01052632 0.9368421 1616.888 0.8277814 759.8925
## 0.01052632 0.9473684 1616.838 0.8278132 759.0871
## 0.01052632 0.9578947 1617.006 0.8278170 758.5339
## 0.01052632 0.9684211 1617.445 0.8277844 758.1862
## 0.01052632 0.9789474 1618.087 0.8277273 758.1863
## 0.01052632 0.9894737 1618.885 0.8276494 758.4365
## 0.01052632 1.0000000 1619.773 0.8275476 758.9396
## 0.02105263 0.8000000 1636.749 0.8244436 794.3219
## 0.02105263 0.8105263 1634.399 0.8248178 791.0998
## 0.02105263 0.8210526 1632.324 0.8251543 787.9501
## 0.02105263 0.8315789 1630.101 0.8254999 784.7358
## 0.02105263 0.8421053 1627.975 0.8258362 781.5357
## 0.02105263 0.8526316 1626.056 0.8261444 778.6445
## 0.02105263 0.8631579 1624.264 0.8264405 775.8744
## 0.02105263 0.8736842 1622.668 0.8267113 773.2656
## 0.02105263 0.8842105 1621.279 0.8269554 770.8272
## 0.02105263 0.8947368 1620.099 0.8271730 768.6565
## 0.02105263 0.9052632 1619.127 0.8273644 766.5949
## 0.02105263 0.9157895 1618.365 0.8275300 764.9173
## 0.02105263 0.9263158 1617.813 0.8276701 763.5334
## 0.02105263 0.9368421 1617.470 0.8277849 762.2718
## 0.02105263 0.9473684 1617.360 0.8278664 761.4052
## 0.02105263 0.9578947 1617.537 0.8278957 760.8778
## 0.02105263 0.9684211 1617.974 0.8278925 760.6887
## 0.02105263 0.9789474 1618.638 0.8278626 760.7326
## 0.02105263 0.9894737 1619.458 0.8278079 761.0056
## 0.02105263 1.0000000 1620.366 0.8277243 761.5209
## 0.03157895 0.8000000 1637.268 0.8243143 795.3702
## 0.03157895 0.8105263 1634.805 0.8247123 792.1327
## 0.03157895 0.8210526 1632.613 0.8250724 789.0308
## 0.03157895 0.8315789 1630.397 0.8254248 785.9244
## 0.03157895 0.8421053 1628.345 0.8257606 782.8069
## 0.03157895 0.8526316 1626.479 0.8260740 779.9636
## 0.03157895 0.8631579 1624.758 0.8263731 777.2614
## 0.03157895 0.8736842 1623.248 0.8266463 774.7894
## 0.03157895 0.8842105 1621.949 0.8268939 772.5342
## 0.03157895 0.8947368 1620.862 0.8271163 770.4573
## 0.03157895 0.9052632 1619.988 0.8273137 768.7255
## 0.03157895 0.9157895 1619.327 0.8274865 767.1724
## 0.03157895 0.9263158 1618.879 0.8276351 765.8840
## 0.03157895 0.9368421 1618.644 0.8277598 764.7367
## 0.03157895 0.9473684 1618.623 0.8278610 764.0250
## 0.03157895 0.9578947 1618.878 0.8279155 763.6802
## 0.03157895 0.9684211 1619.400 0.8279324 763.5917
## 0.03157895 0.9789474 1620.176 0.8279200 763.7311
## 0.03157895 0.9894737 1621.106 0.8278819 764.1907
## 0.03157895 1.0000000 1622.083 0.8278122 764.8858
## 0.04210526 0.8000000 1637.336 0.8242673 795.7676
## 0.04210526 0.8105263 1634.988 0.8246580 792.6866
## 0.04210526 0.8210526 1632.865 0.8250148 789.7883
## 0.04210526 0.8315789 1630.709 0.8253703 786.7658
## 0.04210526 0.8421053 1628.769 0.8257030 783.8272
## 0.04210526 0.8526316 1626.981 0.8260211 781.0815
## 0.04210526 0.8631579 1625.383 0.8263184 778.5497
## 0.04210526 0.8736842 1624.002 0.8265908 776.3261
## 0.04210526 0.8842105 1622.837 0.8268386 774.2760
## 0.04210526 0.8947368 1621.890 0.8270623 772.4349
## 0.04210526 0.9052632 1621.161 0.8272620 770.8630
## 0.04210526 0.9157895 1620.651 0.8274383 769.4260
## 0.04210526 0.9263158 1620.360 0.8275915 768.1913
## 0.04210526 0.9368421 1620.287 0.8277218 767.3754
## 0.04210526 0.9473684 1620.432 0.8278297 766.9569
## 0.04210526 0.9578947 1620.819 0.8279065 766.9184
## 0.04210526 0.9684211 1621.492 0.8279369 767.0220
## 0.04210526 0.9789474 1622.441 0.8279368 767.3846
## 0.04210526 0.9894737 1623.547 0.8279114 767.9179
## 0.04210526 1.0000000 1624.639 0.8278529 768.6860
## 0.05263158 0.8000000 1637.408 0.8242360 796.1282
## 0.05263158 0.8105263 1635.203 0.8246179 793.2558
## 0.05263158 0.8210526 1633.116 0.8249752 790.4644
## 0.05263158 0.8315789 1631.093 0.8253287 787.5629
## 0.05263158 0.8421053 1629.276 0.8256597 784.7421
## 0.05263158 0.8526316 1627.611 0.8259779 782.1662
## 0.05263158 0.8631579 1626.167 0.8262718 779.9035
## 0.05263158 0.8736842 1624.947 0.8265417 777.8830
## 0.05263158 0.8842105 1623.951 0.8267879 776.1158
## 0.05263158 0.8947368 1623.179 0.8270109 774.4921
## 0.05263158 0.9052632 1622.632 0.8272109 773.0840
## 0.05263158 0.9157895 1622.310 0.8273884 771.8317
## 0.05263158 0.9263158 1622.214 0.8275437 770.8828
## 0.05263158 0.9368421 1622.342 0.8276773 770.3604
## 0.05263158 0.9473684 1622.695 0.8277893 770.2500
## 0.05263158 0.9578947 1623.272 0.8278802 770.4072
## 0.05263158 0.9684211 1624.145 0.8279207 770.7594
## 0.05263158 0.9789474 1625.308 0.8279300 771.3430
## 0.05263158 0.9894737 1626.645 0.8279158 772.0595
## 0.05263158 1.0000000 1627.892 0.8278657 772.9889
## 0.06315789 0.8000000 1637.532 0.8242144 796.5193
## 0.06315789 0.8105263 1635.488 0.8245865 793.7879
## 0.06315789 0.8210526 1633.451 0.8249476 791.0514
## 0.06315789 0.8315789 1631.584 0.8252951 788.2870
## 0.06315789 0.8421053 1629.890 0.8256275 785.6468
## 0.06315789 0.8526316 1628.396 0.8259407 783.4649
## 0.06315789 0.8631579 1627.132 0.8262302 781.4300
## 0.06315789 0.8736842 1626.100 0.8264966 779.6719
## 0.06315789 0.8842105 1625.299 0.8267401 778.1326
## 0.06315789 0.8947368 1624.731 0.8269612 776.7504
## 0.06315789 0.9052632 1624.395 0.8271603 775.4892
## 0.06315789 0.9157895 1624.291 0.8273377 774.5648
## 0.06315789 0.9263158 1624.420 0.8274937 773.9205
## 0.06315789 0.9368421 1624.781 0.8276289 773.5923
## 0.06315789 0.9473684 1625.374 0.8277434 773.6843
## 0.06315789 0.9578947 1626.197 0.8278377 774.1672
## 0.06315789 0.9684211 1627.300 0.8278907 774.6892
## 0.06315789 0.9789474 1628.705 0.8279081 775.4247
## 0.06315789 0.9894737 1630.314 0.8279032 776.5186
## 0.06315789 1.0000000 1631.760 0.8278605 777.6964
## 0.07368421 0.8000000 1637.738 0.8241989 796.9050
## 0.07368421 0.8105263 1635.823 0.8245637 794.3508
## 0.07368421 0.8210526 1633.895 0.8249260 791.6353
## 0.07368421 0.8315789 1632.191 0.8252692 789.1041
## 0.07368421 0.8421053 1630.653 0.8256000 786.8485
## 0.07368421 0.8526316 1629.353 0.8259075 784.8898
## 0.07368421 0.8631579 1628.292 0.8261922 783.1740
## 0.07368421 0.8736842 1627.470 0.8264545 781.6704
## 0.07368421 0.8842105 1626.890 0.8266946 780.3200
## 0.07368421 0.8947368 1626.549 0.8269131 779.1108
## 0.07368421 0.9052632 1626.450 0.8271104 778.2312
## 0.07368421 0.9157895 1626.591 0.8272867 777.6039
## 0.07368421 0.9263158 1626.972 0.8274425 777.2800
## 0.07368421 0.9368421 1627.592 0.8275782 777.1814
## 0.07368421 0.9473684 1628.451 0.8276941 777.4615
## 0.07368421 0.9578947 1629.548 0.8277906 778.0005
## 0.07368421 0.9684211 1630.916 0.8278526 778.8693
## 0.07368421 0.9789474 1632.595 0.8278761 780.0075
## 0.07368421 0.9894737 1634.513 0.8278789 781.4383
## 0.07368421 1.0000000 1636.190 0.8278430 782.8435
## 0.08421053 0.8000000 1638.058 0.8241860 797.3405
## 0.08421053 0.8105263 1636.219 0.8245492 794.8565
## 0.08421053 0.8210526 1634.473 0.8249074 792.3585
## 0.08421053 0.8315789 1632.921 0.8252503 790.1821
## 0.08421053 0.8421053 1631.587 0.8255746 788.1906
## 0.08421053 0.8526316 1630.499 0.8258763 786.5015
## 0.08421053 0.8631579 1629.661 0.8261559 785.0487
## 0.08421053 0.8736842 1629.071 0.8264137 783.7361
## 0.08421053 0.8842105 1628.730 0.8266501 782.6377
## 0.08421053 0.8947368 1628.639 0.8268656 781.8686
## 0.08421053 0.9052632 1628.797 0.8270605 781.3347
## 0.08421053 0.9157895 1629.204 0.8272352 781.0349
## 0.08421053 0.9263158 1629.860 0.8273902 780.9363
## 0.08421053 0.9368421 1630.762 0.8275257 781.1351
## 0.08421053 0.9473684 1631.911 0.8276423 781.5617
## 0.08421053 0.9578947 1633.305 0.8277401 782.4285
## 0.08421053 0.9684211 1634.966 0.8278086 783.5899
## 0.08421053 0.9789474 1636.945 0.8278368 785.0542
## 0.08421053 0.9894737 1639.182 0.8278450 786.6634
## 0.08421053 1.0000000 1641.144 0.8278165 788.2773
## 0.09473684 0.8000000 1638.510 0.8241734 797.9016
## 0.09473684 0.8105263 1636.746 0.8245397 795.4687
## 0.09473684 0.8210526 1635.189 0.8248924 793.3868
## 0.09473684 0.8315789 1633.818 0.8252326 791.4605
## 0.09473684 0.8421053 1632.704 0.8255504 789.6859
## 0.09473684 0.8526316 1631.847 0.8258462 788.3153
## 0.09473684 0.8631579 1631.248 0.8261205 787.0551
## 0.09473684 0.8736842 1630.908 0.8263736 786.0017
## 0.09473684 0.8842105 1630.826 0.8266061 785.2973
## 0.09473684 0.8947368 1631.003 0.8268182 784.8744
## 0.09473684 0.9052632 1631.438 0.8270105 784.7020
## 0.09473684 0.9157895 1632.130 0.8271833 784.6224
## 0.09473684 0.9263158 1633.079 0.8273369 784.8459
## 0.09473684 0.9368421 1634.284 0.8274718 785.3350
## 0.09473684 0.9473684 1635.742 0.8275884 786.2146
## 0.09473684 0.9578947 1637.453 0.8276870 787.4105
## 0.09473684 0.9684211 1639.430 0.8277601 788.7539
## 0.09473684 0.9789474 1641.732 0.8277919 790.4008
## 0.09473684 0.9894737 1644.299 0.8278041 792.4600
## 0.09473684 1.0000000 1646.592 0.8277832 794.3876
## 0.10526316 0.8000000 1639.012 0.8241683 798.4797
## 0.10526316 0.8105263 1637.414 0.8245335 796.3703
## 0.10526316 0.8210526 1636.035 0.8248844 794.5291
## 0.10526316 0.8315789 1634.891 0.8252174 792.7750
## 0.10526316 0.8421053 1634.014 0.8255286 791.3708
## 0.10526316 0.8526316 1633.405 0.8258183 790.2048
## 0.10526316 0.8631579 1633.064 0.8260872 789.2252
## 0.10526316 0.8736842 1632.992 0.8263355 788.5132
## 0.10526316 0.8842105 1633.188 0.8265637 788.1901
## 0.10526316 0.8947368 1633.651 0.8267723 788.1484
## 0.10526316 0.9052632 1634.382 0.8269615 788.1860
## 0.10526316 0.9157895 1635.379 0.8271319 788.4404
## 0.10526316 0.9263158 1636.640 0.8272838 789.0227
## 0.10526316 0.9368421 1638.165 0.8274177 789.9979
## 0.10526316 0.9473684 1639.953 0.8275338 791.1721
## 0.10526316 0.9578947 1642.000 0.8276326 792.5859
## 0.10526316 0.9684211 1644.315 0.8277088 794.3763
## 0.10526316 0.9789474 1646.960 0.8277433 796.4570
## 0.10526316 0.9894737 1649.878 0.8277586 798.7159
## 0.10526316 1.0000000 1652.510 0.8277443 800.8477
## 0.11578947 0.8000000 1639.626 0.8241679 799.2381
## 0.11578947 0.8105263 1638.241 0.8245282 797.4189
## 0.11578947 0.8210526 1637.062 0.8248759 795.7098
## 0.11578947 0.8315789 1636.160 0.8252018 794.3098
## 0.11578947 0.8421053 1635.536 0.8255064 793.1432
## 0.11578947 0.8526316 1635.190 0.8257902 792.2822
## 0.11578947 0.8631579 1635.122 0.8260536 791.6245
## 0.11578947 0.8736842 1635.333 0.8262971 791.3161
## 0.11578947 0.8842105 1635.821 0.8265211 791.3443
## 0.11578947 0.8947368 1636.586 0.8267260 791.5316
## 0.11578947 0.9052632 1637.628 0.8269121 791.9195
## 0.11578947 0.9157895 1638.945 0.8270800 792.5698
## 0.11578947 0.9263158 1640.534 0.8272300 793.6021
## 0.11578947 0.9368421 1642.396 0.8273626 794.8365
## 0.11578947 0.9473684 1644.527 0.8274780 796.2876
## 0.11578947 0.9578947 1646.926 0.8275766 798.2356
## 0.11578947 0.9684211 1649.597 0.8276550 800.4001
## 0.11578947 0.9789474 1652.600 0.8276914 802.6660
## 0.11578947 0.9894737 1655.884 0.8277093 805.2068
## 0.11578947 1.0000000 1658.877 0.8277008 807.5841
## 0.12631579 0.8000000 1640.385 0.8241684 800.1978
## 0.12631579 0.8105263 1639.195 0.8245262 798.4837
## 0.12631579 0.8210526 1638.260 0.8248664 797.0622
## 0.12631579 0.8315789 1637.613 0.8251853 795.9646
## 0.12631579 0.8421053 1637.255 0.8254836 795.1034
## 0.12631579 0.8526316 1637.186 0.8257615 794.5419
## 0.12631579 0.8631579 1637.405 0.8260196 794.2696
## 0.12631579 0.8736842 1637.913 0.8262583 794.3320
## 0.12631579 0.8842105 1638.708 0.8264781 794.6365
## 0.12631579 0.8947368 1639.789 0.8266792 795.1870
## 0.12631579 0.9052632 1641.156 0.8268623 795.9471
## 0.12631579 0.9157895 1642.806 0.8270275 797.0122
## 0.12631579 0.9263158 1644.738 0.8271755 798.3034
## 0.12631579 0.9368421 1646.949 0.8273065 799.8984
## 0.12631579 0.9473684 1649.438 0.8274210 801.8492
## 0.12631579 0.9578947 1652.202 0.8275193 804.0912
## 0.12631579 0.9684211 1655.243 0.8275989 806.5709
## 0.12631579 0.9789474 1658.620 0.8276369 809.0791
## 0.12631579 0.9894737 1662.283 0.8276569 811.8636
## 0.12631579 1.0000000 1665.671 0.8276534 814.5830
## 0.13684211 0.8000000 1641.289 0.8241685 801.2337
## 0.13684211 0.8105263 1640.312 0.8245226 799.6983
## 0.13684211 0.8210526 1639.634 0.8248556 798.6307
## 0.13684211 0.8315789 1639.256 0.8251678 797.8107
## 0.13684211 0.8421053 1639.176 0.8254598 797.2210
## 0.13684211 0.8526316 1639.397 0.8257320 797.0378
## 0.13684211 0.8631579 1639.916 0.8259849 797.1659
## 0.13684211 0.8736842 1640.733 0.8262189 797.5355
## 0.13684211 0.8842105 1641.848 0.8264344 798.1680
## 0.13684211 0.8947368 1643.258 0.8266319 799.0627
## 0.13684211 0.9052632 1644.962 0.8268117 800.2041
## 0.13684211 0.9157895 1646.959 0.8269744 801.5514
## 0.13684211 0.9263158 1649.245 0.8271202 803.3136
## 0.13684211 0.9368421 1651.819 0.8272496 805.3203
## 0.13684211 0.9473684 1654.678 0.8273631 807.5619
## 0.13684211 0.9578947 1657.819 0.8274608 810.1632
## 0.13684211 0.9684211 1661.242 0.8275410 812.9046
## 0.13684211 0.9789474 1665.006 0.8275801 815.7220
## 0.13684211 0.9894737 1669.059 0.8276019 818.8352
## 0.13684211 1.0000000 1672.876 0.8276026 821.9239
## 0.14736842 0.8000000 1642.335 0.8241695 802.3745
## 0.14736842 0.8105263 1641.615 0.8245162 801.2545
## 0.14736842 0.8210526 1641.206 0.8248422 800.4291
## 0.14736842 0.8315789 1641.107 0.8251479 799.8450
## 0.14736842 0.8421053 1641.318 0.8254338 799.6691
## 0.14736842 0.8526316 1641.840 0.8257005 799.8625
## 0.14736842 0.8631579 1642.670 0.8259482 800.2687
## 0.14736842 0.8736842 1643.809 0.8261776 800.9595
## 0.14736842 0.8842105 1645.254 0.8263890 801.9502
## 0.14736842 0.8947368 1647.004 0.8265829 803.1593
## 0.14736842 0.9052632 1649.057 0.8267596 804.6551
## 0.14736842 0.9157895 1651.410 0.8269197 806.4847
## 0.14736842 0.9263158 1654.062 0.8270634 808.5611
## 0.14736842 0.9368421 1657.009 0.8271912 810.8726
## 0.14736842 0.9473684 1660.248 0.8273035 813.4750
## 0.14736842 0.9578947 1663.776 0.8274006 816.3706
## 0.14736842 0.9684211 1667.592 0.8274809 819.4174
## 0.14736842 0.9789474 1671.752 0.8275208 822.6373
## 0.14736842 0.9894737 1676.206 0.8275442 826.0982
## 0.14736842 1.0000000 1680.474 0.8275487 829.6293
## 0.15789474 0.8000000 1643.538 0.8241678 803.8305
## 0.15789474 0.8105263 1643.090 0.8245071 802.9769
## 0.15789474 0.8210526 1642.963 0.8248262 802.4061
## 0.15789474 0.8315789 1643.159 0.8251254 802.2364
## 0.15789474 0.8421053 1643.675 0.8254054 802.4176
## 0.15789474 0.8526316 1644.512 0.8256665 802.9057
## 0.15789474 0.8631579 1645.668 0.8259092 803.6094
## 0.15789474 0.8736842 1647.142 0.8261339 804.6866
## 0.15789474 0.8842105 1648.933 0.8263412 805.9801
## 0.15789474 0.8947368 1651.037 0.8265314 807.5572
## 0.15789474 0.9052632 1653.452 0.8267049 809.5161
## 0.15789474 0.9157895 1656.177 0.8268622 811.6370
## 0.15789474 0.9263158 1659.207 0.8270036 814.0269
## 0.15789474 0.9368421 1662.541 0.8271296 816.7211
## 0.15789474 0.9473684 1666.173 0.8272405 819.6860
## 0.15789474 0.9578947 1670.102 0.8273368 822.8679
## 0.15789474 0.9684211 1674.324 0.8274167 826.2949
## 0.15789474 0.9789474 1678.894 0.8274570 829.9155
## 0.15789474 0.9894737 1683.760 0.8274816 833.8697
## 0.15789474 1.0000000 1688.449 0.8274919 837.7241
## 0.16842105 0.8000000 1644.930 0.8241627 805.5032
## 0.16842105 0.8105263 1644.763 0.8244949 804.9234
## 0.16842105 0.8210526 1644.930 0.8248073 804.7077
## 0.16842105 0.8315789 1645.430 0.8251003 804.8776
## 0.16842105 0.8421053 1646.262 0.8253745 805.3941
## 0.16842105 0.8526316 1647.425 0.8256302 806.1231
## 0.16842105 0.8631579 1648.917 0.8258680 807.2437
## 0.16842105 0.8736842 1650.736 0.8260883 808.6241
## 0.16842105 0.8842105 1652.881 0.8262916 810.2746
## 0.16842105 0.8947368 1655.348 0.8264782 812.3418
## 0.16842105 0.9052632 1658.136 0.8266486 814.5910
## 0.16842105 0.9157895 1661.240 0.8268032 817.0408
## 0.16842105 0.9263158 1664.658 0.8269424 819.8316
## 0.16842105 0.9368421 1668.386 0.8270666 822.8908
## 0.16842105 0.9473684 1672.420 0.8271762 826.1590
## 0.16842105 0.9578947 1676.757 0.8272715 829.6881
## 0.16842105 0.9684211 1681.393 0.8273508 833.6413
## 0.16842105 0.9789474 1686.380 0.8273913 837.7712
## 0.16842105 0.9894737 1691.666 0.8274169 841.9847
## 0.16842105 1.0000000 1696.785 0.8274326 846.1587
## 0.17894737 0.8000000 1646.523 0.8241551 807.3583
## 0.17894737 0.8105263 1646.650 0.8244804 807.0957
## 0.17894737 0.8210526 1647.122 0.8247863 807.2335
## 0.17894737 0.8315789 1647.937 0.8250733 807.7285
## 0.17894737 0.8421053 1649.095 0.8253418 808.5181
## 0.17894737 0.8526316 1650.594 0.8255924 809.6572
## 0.17894737 0.8631579 1652.432 0.8258254 811.0895
## 0.17894737 0.8736842 1654.607 0.8260414 812.8021
## 0.17894737 0.8842105 1657.116 0.8262407 814.9490
## 0.17894737 0.8947368 1659.957 0.8264238 817.3163
## 0.17894737 0.9052632 1663.126 0.8265912 819.8924
## 0.17894737 0.9157895 1666.619 0.8267431 822.7604
## 0.17894737 0.9263158 1670.434 0.8268801 825.9592
## 0.17894737 0.9368421 1674.565 0.8270025 829.3290
## 0.17894737 0.9473684 1679.009 0.8271107 832.9486
## 0.17894737 0.9578947 1683.761 0.8272051 837.0569
## 0.17894737 0.9684211 1688.819 0.8272834 841.3935
## 0.17894737 0.9789474 1694.231 0.8273239 845.7289
## 0.17894737 0.9894737 1699.944 0.8273504 850.2820
## 0.17894737 1.0000000 1705.467 0.8273709 854.8477
## 0.18947368 0.8000000 1648.325 0.8241448 809.3866
## 0.18947368 0.8105263 1648.756 0.8244635 809.5517
## 0.18947368 0.8210526 1649.543 0.8247632 809.9924
## 0.18947368 0.8315789 1650.684 0.8250443 810.7452
## 0.18947368 0.8421053 1652.178 0.8253074 811.8904
## 0.18947368 0.8526316 1654.023 0.8255529 813.4040
## 0.18947368 0.8631579 1656.217 0.8257813 815.1599
## 0.18947368 0.8736842 1658.757 0.8259931 817.3588
## 0.18947368 0.8842105 1661.640 0.8261886 819.8090
## 0.18947368 0.8947368 1664.863 0.8263683 822.5265
## 0.18947368 0.9052632 1668.422 0.8265326 825.5497
## 0.18947368 0.9157895 1672.312 0.8266819 828.8014
## 0.18947368 0.9263158 1676.531 0.8268167 832.2915
## 0.18947368 0.9368421 1681.073 0.8269373 836.0084
## 0.18947368 0.9473684 1685.934 0.8270441 840.2082
## 0.18947368 0.9578947 1691.109 0.8271374 844.6523
## 0.18947368 0.9684211 1696.595 0.8272146 849.1594
## 0.18947368 0.9789474 1702.438 0.8272550 853.8308
## 0.18947368 0.9894737 1708.584 0.8272822 858.7855
## 0.18947368 1.0000000 1714.480 0.8273070 863.7013
## 0.20000000 0.8000000 1650.340 0.8241321 811.7262
## 0.20000000 0.8105263 1651.085 0.8244444 812.1765
## 0.20000000 0.8210526 1652.197 0.8247380 812.9125
## 0.20000000 0.8315789 1653.675 0.8250135 813.9728
## 0.20000000 0.8421053 1655.515 0.8252713 815.5258
## 0.20000000 0.8526316 1657.716 0.8255119 817.3595
## 0.20000000 0.8631579 1660.275 0.8257358 819.6078
## 0.20000000 0.8736842 1663.189 0.8259434 822.1026
## 0.20000000 0.8842105 1666.455 0.8261352 824.9574
## 0.20000000 0.8947368 1670.068 0.8263116 828.0905
## 0.20000000 0.9052632 1674.024 0.8264729 831.4404
## 0.20000000 0.9157895 1678.320 0.8266197 835.0089
## 0.20000000 0.9263158 1682.950 0.8267523 838.8427
## 0.20000000 0.9368421 1687.910 0.8268711 843.0953
## 0.20000000 0.9473684 1693.195 0.8269765 847.6169
## 0.20000000 0.9578947 1698.799 0.8270688 852.2574
## 0.20000000 0.9684211 1704.719 0.8271446 857.0843
## 0.20000000 0.9789474 1710.999 0.8271847 862.1782
## 0.20000000 0.9894737 1717.582 0.8272125 867.6137
## 0.20000000 1.0000000 1723.810 0.8272409 872.8446
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.9473684 and lambda
## = 0.01052632.
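With the selected pair of tuning parameters, the final elastic net model can be fit with enet() in the same way as the ridge and LASSO models above (a sketch plugging in the values chosen by cross-validation).
enetModel <- enet(x = as.matrix(trainx), y = trainy,
                  lambda = 0.01052632, normalize = TRUE)
enetPred <- predict(enetModel, newx = as.matrix(trainx),
                    s = 0.9473684, mode = "fraction", type = "fit")
head(enetPred$fit)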
Adding penalties is a general technique that can be applied to many
methods other than linear regression. In this section, we introduce
the penalized generalized linear model using glmnet() from the glmnet
package. The default family option in glmnet() is gaussian, which
corresponds to the linear regression we have discussed so far in this
chapter. The parameterization, however, is a little different in the
generalized linear model framework: the penalty is written as
\(\lambda\left[(1-\alpha)\|\beta\|_2^2/2 + \alpha\|\beta\|_1\right]\),
so we have a mixing parameter \(\alpha\) and an overall strength
\(\lambda\). Let's start from our previous example, using the same
training data but glmnet() to fit the model:
dat <- read.csv("http://bit.ly/2P5gTw4")
# data cleaning: delete wrong observations with expense < 0
dat <- subset(dat, store_exp > 0 & online_exp > 0)
# get predictors
trainx <- dat[, grep("Q", names(dat))]
# get response
trainy <- dat$store_exp + dat$online_exp
glmfit = glmnet(as.matrix(trainx), trainy)
We can extract useful information from the fitted model using the
plot() and print() functions.
plot(glmfit, label = T)
print(glmfit)
##
## Call: glmnet(x = as.matrix(trainx), y = trainy)
##
## Df %Dev Lambda
## 1 0 0.00 3042.00
## 2 2 10.38 2771.00
## 3 2 19.19 2525.00
## 4 2 26.50 2301.00
## 5 3 32.64 2096.00
## 6 3 38.94 1910.00
## 7 3 44.17 1741.00
## 8 3 48.52 1586.00
## 9 3 52.12 1445.00
## 10 3 55.12 1317.00
## 11 3 57.60 1200.00
## 12 3 59.67 1093.00
## 13 3 61.38 996.00
## 14 3 62.80 907.50
## 15 3 63.98 826.90
## 16 3 64.96 753.40
## 17 3 65.78 686.50
## 18 3 66.45 625.50
## 19 3 67.01 569.90
## 20 3 67.48 519.30
## 21 3 67.87 473.20
## 22 3 68.19 431.10
## 23 4 68.50 392.80
## 24 4 68.78 357.90
## 25 6 69.02 326.10
## 26 6 69.21 297.20
## 27 6 69.38 270.80
## 28 6 69.51 246.70
## 29 6 69.63 224.80
## 30 6 69.72 204.80
## 31 6 69.80 186.60
## 32 6 69.86 170.10
## 33 6 69.92 154.90
## 34 6 69.96 141.20
## 35 6 70.00 128.60
## 36 7 70.06 117.20
## 37 7 70.19 106.80
## 38 7 70.30 97.31
## 39 7 70.39 88.67
## 40 7 70.47 80.79
## 41 7 70.53 73.61
## 42 8 70.58 67.07
## 43 9 70.64 61.11
## 44 9 70.70 55.68
## 45 9 70.74 50.74
## 46 9 70.78 46.23
## 47 9 70.81 42.12
## 48 9 70.84 38.38
## 49 9 70.86 34.97
## 50 9 70.88 31.86
## 51 9 70.89 29.03
## 52 9 70.90 26.45
## 53 9 70.91 24.10
## 54 9 70.92 21.96
## 55 9 70.93 20.01
## 56 9 70.93 18.23
## 57 9 70.94 16.61
## 58 9 70.94 15.14
## 59 9 70.95 13.79
## 60 9 70.95 12.57
## 61 9 70.95 11.45
## 62 9 70.95 10.43
## 63 9 70.96 9.51
## 64 9 70.96 8.66
## 65 9 70.96 7.89
## 66 9 70.96 7.19
## 67 9 70.96 6.55
## 68 9 70.96 5.97
We can also specify the value of \(\lambda\) in the LASSO penalty
through the s argument of the coef() function. For example, the
following code sets \(\lambda\) to 1200; the resulting model has three
coefficients with non-zero estimates (Q1, Q2, and Q3).
coef(glmfit, s = 1200)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2255.2221
## Q1 -390.9214
## Q2 653.6437
## Q3 624.4068
## Q4 .
## Q5 .
## Q6 .
## Q7 .
## Q8 .
## Q9 .
## Q10 .
You can apply models with different values of the tuning parameter to
new data using predict():
newdat = matrix(sample(1:9, 30, replace = T), nrow = 3)
predict(glmfit, newdat, s = c(1741, 2000))
## s1 s2
## [1,] 4008.107 3795.615
## [2,] 3051.974 3084.707
## [3,] 7099.473 7025.456
The tuning parameter \(\lambda\) can be determined from cross-validation performance, for example by choosing the value that minimizes the cross-validated error, or the largest value whose error is within one standard error of that minimum.
cvfit = cv.glmnet(as.matrix(trainx), trainy)
plot(cvfit)
# lambda with minimum mean cross-validated error
cvfit$lambda.min
## [1] 7.893144
# largest lambda with error within one standard error of the minimum
cvfit$lambda.1se
## [1] 1199.688
# coefficient estimates for model with the error
# that is within one standard error of the minimum
coef(cvfit, s = "lambda.1se")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 2255.3136
## Q1 -391.0562
## Q2 653.7079
## Q3 624.5119
## Q4 .
## Q5 .
## Q6 .
## Q7 .
## Q8 .
## Q9 .
## Q10 .
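The cv.glmnet object can also be used directly for prediction at either of the two reported \(\lambda\) values (a brief sketch, reusing the training predictors as new data).
# predictions at the one-standard-error lambda
head(predict(cvfit, newx = as.matrix(trainx), s = "lambda.1se"))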
In this section, we go over penalized logistic regression using
glmnet() with family = "binomial". As before, the default is the
LASSO penalty, and the tuning parameter can be determined by
cross-validation performance: the minimum error or the
one-standard-error rule.
dat <- read.csv("http://bit.ly/2KXb1Qi")
trainx = dplyr::select(dat, -y)
trainy = dat$y
fit <- glmnet(as.matrix(trainx), trainy, family = "binomial")
levels(as.factor(trainy))
## [1] "0" "1"
newdat = as.matrix(trainx[1:3, ])
predict(fit, newdat, type = "link", s = c(2.833e-02, 3.110e-02))
## s1 s2
## 1 0.1943472 0.1442796
## 2 -0.9913159 -1.0076600
## 3 -0.5840566 -0.5496074
cvfit = cv.glmnet(as.matrix(trainx), trainy,
family = "binomial", type.measure = "class")
plot(cvfit)
cvfit$lambda.min
## [1] 0.002522398
cvfit$lambda.1se
## [1] 0.005309407
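For classification we often want predicted probabilities or class labels rather than the link; the fitted cv.glmnet object supports both through the type argument (a minimal sketch on the three rows in newdat above).
# predicted probability and class at the one-standard-error lambda
predict(cvfit, newdat, s = "lambda.1se", type = "response")
predict(cvfit, newdat, s = "lambda.1se", type = "class")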
data("sim1_da1")
# the last column of sim1_da1 response variable y
# trainx is the explanatory variable matrix
trainx = dplyr::select(sim1_da1, -y)
# save response variable as as trainy
trainy = sim1_da1$y
# get the group indicator
index <- gsub("\\..*", "", names(trainx))
index[1:50]
## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q3" "Q4" "Q4" "Q5" "Q5" "Q6" "Q6"
## [13] "Q7" "Q7" "Q8" "Q8" "Q9" "Q9" "Q10" "Q10" "Q11" "Q11" "Q12" "Q12"
## [25] "Q13" "Q13" "Q14" "Q14" "Q15" "Q15" "Q16" "Q16" "Q17" "Q17" "Q18" "Q18"
## [37] "Q19" "Q19" "Q20" "Q20" "Q21" "Q21" "Q22" "Q22" "Q23" "Q23" "Q24" "Q24"
## [49] "Q25" "Q25"
# Tune over 100 values
nlam <- 100
# set the type of prediction
# - `link`: return the predicted link function
# - `response`: return the predicted probability
# number of cross-validation folds
kfold <- 10
cv_fit <- cv_glasso(trainx, trainy,
nlam = nlam, kfold = kfold, type = "link")
str(cv_fit, width = 60, strict.width = "cut", list.len = 9)
## List of 11
## $ lambda : num [1:100] 52.5 50.4 48.4 46.4 44...
## $ pred :'data.frame': 800 obs. of 100 varia..
## ..$ X52.4935652246023 : num [1:800] -0.0556 -0.0556 -0.0..
## ..$ X50.3938226156183 : num [1:800] -0.0556 -0.0556 -0.0..
## ..$ X48.3780697109935 : num [1:800] -0.0559 -0.0559 -0.0..
## ..$ X46.4429469225538 : num [1:800] -0.0589 -0.0589 -0.0..
## ..$ X44.5852290456516 : num [1:800] -0.0617 -0.0617 -0.0..
## ..$ X42.8018198838256 : num [1:800] -0.06438 -0.06438 -0..
## ..$ X41.0897470884725 : num [1:800] -0.0729 -0.0417 -0.1..
## ..$ X39.4461572049336 : num [1:800] -0.08801 -0.00791 -0..
## ..$ X37.8683109167363 : num [1:800] -0.1092 0.0212 -0.08..
## .. [list output truncated]
## $ auc : num [1:100] 0.573 0.568 0.557 0.518..
## $ log_likelihood : num [1:100] -556 -556 -556 -555 -55..
## $ maxrho : num [1:100] -0.0492 -0.0306 0.0302 ..
## $ lambda.max.auc : Named num [1:2] 0.922 0.949
## ..- attr(*, "names")= chr [1:2] "lambda" "auc"
## $ lambda.1se.auc : Named num [1:2] 16.738 0.828
## ..- attr(*, "names")= chr [1:2] "" "se.auc"
## $ lambda.max.loglike: Named num [1:2] 1.57 -233.49
## ..- attr(*, "names")= chr [1:2] "lambda" "loglike"
## $ lambda.1se.loglike: Named num [1:2] 9.07 -349.13
## ..- attr(*, "names")= chr [1:2] "lambda" "se.loglike"
## [list output truncated]
## - attr(*, "class")= chr "cv_glasso"
plot(cv_fit)
We then refit the model at the \(\lambda\) value that maximizes the cross-validated AUC (lambda.max.auc = 0.922 in the output above):
fitgl <- fitglasso(trainx, trainy,
                   lambda = 0.922, na_action = na.pass)
## Lambda: 0.922 nr.var: 229
head(coef(fitgl))
## 0.922
## Intercept -53.1803922
## Q1.A 1.7566722
## Q1.B 1.7190500
## Q2.A 2.1699192
## Q2.B 0.6939251
## Q3.A 2.1020142
Finally, we obtain predictions for the training data:
prey <- predict_glasso(fitgl, trainx)
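Since cv_glasso() was run with type = "link", these predictions are most naturally read on the log-odds scale; applying the logistic function converts them to probabilities (a small sketch that assumes prey holds values of the linear predictor).
# convert the linear predictor to predicted probabilities
prob <- 1 / (1 + exp(-prey))
head(prob)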