In this notebook, we introduce regression models, including linear regression, principal component regression, and partial least squares regression.
# Install any packages missing from the local library, then load them all.
p_needed <- c("caret", "dplyr", "lattice",
              "elasticnet", "lars", "corrplot",
              "pls")
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
# Use library() rather than require() for loading: require() returns FALSE
# on a failed load instead of raising an error, so a missing or broken
# package would fail silently and break later chunks in confusing ways.
invisible(lapply(p_needed, library, character.only = TRUE))
Multivariate linear regression (i.e., typical least squares regression) is one of the simplest supervised learning methods.
# Load the expenditure data and apply a basic pre-processing step:
# keep only customers with positive store AND online expenses.
dat <- read.csv("http://bit.ly/2P5gTw4")
positive_rows <- which(dat$store_exp > 0 & dat$online_exp > 0)
dat <- dat[positive_rows, ]
# Response variable: the sum of the two expense types.
# Explanatory variables: the 10 survey questions (columns named Q*).
modeldat <- dat[, grep("Q", names(dat))]
modeldat$total_exp <- dat$store_exp + dat$online_exp
Before fitting a linear regression, let us do a few quick checks, including identifying outliers and highly correlated explanatory variables.
# Quick look at the response distribution before modeling: a histogram
# and a boxplot of total_exp, drawn side by side in a 1 x 2 layout.
par(mfrow = c(1, 2))
hist(modeldat$total_exp, main = "", xlab = "total_exp")
boxplot(modeldat$total_exp)
# Remove response outliers. NOTE: despite the "Z-score" label, the
# denominator is mad(y) (median absolute deviation), i.e. this is a
# robust, modified z-type score rather than (y - mean) / sd.
y <- modeldat$total_exp
zs <- (y - mean(y)) / mad(y)
# Keep rows whose score is <= 3.5. A logical mask is used instead of
# the original modeldat[-which(zs > 3.5), ]: when no point exceeds the
# cutoff, which() returns integer(0) and the negative subscript
# modeldat[-integer(0), ] silently drops EVERY row.
modeldat <- modeldat[zs <= 3.5, ]
# Check correlation among the survey-question input features.
correlation <- cor(modeldat[, grep("Q", names(modeldat))])
corrplot.mixed(correlation, order = "hclust", tl.pos = "lt",
               upper = "ellipse")
# Remove highly correlated variables with caret::findCorrelation().
highcor <- findCorrelation(correlation, cutoff = 0.75)
# Guard the removal: if no pair exceeds the cutoff, highcor is
# integer(0) and modeldat[, -integer(0)] would drop ALL columns.
if (length(highcor) > 0) {
  modeldat <- modeldat[, -highcor]
}
Now we are ready to fit a linear regression model. For this case, a
log transformation of total_exp
is applied before fitting
the regression model.
# Fit ordinary least squares on the log of total_exp; the log
# transformation addresses the right skew seen in the histogram.
# "." uses every remaining survey question in modeldat as a predictor.
lmfit <- lm(log(total_exp) ~ ., data = modeldat)
summary(lmfit)
##
## Call:
## lm(formula = log(total_exp) ~ ., data = modeldat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17494 -0.13719 0.01284 0.14163 0.56227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.098314 0.054286 149.177 < 2e-16 ***
## Q1 -0.145340 0.008823 -16.474 < 2e-16 ***
## Q2 0.102275 0.019492 5.247 1.98e-07 ***
## Q3 0.254450 0.018348 13.868 < 2e-16 ***
## Q6 -0.227684 0.011520 -19.764 < 2e-16 ***
## Q8 -0.090706 0.016497 -5.498 5.15e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2262 on 805 degrees of freedom
## Multiple R-squared: 0.8542, Adjusted R-squared: 0.8533
## F-statistic: 943.4 on 5 and 805 DF, p-value: < 2.2e-16
# Compute a 90% confidence interval for each regression coefficient.
confint(lmfit, level = 0.9)
## 5 % 95 %
## (Intercept) 8.00891811 8.18771037
## Q1 -0.15986889 -0.13081186
## Q2 0.07017625 0.13437445
## Q3 0.22423576 0.28466333
## Q6 -0.24665434 -0.20871330
## Q8 -0.11787330 -0.06353905
After the model fitting, we need to check typical diagnostic plots and identify potential outliers.
# Standard lm diagnostic plots in a 2 x 2 grid: residuals vs fitted,
# normal Q-Q, scale-location, and Cook's distance.
par(mfrow = c(2, 2))
for (panel in 1:4) {
  plot(lmfit, which = panel)
}
# Inspect the three observations flagged as outliers by the diagnostics.
flagged <- c("155", "678", "960")
modeldat[row.names(modeldat) %in% flagged, ]
## Q1 Q2 Q3 Q6 Q8 total_exp
## 155 4 2 1 4 4 351.8728
## 678 2 1 1 1 2 1087.3020
## 960 2 1 1 1 3 658.2720
# To see why a point is an outlier, pull every record sharing its
# survey responses (here, the last outlier listed above, row 960).
# Uses <- for assignment and spaced == comparisons per R style.
datcheck <- modeldat %>%
  filter(Q1 == 2 & Q2 == 1 & Q3 == 1 & Q6 == 1 & Q8 == 3)
nrow(datcheck)
## [1] 87
# All of the other 86 records with the same survey answers have much
# higher total expenses, which is why row 960 stands out.
summary(datcheck$total_exp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 658.3 1883.5 2214.5 2204.0 2554.1 3196.5
In addition to removing highly correlated explanatory variables, we can also try principal component regression, which uses the first few principal components as model inputs. Principal component analysis is unsupervised; the corresponding supervised version is partial least squares regression. In this section, we will use the expenditure data again to illustrate these two methods.
First, let us load and pre-process the data.
# Load the data and pre-process: drop income outliers and rows with
# missing income before tuning the component-based models.
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
ymad <- mad(sim.dat$income, na.rm = TRUE)
# Robust z-type score for income; rows with NA income get NA scores.
zs <- (sim.dat$income - mean(sim.dat$income, na.rm = TRUE)) / ymad
# Row indices to drop: outliers (score > 3.5) and missing incomes.
# BUG FIX: the original used which(na.omit(zs > 3.5)), but na.omit()
# shortens the vector first, so which() returned positions in the
# shortened vector -- misaligned with sim.dat's rows. which() already
# ignores NA, so no na.omit() is needed.
idex <- c(which(zs > 3.5), which(is.na(zs)))
# Guard: sim.dat[-integer(0), ] would silently return zero rows.
if (length(idex) > 0) {
  sim.dat <- sim.dat[-idex, ]
}
xtrain <- dplyr::select(sim.dat, Q1:Q10)
ytrain <- sim.dat$income
set.seed(100)
# 10-fold cross-validation, shared by the PCR and PLS tuning below.
ctrl <- trainControl(method = "cv", number = 10)
The following code is for Principal Component Regression.
# Principal Component Regression, tuned over the number of components
# via 10-fold cross-validation (ctrl defined above).
# Set random seed so the CV fold assignment is reproducible
set.seed(100)
pcrTune <- train(x = xtrain, y = ytrain,
method = "pcr",
# set hyper-parameter tuning range: 1 to 10 principal components
tuneGrid = expand.grid(.ncomp = 1:10),
trControl = ctrl)
pcrTune
## Principal Component Analysis
##
## 772 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 695, 695, 693, 696, 696, 696, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 45892.97 0.03252973 36575.13
## 2 32486.72 0.52174011 24072.48
## 3 23255.04 0.75926214 14523.61
## 4 23281.60 0.75862935 14536.21
## 5 23150.89 0.76141262 14232.40
## 6 23128.31 0.76182335 14151.47
## 7 23110.51 0.76245006 14150.81
## 8 23117.28 0.76232121 14149.15
## 9 23011.52 0.76463737 13830.62
## 10 22995.60 0.76502437 13729.15
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.
The following code is for Partial Least Squares Regression.
# Partial Least Squares regression, tuned over the number of components.
# Reset the seed so the CV folds match the PCR run above, making the
# resampled RMSE values directly comparable; the original omitted this,
# which is why the fold sample sizes differ between the two printouts.
set.seed(100)
plsTune <- train(xtrain, ytrain,
                 method = "pls",
                 # set hyper-parameter tuning range: 1 to 10 components
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
plsTune
## Partial Least Squares
##
## 772 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 696, 695, 693, 695, 696, 695, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 27787.94 0.6479066 19818.89
## 2 24496.06 0.7269430 15962.14
## 3 23233.86 0.7541780 14347.75
## 4 23055.88 0.7576908 13764.95
## 5 23017.60 0.7583535 13664.30
## 6 23022.02 0.7582940 13657.96
## 7 23020.50 0.7583379 13655.22
## 8 23020.74 0.7583338 13655.36
## 9 23020.71 0.7583347 13655.27
## 10 23020.72 0.7583344 13655.30
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 5.
# Variable importance from the tuned PLS model (scale = FALSE keeps the
# raw importance scores), plotted for the top 10 predictors.
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 10, scales = list(y = list(cex = 0.95)))
The plot below shows the performance comparison of these two methods as a function of the number of components.
# Combine the tuning results of both models and plot cross-validated
# RMSE against the number of components for a side-by-side comparison.
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"
plsPlotData <- rbind(plsResamples, pcrResamples)
# lattice::xyplot draws one curve per model; "o" = points + lines,
# "g" adds a background grid.
xyplot(RMSE ~ ncomp,
       data = plsPlotData,
       groups = Model,
       type = c("o", "g"),
       xlab = "# Components",
       ylab = "RMSE (Cross-Validation)",
       auto.key = list(columns = 2))