• Load R Packages
  • Linear Regression
  • Principal Component Regression and Partial Least Square Regression

In this notebook, we introduce regression models including linear regression, principal component regression, and partial least squares regression.

Load R Packages

# install packages from CRAN
p_needed <- c('caret', 'dplyr', 'lattice',
              'elasticnet', 'lars', 'corrplot', 
              'pls')

packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
    install.packages(p_to_install)
}

lapply(p_needed, require, character.only = TRUE)

Linear Regression

Multiple linear regression (i.e., the typical least squares regression) is one of the simplest supervised learning methods.
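For reference, with response $y_i$ and predictors $x_{i1}, \ldots, x_{ip}$, the model and the ordinary least squares criterion are:

$$
y_i = \beta_0 + \beta_1 x_{i1} + \cdots + \beta_p x_{ip} + \varepsilon_i, \qquad
\hat{\beta} = \arg\min_{\beta} \sum_{i=1}^{n} \Big( y_i - \beta_0 - \sum_{j=1}^{p} \beta_j x_{ij} \Big)^2
$$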

# Let us use the expenditure data and apply a basic pre-processing step: removing records with non-positive expenses.

dat <- read.csv("http://bit.ly/2P5gTw4")
dat <- subset(dat, store_exp > 0 & online_exp > 0)
# response variable: the sum of the expenses 
# explanatory variables: 10 survey questions

modeldat <- dat[, grep("Q", names(dat))]
modeldat$total_exp <- dat$store_exp + dat$online_exp

Before fitting a linear regression, let us do a few quick checks, including identifying outliers and highly correlated explanatory variables.

par(mfrow = c(1, 2))
hist(modeldat$total_exp, main = "", xlab = "total_exp")
boxplot(modeldat$total_exp)

# Next, let us remove outliers using a robust Z-score (scaled by the MAD)

y <- modeldat$total_exp
# Flag data points with a robust Z-score larger than 3.5
zs <- (y - mean(y))/mad(y)
modeldat <- modeldat[-which(zs > 3.5), ]
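# Optional check: how many observations does the 3.5 cutoff flag?
sum(zs > 3.5)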
# Now let us check correlation among input features.

correlation <- cor(modeldat[, grep("Q", names(modeldat))])
corrplot.mixed(correlation, order = "hclust", tl.pos = "lt", 
    upper = "ellipse")

# Remove highly correlated variables using the findCorrelation() function from the caret package.

highcor <- findCorrelation(correlation, cutoff = 0.75)
modeldat <- modeldat[, -highcor]
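# Optional check: which survey questions were dropped as highly correlated
colnames(correlation)[highcor]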

Now we are ready to fit a linear regression model. For this case, a log transformation of total_exp is applied before fitting the regression model.

lmfit <- lm(log(total_exp) ~ ., data = modeldat)
summary(lmfit)
## 
## Call:
## lm(formula = log(total_exp) ~ ., data = modeldat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17494 -0.13719  0.01284  0.14163  0.56227 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.098314   0.054286 149.177  < 2e-16 ***
## Q1          -0.145340   0.008823 -16.474  < 2e-16 ***
## Q2           0.102275   0.019492   5.247 1.98e-07 ***
## Q3           0.254450   0.018348  13.868  < 2e-16 ***
## Q6          -0.227684   0.011520 -19.764  < 2e-16 ***
## Q8          -0.090706   0.016497  -5.498 5.15e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2262 on 805 degrees of freedom
## Multiple R-squared:  0.8542, Adjusted R-squared:  0.8533 
## F-statistic: 943.4 on 5 and 805 DF,  p-value: < 2.2e-16
# Find confidence intervals for the model parameters:
confint(lmfit, level = 0.9)
##                     5 %        95 %
## (Intercept)  8.00891811  8.18771037
## Q1          -0.15986889 -0.13081186
## Q2           0.07017625  0.13437445
## Q3           0.22423576  0.28466333
## Q6          -0.24665434 -0.20871330
## Q8          -0.11787330 -0.06353905
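Because the model is fit on the log scale, exponentiating the coefficients gives approximate multiplicative effects on total_exp; as an optional check:

# Back-transform coefficients to multiplicative effects on total_exp
exp(coef(lmfit))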

After fitting the model, we should check the standard diagnostic plots and identify potential outliers.

par(mfrow = c(2, 2))
plot(lmfit, which = 1)
plot(lmfit, which = 2)
plot(lmfit, which = 3)
plot(lmfit, which = 4)

# We see three outliers after model fitting
modeldat[which(row.names(modeldat) %in% c(960, 678, 155)), ]
##     Q1 Q2 Q3 Q6 Q8 total_exp
## 155  4  2  1  4  4  351.8728
## 678  2  1  1  1  2 1087.3020
## 960  2  1  1  1  3  658.2720
# To see why these are outliers, we can compare each one with the data points that share the same survey responses, for example the last outlier listed above (row 960).
datcheck = modeldat %>% filter(Q1 ==2 & Q2 == 1 & Q3 == 1 & Q6 == 1 & Q8 == 3) 
nrow(datcheck)
## [1] 87
# It is now easy to see why this record is an outlier: the other 86 records with the same survey responses have much higher total expenses.
summary(datcheck$total_exp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   658.3  1883.5  2214.5  2204.0  2554.1  3196.5
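Once we are happy with the fit, the model can be used to predict total expenses for new survey responses. Below is a minimal sketch with a hypothetical respondent (newquiz is made up for illustration); remember to back-transform from the log scale:

# Hypothetical new respondent (illustration only)
newquiz <- data.frame(Q1 = 2, Q2 = 1, Q3 = 1, Q6 = 1, Q8 = 3)
# predict() returns log(total_exp); exp() maps it back to the original scale
exp(predict(lmfit, newdata = newquiz, interval = "prediction"))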

Principal Component Regression and Partial Least Square Regression

In addition to removing highly correlated explanatory variables, we can also try principal component regression (PCR), which uses the first few principal components as model inputs. Principal component analysis is unsupervised; its supervised counterpart is partial least squares (PLS) regression. In this section, we will use the expenditure data again to illustrate these two methods.

First, let us load and pre-process the data.

# Load Data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
ymad <- mad(na.omit(sim.dat$income))

# Calculate Z values
zs <- (sim.dat$income - mean(na.omit(sim.dat$income)))/ymad
# which(na.omit(zs > 3.5)) finds outliers
# which(is.na(zs)) finds missing values
idex <- c(which(na.omit(zs > 3.5)), which(is.na(zs)))
# Remove rows with outlier and missing values
sim.dat <- sim.dat[-idex, ]
xtrain = dplyr::select(sim.dat, Q1:Q10)
ytrain = sim.dat$income
set.seed(100)
ctrl <- trainControl(method = "cv", number = 10)
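Before turning to caret, here is a minimal sketch of what PCR does under the hood, keeping three components purely for illustration: run PCA on the predictors, then regress the response on the component scores.

# Manual PCR sketch (three components chosen only for illustration)
pca <- prcomp(xtrain, center = TRUE, scale. = TRUE)
# Regress income on the first three principal component scores
pcr_manual <- lm(ytrain ~ pca$x[, 1:3])
summary(pcr_manual)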

The following code is for Principal Component Regression.

# Set random seed
set.seed(100)
pcrTune <- train(x = xtrain, y = ytrain,
                 method = "pcr",
                 # set hyper-parameter tuning range
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
pcrTune
## Principal Component Analysis 
## 
## 772 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 695, 695, 693, 696, 696, 696, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared    MAE     
##    1     45892.97  0.03252973  36575.13
##    2     32486.72  0.52174011  24072.48
##    3     23255.04  0.75926214  14523.61
##    4     23281.60  0.75862935  14536.21
##    5     23150.89  0.76141262  14232.40
##    6     23128.31  0.76182335  14151.47
##    7     23110.51  0.76245006  14150.81
##    8     23117.28  0.76232121  14149.15
##    9     23011.52  0.76463737  13830.62
##   10     22995.60  0.76502437  13729.15
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.

The following code is for Partial Least Square Regression.

plsTune <- train(xtrain, ytrain, 
                 method = "pls", 
                 # set hyper-parameter tuning range
                 tuneGrid = expand.grid(.ncomp = 1:10),
                 trControl = ctrl)
plsTune
## Partial Least Squares 
## 
## 772 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 696, 695, 693, 695, 696, 695, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     27787.94  0.6479066  19818.89
##    2     24496.06  0.7269430  15962.14
##    3     23233.86  0.7541780  14347.75
##    4     23055.88  0.7576908  13764.95
##    5     23017.60  0.7583535  13664.30
##    6     23022.02  0.7582940  13657.96
##    7     23020.50  0.7583379  13655.22
##    8     23020.74  0.7583338  13655.36
##    9     23020.71  0.7583347  13655.27
##   10     23020.72  0.7583344  13655.30
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 5.
plsImp <- varImp(plsTune, scale = FALSE)
plot(plsImp, top = 10, scales = list(y = list(cex = 0.95)))

The plot below shows the performance comparison for these two methods as a function of the number of components.

# Save PLS model tuning information to plsResamples
plsResamples <- plsTune$results
plsResamples$Model <- "PLS"

# Save PCR model tuning information to pcrResamples
pcrResamples <- pcrTune$results
pcrResamples$Model <- "PCR"

# Combine both output for plotting
plsPlotData <- rbind(plsResamples, pcrResamples)

# Use the xyplot() function from the lattice package
xyplot(RMSE ~ ncomp, data = plsPlotData, xlab = "# Components", 
    ylab = "RMSE (Cross-Validation)", auto.key = list(columns = 2), 
    groups = Model, type = c("o", "g"))
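Finally, both tuned models can make predictions through caret's predict() interface. A short sketch, reusing a few training rows as stand-in new data:

# Number of components selected by cross-validation for each model
pcrTune$bestTune
plsTune$bestTune
# Predicted income for a few (stand-in) new observations using the tuned PLS model
predict(plsTune, newdata = xtrain[1:5, ])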