• Load R Packages
  • Variance-Bias Trade-Off
  • Data Splitting and Resampling
    • Data Splitting
    • Resampling

In this notebook, we illustrate basic model tuning strategies.

Load R Packages

# Install any CRAN packages that are missing locally, then attach them.
# NOTE: 'caret' was listed twice in the original vector; deduplicated here.
p_needed <- c('ggplot2', 'tidyr', 'caret', 'dplyr',
              'lattice', 'proxy')
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
    install.packages(p_to_install)
}

# library() errors immediately if a package cannot be attached,
# whereas require() only returns FALSE and lets the script continue.
invisible(lapply(p_needed, library, character.only = TRUE))

Variance-Bias Trade-Off

In this section, we will show the variance-bias trade-off through a simulated example. We first simulate a dataset and plot the data.

# Simulate noisy observations from a non-linear signal:
# fx = sin(x) + noise + sqrt(x) on a fine grid over [pi, 10*pi].
# NOTE(review): no set.seed() here, so fx differs between runs.
x <- seq(1, 10, 0.01) * pi
e <- rnorm(length(x), mean = 0, sd = 0.2)
fx <- sin(x) + e + sqrt(x)
dat <- data.frame(x, fx)

If we fit a linear regression model to this dataset, it will be a high bias model.

# High-bias fit: a straight line cannot track the sinusoidal pattern.
ggplot(dat, aes(x = x, y = fx)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

If we use a smoothing method to fit the data, it will be a high variance model.

ggplot(dat, aes(x, fx)) + geom_smooth(span = 0.03)

To better illustrate the variance-bias trade-off, we now simulate four subsets of the original dataset and try to fit a smoothing model and a linear model. As the plots show, the smoothing model is more flexible to the training data, while the linear model is more robust to different subsets.

# Draw four random subsamples (100 points each) of the simulated data and
# fit a high-variance smoother (loess, span = 0.03) to each one.  The
# fitted curves vary visibly between subsets.
# set random seed for reproducible subsampling
set.seed(2016)

# subsample 1 (seq_along(x) replaces the fragile 1:length(x) idiom;
# the sampled indices are identical)
idx1 <- sample(seq_along(x), 100)
dat1 <- data.frame(x1 = x[idx1], fx1 = fx[idx1])
p1 <- ggplot(dat1, aes(x1, fx1)) +
  geom_smooth(span = 0.03) +
  geom_point()

# subsample 2
idx2 <- sample(seq_along(x), 100)
dat2 <- data.frame(x2 = x[idx2], fx2 = fx[idx2])
p2 <- ggplot(dat2, aes(x2, fx2)) +
  geom_smooth(span = 0.03) +
  geom_point()

# subsample 3
idx3 <- sample(seq_along(x), 100)
dat3 <- data.frame(x3 = x[idx3], fx3 = fx[idx3])
p3 <- ggplot(dat3, aes(x3, fx3)) +
  geom_smooth(span = 0.03) +
  geom_point()

# subsample 4
idx4 <- sample(seq_along(x), 100)
dat4 <- data.frame(x4 = x[idx4], fx4 = fx[idx4])
p4 <- ggplot(dat4, aes(x4, fx4)) +
  geom_smooth(span = 0.03) +
  geom_point()

# Load the multiplot() helper (arranges several ggplots in a grid).
# NOTE(review): source() executes arbitrary remote code fetched over an
# unauthenticated shortened URL -- consider vendoring the helper locally
# or using gridExtra::grid.arrange() instead.
source('http://bit.ly/2KeEIg9')

# show the four smoothed subsample fits side by side (2 columns)
multiplot(p1, p2, p3, p4, cols = 2)

# Re-fit each of the four subsamples with a simple linear model; the
# fitted lines barely move across subsets (low variance, high bias).
p1 <- ggplot(dat1, aes(x1, fx1)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()

p2 <- ggplot(dat2, aes(x2, fx2)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()

p3 <- ggplot(dat3, aes(x3, fx3)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()

p4 <- ggplot(dat4, aes(x4, fx4)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()

# arrange the four linear fits in a 2-column grid
multiplot(p1, p2, p3, p4, cols = 2)

Data Splitting and Resampling

Data splitting and resampling are fundamental techniques to build sound models for prediction.

Data Splitting

The createDataPartition() function in the caret package returns a balanced (stratified) split based on the assigned variable.

# load data (remote CSV -- requires network access)
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
# set random seed to make sure reproducibility
set.seed(3456)
# stratified split: the 80/20 division preserves the class proportions
# of `segment`; list = FALSE returns a matrix of row indices
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = 0.8,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
##      Resample1
## [1,]         2
## [2,]         3
## [3,]         4
## [4,]         6
## [5,]         7
## [6,]         8
# get training set
datTrain <- sim.dat[trainIndex, ]
# get test set (all rows NOT selected for training)
datTest <- sim.dat[-trainIndex, ]

Let’s check the distribution of the two sets:

# Per-segment counts and proportions of the training set; with a
# stratified split these should match the full data's class proportions.
datTrain %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
            percentage = round(length(segment)/nrow(datTrain), 2))
## # A tibble: 4 × 3
##   segment     count percentage
##   <chr>       <int>      <dbl>
## 1 Conspicuous   160       0.2 
## 2 Price         200       0.25
## 3 Quality       160       0.2 
## 4 Style         280       0.35
datTest %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
            # BUG FIX: divide by the size of the *test* set; the original
            # divided by nrow(datTrain), so the test-set percentages summed
            # to 0.25 instead of 1
            percentage = round(length(segment)/nrow(datTest), 2))
## # A tibble: 4 × 3
##   segment     count percentage
##   <chr>       <int>      <dbl>
## 1 Conspicuous    40       0.2 
## 2 Price          50       0.25
## 3 Quality        40       0.2 
## 4 Style          70       0.35

An alternative way is to split data based on the predictors. The goal is to get a diverse subset from a dataset so that the sample is representative.

# keep only the two numeric predictors used for dissimilarity sampling
testing <- sim.dat[, c("age", "income")]

Randomly select 5 samples as the initial subset (start); the rest will be in samplePool:

set.seed(5)
# pick 5 rows at random as the initial set; seq_len(nrow(...)) is the
# idiomatic (and here draw-for-draw identical) form of 1:dim(...)[1]
startSet <- sample(seq_len(nrow(testing)), 5)
start <- testing[startSet, ]
# everything not in the initial set becomes the sampling pool
samplePool <- testing[-startSet, ]

Use maxDissim() to select another 5 samples from samplePool that are as different as possible from the initial set start:

# rows of the pool most dissimilar to the initial set
# (obj = minDiss maximizes the minimum distance to the set)
selectId <- maxDissim(start, samplePool, obj = minDiss, n = 5)
minDissSet <- samplePool[selectId, ]

# a plain random draw of the same size, for comparison
selectId <- sample(seq_len(nrow(samplePool)), 5)
RandomSet <- samplePool[selectId, ]

# label each subset; a scalar on the right-hand side recycles to the
# full column length, so rep() is unnecessary
start$group <- "Initial Set"
minDissSet$group <- "Maximum Dissimilarity Sampling"
RandomSet$group <- "Random Sampling"

# scatter the three groups on the age/income plane
xyplot(age ~ income,
       data = rbind(start, minDissSet, RandomSet),
       grid = TRUE,
       group = group,
       auto.key = TRUE)

For time series data, random sampling is usually not the best way.

# simulate AR(1) time series samples
# NOTE(review): no set.seed() before arima.sim(), so the simulated series
# (and the plot) differ between runs; the slice indices below are
# deterministic regardless
timedata <- arima.sim(list(order=c(1,0,0), ar=-.9), n=100)
# plot time series
plot(timedata, main=(expression(AR(1)~~~phi==-.9)))

# rolling-origin resampling: each training window holds 36 consecutive
# points and each test horizon the next 12; fixedWindow = TRUE keeps the
# window length constant (TRUE spelled out -- T is a reassignable alias)
timeSlices <- createTimeSlices(seq_along(timedata),
                               initialWindow = 36,
                               horizon = 12,
                               fixedWindow = TRUE)
str(timeSlices, max.level = 1)
## List of 2
##  $ train:List of 53
##  $ test :List of 53
# get result for the 1st training set
trainSlices <- timeSlices[[1]]
# get result for the 1st test set
testSlices <- timeSlices[[2]]
# check the index for the 1st training and test set
trainSlices[[1]]
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36
testSlices[[1]]
##  [1] 37 38 39 40 41 42 43 44 45 46 47 48

Resampling

k-fold cross-validation partitions the original sample into 𝑘 equal-sized subsamples (folds).

# NOTE(review): `class` masks base::class() in this session; a name such
# as `segments` would be safer, but it is kept for compatibility with any
# downstream code
class <- sim.dat$segment
# create k folds; returnTrain = TRUE returns each fold's *training*
# indices (the complement of the held-out fold).  TRUE is spelled out
# because T is a reassignable alias.
set.seed(1)
cv <- createFolds(class, k = 10, returnTrain = TRUE)
str(cv)
## List of 10
##  $ Fold01: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold02: int [1:900] 1 2 3 4 5 7 8 9 10 11 ...
##  $ Fold03: int [1:900] 1 2 3 5 6 7 8 9 11 12 ...
##  $ Fold04: int [1:900] 1 2 3 4 6 7 8 9 10 11 ...
##  $ Fold05: int [1:900] 2 3 4 5 6 7 8 9 10 11 ...
##  $ Fold06: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold07: int [1:900] 1 2 4 5 6 7 8 9 10 11 ...
##  $ Fold08: int [1:900] 1 3 4 5 6 7 8 9 10 11 ...
##  $ Fold09: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold10: int [1:900] 1 2 3 4 5 6 10 12 13 14 ...

The repeated training/test split method repeats the training/test division on the original data multiple times.

# five independent stratified 80/20 splits; each column of the returned
# 800 x 5 matrix holds the training-row indices for one resample
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = .8,
                                  list = FALSE,
                                  times = 5)
dplyr::glimpse(trainIndex)
##  int [1:800, 1:5] 1 2 3 4 6 7 8 9 10 12 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:5] "Resample1" "Resample2" "Resample3" "Resample4" ...