In this notebook, we illustrate basic model tuning strategies.
# install packages from CRAN
p_needed <- c('ggplot2', 'tidyr', 'caret', 'dplyr',
              'lattice', 'proxy')
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
lapply(p_needed, require, character.only = TRUE)
In this section, we illustrate the bias-variance trade-off through a simulated example. We first simulate a dataset and plot the data.
# randomly simulate some non-linear samples
x <- seq(1, 10, 0.01) * pi
e <- rnorm(length(x), mean = 0, sd = 0.2)
fx <- sin(x) + e + sqrt(x)
dat <- data.frame(x, fx)
If we fit a linear regression model to this dataset, we get a high-bias model: the straight line cannot capture the non-linear pattern.
ggplot(dat, aes(x, fx)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
If we use a smoothing method with a very small span to fit the data, we get a high-variance model: the curve chases the noise in the sample.
ggplot(dat, aes(x, fx)) + geom_smooth(span = 0.03)
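As a quick numeric check (an illustrative sketch, not part of the original example), we can fit both models on a random half of the data and compare mean squared error on the held-out half; the span of 0.03 mirrors the plots above.

# sketch: hold-out MSE for the two fits
set.seed(100)
idx <- sample(1:nrow(dat), floor(nrow(dat)/2))
trainDat <- dat[idx, ]
testDat <- dat[-idx, ]
fit_lm <- lm(fx ~ x, data = trainDat)
# surface = "direct" lets loess predict slightly outside the training range
fit_lo <- loess(fx ~ x, data = trainDat, span = 0.03,
                control = loess.control(surface = "direct"))
mse <- function(pred, obs) mean((obs - pred)^2)
c(linear = mse(predict(fit_lm, testDat), testDat$fx),
  loess = mse(predict(fit_lo, testDat), testDat$fx))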
To better illustrate the bias-variance trade-off, we now draw four random subsets of the original dataset and fit both a smoothing model and a linear model to each. As the plots show, the smoothing fit changes substantially from subset to subset because it adapts too closely to the training data, while the linear fit stays much more stable (robust) across subsets.
# set random seed
set.seed(2016)
# draw four random subsets of 100 points each
# sample 1
idx1 <- sample(1:length(x), 100)
dat1 <- data.frame(x1 = x[idx1], fx1 = fx[idx1])
p1 <- ggplot(dat1, aes(x1, fx1)) +
  geom_smooth(span = 0.03) +
  geom_point()
# sample 2
idx2 <- sample(1:length(x), 100)
dat2 <- data.frame(x2 = x[idx2], fx2 = fx[idx2])
p2 <- ggplot(dat2, aes(x2, fx2)) +
  geom_smooth(span = 0.03) +
  geom_point()
# sample 3
idx3 <- sample(1:length(x), 100)
dat3 <- data.frame(x3 = x[idx3], fx3 = fx[idx3])
p3 <- ggplot(dat3, aes(x3, fx3)) +
  geom_smooth(span = 0.03) +
  geom_point()
# sample 4
idx4 <- sample(1:length(x), 100)
dat4 <- data.frame(x4 = x[idx4], fx4 = fx[idx4])
p4 <- ggplot(dat4, aes(x4, fx4)) +
  geom_smooth(span = 0.03) +
  geom_point()
# Load multiplot function
source('http://bit.ly/2KeEIg9')
multiplot(p1, p2, p3, p4, cols = 2)
# fit a linear model to each subset instead
p1 <- ggplot(dat1, aes(x1, fx1)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()
p2 <- ggplot(dat2, aes(x2, fx2)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()
p3 <- ggplot(dat3, aes(x3, fx3)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()
p4 <- ggplot(dat4, aes(x4, fx4)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point()
multiplot(p1, p2, p3, p4, cols = 2)
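The multiplot() helper above is sourced from a short URL. If that link is unavailable, the gridExtra package (an extra dependency, not listed in p_needed) produces the same layout:

# alternative layout helper (assumes gridExtra is installed)
# install.packages("gridExtra")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, ncol = 2)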
Data splitting and resampling are fundamental techniques for building sound predictive models. The createDataPartition() function in the caret package returns a stratified (balanced) split based on the specified variable.
# load data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
# set random seed for reproducibility
set.seed(3456)
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = 0.8,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
## Resample1
## [1,] 2
## [2,] 3
## [3,] 4
## [4,] 6
## [5,] 7
## [6,] 8
# get training set
datTrain <- sim.dat[trainIndex, ]
# get test set
datTest <- sim.dat[-trainIndex, ]
Let’s check the distribution of the two sets:
datTrain %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
                   percentage = round(length(segment)/nrow(datTrain), 2))
## # A tibble: 4 × 3
##   segment     count percentage
##   <chr>       <int>      <dbl>
## 1 Conspicuous   160       0.2
## 2 Price         200       0.25
## 3 Quality       160       0.2
## 4 Style         280       0.35
datTest %>%
  dplyr::group_by(segment) %>%
  dplyr::summarise(count = n(),
                   percentage = round(length(segment)/nrow(datTest), 2))
## # A tibble: 4 × 3
##   segment     count percentage
##   <chr>       <int>      <dbl>
## 1 Conspicuous    40       0.2
## 2 Price          50       0.25
## 3 Quality        40       0.2
## 4 Style          70       0.35
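The test-set proportions closely match the training set, confirming the split is balanced. As a quick side-by-side check (a small sketch, not part of the original output):

# compare segment proportions across the two sets
round(cbind(train = prop.table(table(datTrain$segment)),
            test = prop.table(table(datTest$segment))), 2)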
An alternative approach is to split the data based on the predictors, with the goal of obtaining a subset that is diverse and therefore representative of the whole dataset.
# select variables
testing <- subset(sim.dat, select = c("age", "income"))
Randomly select 5 samples as the initial subset (start); the rest will be in samplePool:
set.seed(5)
# select 5 random samples
startSet <- sample(1:dim(testing)[1], 5)
start <- testing[startSet, ]
# save the rest in data frame 'samplePool'
samplePool <- testing[-startSet, ]
Use maxDissim() to select another 5 samples from samplePool that are as different as possible from the initial set start:
# maximum dissimilarity sampling: pick 5 points maximally dissimilar to 'start'
selectId <- maxDissim(start, samplePool, obj = minDiss, n = 5)
minDissSet <- samplePool[selectId, ]
# for comparison, also pick 5 points at random
selectId <- sample(1:dim(samplePool)[1], 5)
RandomSet <- samplePool[selectId, ]
# label the three sets for plotting
start$group <- rep("Initial Set", nrow(start))
minDissSet$group <- rep("Maximum Dissimilarity Sampling", nrow(minDissSet))
RandomSet$group <- rep("Random Sampling", nrow(RandomSet))
xyplot(age ~ income,
       data = rbind(start, minDissSet, RandomSet),
       grid = TRUE,
       group = group,
       auto.key = TRUE)
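We can also quantify what the plot shows. As an illustrative sketch (using the proxy package loaded earlier; avg_dist is a helper we define here), compare the average Euclidean distance from each candidate set to the initial set; it should be larger for maximum dissimilarity sampling than for random sampling.

# sketch: mean distance from each candidate set to the initial set
avg_dist <- function(a, b) {
  mean(proxy::dist(a[, c("age", "income")], b[, c("age", "income")]))
}
c(maxDissim = avg_dist(minDissSet, start),
  random = avg_dist(RandomSet, start))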
For time series data, random sampling is usually not appropriate because it destroys the temporal order; instead, caret's createTimeSlices() builds rolling training/test windows.
# simulate AR(1) time series samples
timedata <- arima.sim(list(order = c(1, 0, 0), ar = -0.9), n = 100)
# plot the simulated series
plot(timedata, main = expression(AR(1)~~~phi == -0.9))
timeSlices <- createTimeSlices(1:length(timedata),
                               initialWindow = 36,
                               horizon = 12,
                               fixedWindow = TRUE)
str(timeSlices, max.level = 1)
## List of 2
## $ train:List of 53
## $ test :List of 53
# extract the list of 53 training index sets
trainSlices <- timeSlices[[1]]
# extract the list of 53 test index sets
testSlices <- timeSlices[[2]]
# check the index for the 1st training and test set
trainSlices[[1]]
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36
testSlices[[1]]
## [1] 37 38 39 40 41 42 43 44 45 46 47 48
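These rolling windows plug directly into caret's resampling machinery: trainControl() accepts method = "timeslice" with the same initialWindow, horizon, and fixedWindow arguments. A minimal sketch (the linear-trend model is arbitrary, chosen only to show the wiring):

# sketch: rolling-origin evaluation via trainControl(method = "timeslice")
tsDat <- data.frame(t = 1:length(timedata), y = as.numeric(timedata))
tsCtrl <- trainControl(method = "timeslice",
                       initialWindow = 36,
                       horizon = 12,
                       fixedWindow = TRUE)
tsFit <- train(y ~ t, data = tsDat, method = "lm", trControl = tsCtrl)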
k-fold cross-validation partitions the original sample into k equal-sized subsamples (folds); each fold is held out once as a validation set while the model is fit on the remaining k - 1 folds.
class <- sim.dat$segment
# create 10 folds; returnTrain = TRUE returns the training indices
set.seed(1)
cv <- createFolds(class, k = 10, returnTrain = TRUE)
str(cv)
## List of 10
## $ Fold01: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold02: int [1:900] 1 2 3 4 5 7 8 9 10 11 ...
## $ Fold03: int [1:900] 1 2 3 5 6 7 8 9 11 12 ...
## $ Fold04: int [1:900] 1 2 3 4 6 7 8 9 10 11 ...
## $ Fold05: int [1:900] 2 3 4 5 6 7 8 9 10 11 ...
## $ Fold06: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold07: int [1:900] 1 2 4 5 6 7 8 9 10 11 ...
## $ Fold08: int [1:900] 1 3 4 5 6 7 8 9 10 11 ...
## $ Fold09: int [1:900] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold10: int [1:900] 1 2 3 4 5 6 10 12 13 14 ...
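Each element of cv contains the training-row indices for one resample; the held-out fold is its complement. A short sketch of two ways to use them:

# (1) let train() resample over exactly these folds
ctrl <- trainControl(method = "cv", index = cv)
# (2) manual use: training rows vs. the held-out fold
fold1Train <- sim.dat[cv$Fold01, ]
fold1Test <- sim.dat[-cv$Fold01, ]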
The repeated training/test splits method (sometimes called Monte Carlo cross-validation) simply repeats the training/test division on the original data; here we create five 80/20 splits.
trainIndex <- createDataPartition(sim.dat$segment,
                                  p = 0.8,
                                  list = FALSE,
                                  times = 5)
dplyr::glimpse(trainIndex)
## int [1:800, 1:5] 1 2 3 4 6 7 8 9 10 12 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:5] "Resample1" "Resample2" "Resample3" "Resample4" ...
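Each column of trainIndex defines one 80/20 split. Equivalently, caret can generate this resampling scheme internally via leave-group-out cross-validation; a minimal sketch:

# sketch: repeated training/test splits ("leave-group-out" CV) in caret
ctrl <- trainControl(method = "LGOCV", p = 0.8, number = 5)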