# Install any missing CRAN packages, then load them all.
p_needed <- c("rpart", "caret", "partykit",
              "pROC", "dplyr", "ipred",
              "e1071", "randomForest", "gbm")
# packages already installed on this machine
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
  install.packages(p_to_install)
}
# library() errors loudly if a package fails to load, whereas the
# original require() only returns FALSE and continues silently;
# invisible() suppresses the printed list that lapply() would return
invisible(lapply(p_needed, library, character.only = TRUE))
# Load the customer survey data set from the book's URL.
dat <- read.csv(file = "http://bit.ly/2P5gTw4")
# peek at the first few rows
print(head(dat))
## age gender income house store_exp online_exp store_trans online_trans Q1 Q2
## 1 57 Female 120963.4 Yes 529.1344 303.5125 2 2 4 2
## 2 63 Female 122008.1 Yes 478.0058 109.5297 4 2 4 1
## 3 59 Male 114202.3 Yes 490.8107 279.2496 7 2 5 2
## 4 60 Male 113616.3 Yes 347.8090 141.6698 10 2 5 2
## 5 51 Male 124252.6 Yes 379.6259 112.2372 4 4 4 1
## 6 59 Male 107661.5 Yes 338.3154 195.6870 4 5 4 2
## Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 1 2 1 4 1 4 2 4 Price
## 2 1 2 1 4 1 4 1 4 Price
## 3 1 2 1 4 1 4 1 4 Price
## 4 1 3 1 4 1 4 2 4 Price
## 5 1 3 1 4 1 4 2 4 Price
## 6 1 2 1 4 1 4 1 4 Price
# Data cleaning: drop wrongly-recorded rows, i.e. keep only observations
# with strictly positive store and online expenditure.
dat <- subset(dat, store_exp > 0 & online_exp > 0)
# Predictors: the 10 survey questions (columns whose names contain "Q").
trainx <- dat[, grepl("Q", names(dat))]
# Response: total expenditure = store expenditure + online expenditure.
trainy <- with(dat, store_exp + online_exp)
# Fix the random seed so the results below are reproducible.
set.seed(100)
# Tune a regression tree with caret: method "rpart2" tunes the maximum
# tree depth, evaluated by RMSE on cross-validation folds.
rpartTune <- train(x = trainx, y = trainy,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))
plot(rpartTune)
# The plot shows RMSE barely changes once the maximum depth exceeds 2,
# so refit a depth-2 tree directly with rpart:
rpartTree <- rpart(trainy ~ ., data = trainx, maxdepth = 2)
print(rpartTree)
## n= 999
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 999 15812720000 3479.113
## 2) Q3< 3.5 799 2373688000 1818.720
## 4) Q5< 1.5 250 3534392 705.193 *
## 5) Q5>=1.5 549 1919009000 2325.791 *
## 3) Q3>=3.5 200 2436211000 10112.380 *
# To visualize the tree, convert the rpart object to a party object
# using partykit, then use the plot() function:
# Convert the rpart fit to a partykit "party" object, whose plot()
# method draws a much nicer tree diagram.
rpartTree2 <- partykit::as.party(rpartTree)
plot(rpartTree2)
# Reload the raw data for the classification examples.
dat <- read.csv(file = "http://bit.ly/2P5gTw4")
# inspect the first few rows
print(head(dat))
## age gender income house store_exp online_exp store_trans online_trans Q1 Q2
## 1 57 Female 120963.4 Yes 529.1344 303.5125 2 2 4 2
## 2 63 Female 122008.1 Yes 478.0058 109.5297 4 2 4 1
## 3 59 Male 114202.3 Yes 490.8107 279.2496 7 2 5 2
## 4 60 Male 113616.3 Yes 347.8090 141.6698 10 2 5 2
## 5 51 Male 124252.6 Yes 379.6259 112.2372 4 4 4 1
## 6 59 Male 107661.5 Yes 338.3154 195.6870 4 5 4 2
## Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 1 2 1 4 1 4 2 4 Price
## 2 1 2 1 4 1 4 1 4 Price
## 3 1 2 1 4 1 4 1 4 Price
## 4 1 3 1 4 1 4 2 4 Price
## 5 1 3 1 4 1 4 2 4 Price
## 6 1 2 1 4 1 4 1 4 Price
# In the code below, we show two ways to deal with categorical data:
# use it as it is (approach 1), or convert it to dummy variables
# (approach 2).
# Approach 1: keep the categorical predictor as a single (unencoded)
# column. Start from the 10 survey questions as predictors.
trainx1 <- dat[, grepl("Q", names(dat))]
# append the segment column as-is
trainx1$segment <- dat$segment
head(trainx1)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 4 2 1 2 1 4 1 4 2 4 Price
## 2 4 1 1 2 1 4 1 4 1 4 Price
## 3 5 2 1 2 1 4 1 4 1 4 Price
## 4 5 2 1 3 1 4 1 4 2 4 Price
## 5 4 1 1 3 1 4 1 4 2 4 Price
## 6 4 2 1 2 1 4 1 4 1 4 Price
# Approach 2: expand the categorical predictor into a set of dummy
# variables. levelsOnly = FALSE keeps the original variable name as a
# prefix of each dummy name (e.g. "segmentPrice" rather than "Price").
dumMod <- dummyVars(~ .,
                    data = trainx1,
                    # spell out FALSE — never use the reassignable `F`
                    levelsOnly = FALSE)
trainx2 <- predict(dumMod, trainx1)
head(trainx2)
## Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segmentConspicuous segmentPrice segmentQuality
## 1 4 2 1 2 1 4 1 4 2 4 0 1 0
## 2 4 1 1 2 1 4 1 4 1 4 0 1 0
## 3 5 2 1 2 1 4 1 4 1 4 0 1 0
## 4 5 2 1 3 1 4 1 4 2 4 0 1 0
## 5 4 1 1 3 1 4 1 4 2 4 0 1 0
## 6 4 2 1 2 1 4 1 4 1 4 0 1 0
## segmentStyle
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# The response variable for the classification task is gender.
# NOTE(review): gender is read in as a character vector here (see the
# head() output below); caret is expected to coerce it to a factor
# during training -- confirm this is intended.
trainy <- dat[["gender"]]
head(trainy)
## [1] "Female" "Female" "Male" "Male" "Male" "Male"
## Model search using approach 1 (grouped factor) for the categorical
## variable: tune cp over 30 values, selecting by cross-validated ROC.
rpartTune1 <- caret::train(
  trainx1, trainy,
  method = "rpart",
  tuneLength = 30,
  metric = "ROC",
  # twoClassSummary + classProbs are required for the ROC metric;
  # savePredictions keeps the hold-out predictions for pROC below
  trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
)
rpartTune1
## CART
##
## 1000 samples
## 11 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 900, 899, 900, 900, 900, 901, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.000000000 0.7039490 0.6696429 0.6788384
## 0.008350085 0.7084551 0.6407792 0.7126263
## 0.016700170 0.6780430 0.5270455 0.8179293
## 0.025050255 0.6764602 0.5145779 0.8270202
## 0.033400340 0.6764602 0.5145779 0.8270202
## 0.041750425 0.6803445 0.5109416 0.8497475
## 0.050100510 0.6803445 0.5109416 0.8497475
## 0.058450595 0.6803445 0.5109416 0.8497475
## 0.066800680 0.6803445 0.5109416 0.8497475
## 0.075150765 0.6803445 0.5109416 0.8497475
## 0.083500850 0.6803445 0.5109416 0.8497475
## 0.091850936 0.6803445 0.5109416 0.8497475
## 0.100201021 0.6803445 0.5109416 0.8497475
## 0.108551106 0.6803445 0.5109416 0.8497475
## 0.116901191 0.6803445 0.5109416 0.8497475
## 0.125251276 0.6803445 0.5109416 0.8497475
## 0.133601361 0.6803445 0.5109416 0.8497475
## 0.141951446 0.6803445 0.5109416 0.8497475
## 0.150301531 0.6803445 0.5109416 0.8497475
## 0.158651616 0.6803445 0.5109416 0.8497475
## 0.167001701 0.6803445 0.5109416 0.8497475
## 0.175351786 0.6803445 0.5109416 0.8497475
## 0.183701871 0.6803445 0.5109416 0.8497475
## 0.192051956 0.6803445 0.5109416 0.8497475
## 0.200402041 0.6803445 0.5109416 0.8497475
## 0.208752126 0.6803445 0.5109416 0.8497475
## 0.217102211 0.6803445 0.5109416 0.8497475
## 0.225452296 0.6551064 0.5537987 0.7564141
## 0.233802381 0.6093236 0.6465260 0.5721212
## 0.242152466 0.5898286 0.6919805 0.4876768
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.008350085.
# ROC curve for approach 1, built from the saved cross-validation
# predictions; reversing the levels makes "Male" the control class and
# "Female" the case class.
rpartRoc <- pROC::roc(
  response = rpartTune1$pred$obs,
  predictor = rpartTune1$pred$Female,
  levels = rev(levels(rpartTune1$pred$obs))
)
rpartRoc
##
## Call:
## roc.default(response = rpartTune1$pred$obs, predictor = rpartTune1$pred$Female, levels = rev(levels(rpartTune1$pred$obs)))
##
## Data: rpartTune1$pred$Female in 13380 controls (rpartTune1$pred$obs Male) < 16620 cases (rpartTune1$pred$obs Female).
## Area under the curve: 0.6567
## Model search using approach 2 (dummy variables) for the categorical
## variable; same tuning setup as approach 1 so the two are comparable.
rpartTune2 <- caret::train(
  trainx2, trainy,
  method = "rpart",
  tuneLength = 30,
  metric = "ROC",
  trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
)
rpartTune2
## CART
##
## 1000 samples
## 14 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 899, 900, 901, 900, 900, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.000000000 0.7017208 0.6603247 0.6793939
## 0.008350085 0.7145228 0.6331818 0.7377778
## 0.016700170 0.6799759 0.5179545 0.8339394
## 0.025050255 0.6784883 0.5179545 0.8293939
## 0.033400340 0.6802652 0.5106818 0.8498485
## 0.041750425 0.6802652 0.5106818 0.8498485
## 0.050100510 0.6802652 0.5106818 0.8498485
## 0.058450595 0.6802652 0.5106818 0.8498485
## 0.066800680 0.6802652 0.5106818 0.8498485
## 0.075150765 0.6802652 0.5106818 0.8498485
## 0.083500850 0.6802652 0.5106818 0.8498485
## 0.091850936 0.6802652 0.5106818 0.8498485
## 0.100201021 0.6802652 0.5106818 0.8498485
## 0.108551106 0.6802652 0.5106818 0.8498485
## 0.116901191 0.6802652 0.5106818 0.8498485
## 0.125251276 0.6802652 0.5106818 0.8498485
## 0.133601361 0.6802652 0.5106818 0.8498485
## 0.141951446 0.6802652 0.5106818 0.8498485
## 0.150301531 0.6802652 0.5106818 0.8498485
## 0.158651616 0.6802652 0.5106818 0.8498485
## 0.167001701 0.6802652 0.5106818 0.8498485
## 0.175351786 0.6802652 0.5106818 0.8498485
## 0.183701871 0.6802652 0.5106818 0.8498485
## 0.192051956 0.6802652 0.5106818 0.8498485
## 0.200402041 0.6802652 0.5106818 0.8498485
## 0.208752126 0.6802652 0.5106818 0.8498485
## 0.217102211 0.6802652 0.5106818 0.8498485
## 0.225452296 0.6560227 0.5488636 0.7631818
## 0.233802381 0.6330069 0.5917208 0.6742929
## 0.242152466 0.5730069 0.7353571 0.4106566
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.008350085.
# ROC curve for approach 2, from rpartTune2's saved CV predictions.
# BUG FIX: the original took the levels from rpartTune1's predictions
# (a copy-paste slip). The two level sets happen to be identical here,
# but the levels must come from the same object as the responses.
rpartFactorRoc <- pROC::roc(
  response = rpartTune2$pred$obs,
  predictor = rpartTune2$pred$Female,
  levels = rev(levels(rpartTune2$pred$obs))
)
rpartFactorRoc
##
## Call:
## roc.default(response = rpartTune2$pred$obs, predictor = rpartTune2$pred$Female, levels = rev(levels(rpartTune1$pred$obs)))
##
## Data: rpartTune2$pred$Female in 13380 controls (rpartTune2$pred$obs Male) < 16620 cases (rpartTune2$pred$obs Female).
## Area under the curve: 0.6603
# Compare the ROC curves of the two encoding approaches.
# Approach 1 (grouped categories) is drawn in red with a "+" marker
# (pch 3); approach 2 (independent dummy columns) uses the default
# black with a filled dot (pch 16).
plot.roc(rpartRoc,
         type = "s",
         print.thres = c(.5),
         print.thres.pch = 3,
         print.thres.pattern = "",
         print.thres.cex = 1.2,
         col = "red", legacy.axes = TRUE,
         print.thres.col = "red")
plot.roc(rpartFactorRoc,
         type = "s",
         add = TRUE,
         print.thres = c(.5),
         print.thres.pch = 16, legacy.axes = TRUE,
         print.thres.pattern = "",
         print.thres.cex = 1.2)
# BUG FIX: the original legend swapped the keys -- it labelled
# "Grouped Categories" as black/pch 16 and "Independent Categories" as
# red/pch 3, which is the opposite of how the curves are drawn above.
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("red", "black"),
       pch = c(3, 16))
# Reload and prepare the data for the ensemble-method comparison.
dat <- read.csv(file = "http://bit.ly/2P5gTw4")
# predictors: the 10 survey questions
trainx <- dat[, grepl("Q", names(dat))]
# plus segment as a plain factor -- tree methods can split on a factor
# directly, so no dummy encoding is needed
trainx$segment <- as.factor(dat$segment)
# response: gender, as a factor for classification
trainy <- as.factor(dat$gender)
# Bagging with 200 bootstrapped trees; "treebag" has no tuning
# parameters, so only the resampled performance is estimated.
set.seed(100)
bagTune <- caret::train(
  trainx, trainy,
  method = "treebag",
  nbagg = 200,
  metric = "ROC",
  trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
)
bagTune
## Bagged CART
##
## 1000 samples
## 11 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ...
## Resampling results:
##
## ROC Sens Spec
## 0.6974011 0.6566234 0.6641414
# Tune a random forest over mtry, the number of predictors randomly
# sampled at each split.
mtryValues <- 1:5
set.seed(100)
rfTune <- train(
  x = trainx,
  y = trainy,
  method = "rf",         # random forest
  ntree = 200,
  tuneGrid = data.frame(.mtry = mtryValues),
  importance = TRUE,     # record variable importance
  metric = "ROC",
  trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
)
rfTune
## Random Forest
##
## 1000 samples
## 11 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 1 0.7043716 0.5413961 0.8054040
## 2 0.7115298 0.6350325 0.7115152
## 3 0.7078361 0.6548377 0.6867172
## 4 0.7089469 0.6584740 0.6776768
## 5 0.7043586 0.6512987 0.6888384
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Refit a random forest with a fixed mtry and a larger number of trees.
# NOTE(review): the cross-validation above selected mtry = 2 as the best
# value, yet this fit uses mtry = 5. Keeping mtry = 5 so the importance
# output below still matches, but confirm which value is intended.
rfit <- randomForest(trainy ~ .,
                     data = trainx,
                     mtry = 5,
                     ntree = 2000)
# print the relative importance of each variable
importance(rfit)
## MeanDecreaseGini
## Q1 16.792910
## Q2 13.078847
## Q3 11.002967
## Q4 25.252843
## Q5 13.279855
## Q6 16.504205
## Q7 5.927529
## Q8 14.672599
## Q9 14.882742
## Q10 13.727443
## segment 32.979268
# Dot chart of each variable's importance in the fitted forest.
varImpPlot(rfit)
# Set up the tuning grid for stochastic gradient boosting: tree depth,
# number of trees, and learning rate, with a fixed minimum node size.
gbmGrid <- expand.grid(interaction.depth = c(1, 5, 9),
                       n.trees = (6:10) * 10,
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)
set.seed(100)
gbmTune <- caret::train(x = trainx,
                        y = trainy,
                        method = "gbm",
                        tuneGrid = gbmGrid,
                        # BUG FIX: caret metric names are case-sensitive;
                        # the default summary reports "Accuracy" (see the
                        # output below), so lowercase "accuracy" does not
                        # match any computed metric
                        metric = "Accuracy",
                        verbose = FALSE,
                        trControl = trainControl(method = "cv",
                                                 classProbs = TRUE,
                                                 savePredictions = TRUE))
gbmTune
## Stochastic Gradient Boosting
##
## 1000 samples
## 11 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees Accuracy Kappa
## 0.01 1 60 0.6628235 0.3275609
## 0.01 1 70 0.6628235 0.3275609
## 0.01 1 80 0.6688841 0.3418728
## 0.01 1 90 0.6668639 0.3375871
## 0.01 1 100 0.6688841 0.3418728
## 0.01 5 60 0.6418522 0.2682534
## 0.01 5 70 0.6518728 0.2914822
## 0.01 5 80 0.6469128 0.2848269
## 0.01 5 90 0.6449027 0.2827059
## 0.01 5 100 0.6489229 0.2927973
## 0.01 9 60 0.6539033 0.2971737
## 0.01 9 70 0.6488928 0.2887013
## 0.01 9 80 0.6468825 0.2855221
## 0.01 9 90 0.6589637 0.3116482
## 0.01 9 100 0.6649443 0.3259928
## 0.10 1 60 0.6619037 0.3280496
## 0.10 1 70 0.6639237 0.3335332
## 0.10 1 80 0.6689540 0.3429249
## 0.10 1 90 0.6639338 0.3288857
## 0.10 1 100 0.6719647 0.3440548
## 0.10 5 60 0.6689247 0.3377632
## 0.10 5 70 0.6669546 0.3332671
## 0.10 5 80 0.6689546 0.3375650
## 0.10 5 90 0.6719647 0.3433815
## 0.10 5 100 0.6639544 0.3267292
## 0.10 9 60 0.6618536 0.3231605
## 0.10 9 70 0.6618940 0.3233166
## 0.10 9 80 0.6579134 0.3157208
## 0.10 9 90 0.6648740 0.3289806
## 0.10 9 100 0.6628938 0.3250712
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 90, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
# Now let us plot the ROC curves for the different methods on the same
# graph.
# ROC curve for the bagged trees, from the saved CV predictions;
# "Female" is the case class (levels reversed).
treebagRoc <- pROC::roc(
  response = bagTune$pred$obs,
  predictor = bagTune$pred$Female,
  levels = rev(levels(bagTune$pred$obs))
)
treebagRoc
##
## Call:
## roc.default(response = bagTune$pred$obs, predictor = bagTune$pred$Female, levels = rev(levels(bagTune$pred$obs)))
##
## Data: bagTune$pred$Female in 446 controls (bagTune$pred$obs Male) < 554 cases (bagTune$pred$obs Female).
## Area under the curve: 0.6933
# ROC curve for the random forest, from the saved CV predictions.
rfRoc <- pROC::roc(
  response = rfTune$pred$obs,
  predictor = rfTune$pred$Female,
  levels = rev(levels(rfTune$pred$obs))
)
rfRoc
##
## Call:
## roc.default(response = rfTune$pred$obs, predictor = rfTune$pred$Female, levels = rev(levels(rfTune$pred$obs)))
##
## Data: rfTune$pred$Female in 2230 controls (rfTune$pred$obs Male) < 2770 cases (rfTune$pred$obs Female).
## Area under the curve: 0.7072
# ROC curve for the boosted trees, from the saved CV predictions.
gbmRoc <- pROC::roc(
  response = gbmTune$pred$obs,
  predictor = gbmTune$pred$Female,
  levels = rev(levels(gbmTune$pred$obs))
)
gbmRoc
##
## Call:
## roc.default(response = gbmTune$pred$obs, predictor = gbmTune$pred$Female, levels = rev(levels(gbmTune$pred$obs)))
##
## Data: gbmTune$pred$Female in 13380 controls (gbmTune$pred$obs Male) < 16620 cases (gbmTune$pred$obs Female).
## Area under the curve: 0.6963
# Overlay the ROC curves of all four models; the marker on each curve
# shows the point at the 0.5 probability threshold.
# single tree in black
plot.roc(rpartRoc,
         col = "black", type = "s", legacy.axes = TRUE,
         print.thres = c(.5), print.thres.col = "black",
         print.thres.pch = 16, print.thres.pattern = "",
         print.thres.cex = 1.2)
# bagged trees in red
plot.roc(treebagRoc, add = TRUE,
         col = "red", type = "s", legacy.axes = TRUE,
         print.thres = c(.5), print.thres.col = "red",
         print.thres.pch = 3, print.thres.pattern = "",
         print.thres.cex = 1.2)
# random forest in green
plot.roc(rfRoc, add = TRUE,
         col = "green", type = "s", legacy.axes = TRUE,
         print.thres = c(.5), print.thres.col = "green",
         print.thres.pch = 1, print.thres.pattern = "",
         print.thres.cex = 1.2)
# boosted trees in blue
plot.roc(gbmRoc, add = TRUE,
         col = "blue", type = "s", legacy.axes = TRUE,
         print.thres = c(.5), print.thres.col = "blue",
         print.thres.pch = 10, print.thres.pattern = "",
         print.thres.cex = 1.2)
# legend keyed by the same colors and point characters as the curves
legend("topleft", cex = 0.8,
       c("Single Tree", "Bagged Tree", "Random Forest", "Boosted Tree"),
       lwd = c(1, 1, 1, 1),
       col = c("black", "red", "green", "blue"),
       pch = c(16, 3, 1, 10))
# Since the data here doesn't have many variables, we don't see a
# significant difference among the models, but you can still see that the
# ensemble methods are better than a single tree. In most real
# applications, ensemble methods perform much better. Random forest and
# boosted trees can serve as a baseline model: before exploring different
# models, you can quickly run a random forest to see its performance and
# then try to improve on it. If the performance you get from the random
# forest is not much better than guessing, you should consider collecting
# more data or reframing the problem in a different way instead of trying
# other models, because that usually means the current data is not enough
# to solve the problem.