Load R Packages

# install any packages missing from CRAN, then attach them all
p_needed <- c('rpart', 'caret', 'partykit',
              'pROC', 'dplyr', 'ipred',
              'e1071', 'randomForest', 'gbm')

packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
    install.packages(p_to_install)
}

# use library() rather than require(): require() only returns FALSE when a
# package is missing, so a failed load would go unnoticed until later code
# breaks; library() errors immediately. invisible() suppresses the list of
# TRUEs that lapply would otherwise print.
invisible(lapply(p_needed, library, character.only = TRUE))

Regression Tree

# first load data
# customer survey data: demographics, store/online expenditure and
# transaction counts, 10 survey questions (Q1-Q10), and a segment label
dat <- read.csv("http://bit.ly/2P5gTw4")
head(dat)
##   age gender   income house store_exp online_exp store_trans online_trans Q1 Q2
## 1  57 Female 120963.4   Yes  529.1344   303.5125           2            2  4  2
## 2  63 Female 122008.1   Yes  478.0058   109.5297           4            2  4  1
## 3  59   Male 114202.3   Yes  490.8107   279.2496           7            2  5  2
## 4  60   Male 113616.3   Yes  347.8090   141.6698          10            2  5  2
## 5  51   Male 124252.6   Yes  379.6259   112.2372           4            4  4  1
## 6  59   Male 107661.5   Yes  338.3154   195.6870           4            5  4  2
##   Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1  1  2  1  4  1  4  2   4   Price
## 2  1  2  1  4  1  4  1   4   Price
## 3  1  2  1  4  1  4  1   4   Price
## 4  1  3  1  4  1  4  2   4   Price
## 5  1  3  1  4  1  4  2   4   Price
## 6  1  2  1  4  1  4  1   4   Price
# data cleaning: delete wrong observations
# (keep only rows where both expenditure columns are positive)
dat <- subset(dat, store_exp > 0 & online_exp > 0)

# use the 10 survey questions as predictors
# NOTE(review): grep("Q", ...) matches any column name containing "Q"; it
# works here only because Q1-Q10 are the sole such columns -- "^Q" would
# be stricter if new columns are ever added
trainx <- dat[, grep("Q", names(dat))]

# use the sum of store and online expenditure as response variable
# total expenditure = store expenditure + online expenditure
trainy <- dat$store_exp + dat$online_exp
# fix the random seed so the cross-validation folds (and therefore the
# tuning results) are reproducible
set.seed(100)

# search over 10 candidate values of maximum tree depth (method = "rpart2"
# tunes maxdepth) and keep the one with the best cross-validated RMSE
cv_ctrl <- trainControl(method = "cv")
rpartTune <- train(trainx, trainy,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = cv_ctrl)

# RMSE vs. maximum depth across the CV folds
plot(rpartTune)

# from the graph above, RMSE doesn't change much once the maximum tree
# depth is larger than 2, so we set the maximum depth to be 2 and refit
# the model (rpart forwards maxdepth on to rpart.control()):
rpartTree <- rpart(trainy ~ ., data = trainx, maxdepth = 2)

# print the fitted tree: each node shows the split rule, node size n,
# deviance, and fitted value (yval); "*" marks terminal nodes
print(rpartTree)
## n= 999 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 999 15812720000  3479.113  
##   2) Q3< 3.5 799  2373688000  1818.720  
##     4) Q5< 1.5 250     3534392   705.193 *
##     5) Q5>=1.5 549  1919009000  2325.791 *
##   3) Q3>=3.5 200  2436211000 10112.380 *

To visualize the tree, you can convert the rpart object to a party object using partykit and then use the plot() function:

# convert the rpart fit to a party object and draw it with partykit's
# plot method (the intermediate variable is not needed elsewhere)
plot(partykit::as.party(rpartTree))

Decision Tree

# reload the raw data: the regression-tree section filtered out some
# rows, while the classification examples below use all 1000 rows
dat <- read.csv("http://bit.ly/2P5gTw4")
head(dat)
##   age gender   income house store_exp online_exp store_trans online_trans Q1 Q2
## 1  57 Female 120963.4   Yes  529.1344   303.5125           2            2  4  2
## 2  63 Female 122008.1   Yes  478.0058   109.5297           4            2  4  1
## 3  59   Male 114202.3   Yes  490.8107   279.2496           7            2  5  2
## 4  60   Male 113616.3   Yes  347.8090   141.6698          10            2  5  2
## 5  51   Male 124252.6   Yes  379.6259   112.2372           4            4  4  1
## 6  59   Male 107661.5   Yes  338.3154   195.6870           4            5  4  2
##   Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1  1  2  1  4  1  4  2   4   Price
## 2  1  2  1  4  1  4  1   4   Price
## 3  1  2  1  4  1  4  1   4   Price
## 4  1  3  1  4  1  4  2   4   Price
## 5  1  3  1  4  1  4  2   4   Price
## 6  1  2  1  4  1  4  1   4   Price

In the code below, we show two ways to deal with categorical data: use it as it is (approach 1) or convert it to a set of dummy variables (approach 2).

# use the 10 survey questions as predictors
trainx1 <- dat[, grep("Q", names(dat))]

# add a categorical predictor
# use two ways to treat categorical predictor
# trainx1: use approach 1, without encoding
trainx1$segment <- dat$segment
head(trainx1)
##   Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1  4  2  1  2  1  4  1  4  2   4   Price
## 2  4  1  1  2  1  4  1  4  1   4   Price
## 3  5  2  1  2  1  4  1  4  1   4   Price
## 4  5  2  1  3  1  4  1  4  2   4   Price
## 5  4  1  1  3  1  4  1  4  2   4   Price
## 6  4  2  1  2  1  4  1  4  1   4   Price
# trainx2: use approach 2, encode it to a set of dummy variables
dumMod <- dummyVars(~ .,
                    data = trainx1,
                    # Combine the previous variable name and the level name
                    # as the new dummy variable name.
                    # Spell out FALSE: the shorthand F is an ordinary
                    # variable and can be reassigned.
                    levelsOnly = FALSE)
trainx2 <- predict(dumMod, trainx1)
head(trainx2)
##   Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segmentConspicuous segmentPrice segmentQuality
## 1  4  2  1  2  1  4  1  4  2   4                  0            1              0
## 2  4  1  1  2  1  4  1  4  1   4                  0            1              0
## 3  5  2  1  2  1  4  1  4  1   4                  0            1              0
## 4  5  2  1  3  1  4  1  4  2   4                  0            1              0
## 5  4  1  1  3  1  4  1  4  2   4                  0            1              0
## 6  4  2  1  2  1  4  1  4  1   4                  0            1              0
##   segmentStyle
## 1            0
## 2            0
## 3            0
## 4            0
## 5            0
## 6            0
# the response variable is gender
trainy <- dat$gender
head(trainy)
## [1] "Female" "Female" "Male"   "Male"   "Male"   "Male"
## model search using approach 1 for categorical variable
# tune cp over 30 values; twoClassSummary with classProbs = TRUE lets
# caret report ROC/Sens/Spec, and savePredictions keeps the held-out
# fold predictions so we can build ROC curves afterwards
rpartTune1 <- caret::train(trainx1, trainy, method = "rpart",
                       tuneLength = 30,
                       metric = "ROC", 
                       trControl = trainControl(method = "cv",
                                                summaryFunction = twoClassSummary,
                                                classProbs = TRUE,
                                                savePredictions = TRUE))
rpartTune1
## CART 
## 
## 1000 samples
##   11 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 900, 899, 900, 900, 900, 901, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.000000000  0.7039490  0.6696429  0.6788384
##   0.008350085  0.7084551  0.6407792  0.7126263
##   0.016700170  0.6780430  0.5270455  0.8179293
##   0.025050255  0.6764602  0.5145779  0.8270202
##   0.033400340  0.6764602  0.5145779  0.8270202
##   0.041750425  0.6803445  0.5109416  0.8497475
##   0.050100510  0.6803445  0.5109416  0.8497475
##   0.058450595  0.6803445  0.5109416  0.8497475
##   0.066800680  0.6803445  0.5109416  0.8497475
##   0.075150765  0.6803445  0.5109416  0.8497475
##   0.083500850  0.6803445  0.5109416  0.8497475
##   0.091850936  0.6803445  0.5109416  0.8497475
##   0.100201021  0.6803445  0.5109416  0.8497475
##   0.108551106  0.6803445  0.5109416  0.8497475
##   0.116901191  0.6803445  0.5109416  0.8497475
##   0.125251276  0.6803445  0.5109416  0.8497475
##   0.133601361  0.6803445  0.5109416  0.8497475
##   0.141951446  0.6803445  0.5109416  0.8497475
##   0.150301531  0.6803445  0.5109416  0.8497475
##   0.158651616  0.6803445  0.5109416  0.8497475
##   0.167001701  0.6803445  0.5109416  0.8497475
##   0.175351786  0.6803445  0.5109416  0.8497475
##   0.183701871  0.6803445  0.5109416  0.8497475
##   0.192051956  0.6803445  0.5109416  0.8497475
##   0.200402041  0.6803445  0.5109416  0.8497475
##   0.208752126  0.6803445  0.5109416  0.8497475
##   0.217102211  0.6803445  0.5109416  0.8497475
##   0.225452296  0.6551064  0.5537987  0.7564141
##   0.233802381  0.6093236  0.6465260  0.5721212
##   0.242152466  0.5898286  0.6919805  0.4876768
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.008350085.
# build the ROC curve from the held-out fold predictions;
# rev(levels(...)) makes "Male" the control level and "Female" the case
rpartRoc <- pROC::roc(response = rpartTune1$pred$obs,
                predictor = rpartTune1$pred$Female,
                levels = rev(levels(rpartTune1$pred$obs)))
rpartRoc
## 
## Call:
## roc.default(response = rpartTune1$pred$obs, predictor = rpartTune1$pred$Female,     levels = rev(levels(rpartTune1$pred$obs)))
## 
## Data: rpartTune1$pred$Female in 13380 controls (rpartTune1$pred$obs Male) < 16620 cases (rpartTune1$pred$obs Female).
## Area under the curve: 0.6567
## model search using approach 2 for categorical variable
# same tuning setup as approach 1, but on the dummy-encoded predictor
# matrix trainx2 (14 columns instead of 11)
rpartTune2 <- caret::train(trainx2, trainy, method = "rpart",
                       tuneLength = 30,
                       metric = "ROC", 
                       trControl = trainControl(method = "cv",
                                                summaryFunction = twoClassSummary,
                                                classProbs = TRUE,
                                                savePredictions = TRUE))
rpartTune2
## CART 
## 
## 1000 samples
##   14 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 899, 900, 901, 900, 900, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.000000000  0.7017208  0.6603247  0.6793939
##   0.008350085  0.7145228  0.6331818  0.7377778
##   0.016700170  0.6799759  0.5179545  0.8339394
##   0.025050255  0.6784883  0.5179545  0.8293939
##   0.033400340  0.6802652  0.5106818  0.8498485
##   0.041750425  0.6802652  0.5106818  0.8498485
##   0.050100510  0.6802652  0.5106818  0.8498485
##   0.058450595  0.6802652  0.5106818  0.8498485
##   0.066800680  0.6802652  0.5106818  0.8498485
##   0.075150765  0.6802652  0.5106818  0.8498485
##   0.083500850  0.6802652  0.5106818  0.8498485
##   0.091850936  0.6802652  0.5106818  0.8498485
##   0.100201021  0.6802652  0.5106818  0.8498485
##   0.108551106  0.6802652  0.5106818  0.8498485
##   0.116901191  0.6802652  0.5106818  0.8498485
##   0.125251276  0.6802652  0.5106818  0.8498485
##   0.133601361  0.6802652  0.5106818  0.8498485
##   0.141951446  0.6802652  0.5106818  0.8498485
##   0.150301531  0.6802652  0.5106818  0.8498485
##   0.158651616  0.6802652  0.5106818  0.8498485
##   0.167001701  0.6802652  0.5106818  0.8498485
##   0.175351786  0.6802652  0.5106818  0.8498485
##   0.183701871  0.6802652  0.5106818  0.8498485
##   0.192051956  0.6802652  0.5106818  0.8498485
##   0.200402041  0.6802652  0.5106818  0.8498485
##   0.208752126  0.6802652  0.5106818  0.8498485
##   0.217102211  0.6802652  0.5106818  0.8498485
##   0.225452296  0.6560227  0.5488636  0.7631818
##   0.233802381  0.6330069  0.5917208  0.6742929
##   0.242152466  0.5730069  0.7353571  0.4106566
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.008350085.
# ROC curve for the dummy-variable model; take the levels from
# rpartTune2 itself (the original reused rpartTune1's levels -- a
# copy-paste slip that happened to be harmless only because both models
# share the same response factor)
rpartFactorRoc <- pROC::roc(response = rpartTune2$pred$obs,
                      predictor = rpartTune2$pred$Female,
                      levels = rev(levels(rpartTune2$pred$obs))
                      )
rpartFactorRoc
## 
## Call:
## roc.default(response = rpartTune2$pred$obs, predictor = rpartTune2$pred$Female,     levels = rev(levels(rpartTune2$pred$obs)))
## 
## Data: rpartTune2$pred$Female in 13380 controls (rpartTune2$pred$obs Male) < 16620 cases (rpartTune2$pred$obs Female).
## Area under the curve: 0.6603
# compare the ROC for two approaches
# approach 1 (grouped categories) drawn in red with a "+" marker (pch 3)
plot.roc(rpartRoc, 
     type = "s", 
     print.thres = c(.5),
     print.thres.pch = 3,
     print.thres.pattern = "",
     print.thres.cex = 1.2,
     col = "red", legacy.axes = TRUE,
     print.thres.col = "red")

# approach 2 (dummy variables) overlaid in the default black with a
# filled-circle marker (pch 16)
plot.roc(rpartFactorRoc,
     type = "s",
     add = TRUE,
     print.thres = c(.5),
     print.thres.pch = 16, legacy.axes = TRUE,
     print.thres.pattern = "",
     print.thres.cex = 1.2)

# legend must mirror the plotting above: grouped = red / pch 3,
# independent (dummy) = black / pch 16
# (the original legend had the colors and symbols swapped)
legend(.75, .2,
       c("Grouped Categories", "Independent Categories"),
       lwd = c(1, 1),
       col = c("red", "black"),
       pch = c(3, 16))

Bagging Tree Model

# load and prepare data
dat <- read.csv("http://bit.ly/2P5gTw4")

# use the 10 survey questions as predictors
trainx <- dat[, grep("Q", names(dat))]

# add segment as a predictor 
# don't need to encode it to dummy variables
# (tree-based models can split on a factor directly)
trainx$segment <- as.factor(dat$segment)

# use gender as the response variable
trainy <- as.factor(dat$gender)
# using 200 trees to do bagging, no tuning parameters for this method
# fix the seed so the CV folds are reproducible; twoClassSummary with
# classProbs = TRUE reports ROC/Sens/Spec, and savePredictions keeps
# held-out predictions for the ROC comparison at the end
set.seed(100)
bagTune <- caret::train(trainx, trainy, 
                           method = "treebag",
                           nbagg = 200,
                           metric = "ROC",
                           trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE)
                        )
bagTune
## Bagged CART 
## 
## 1000 samples
##   11 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6974011  0.6566234  0.6641414

Random Forest

# tune across a list of numbers of predictors
# (mtry = number of predictors randomly sampled at each split)
mtryValues <- c(1:5)

# train and tune random forest models
# reuses trainx/trainy prepared in the bagging section above
set.seed(100)
rfTune <- train(x = trainx, 
               y = trainy,
               # set the model to be random forest
               method = "rf",
               ntree = 200,
               tuneGrid = data.frame(.mtry = mtryValues),
               importance = TRUE,
               metric = "ROC",
               trControl = trainControl(method = "cv",
                           summaryFunction = twoClassSummary,
                           classProbs = TRUE,
                           savePredictions = TRUE))
rfTune
## Random Forest 
## 
## 1000 samples
##   11 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##   1     0.7043716  0.5413961  0.8054040
##   2     0.7115298  0.6350325  0.7115152
##   3     0.7078361  0.6548377  0.6867172
##   4     0.7089469  0.6584740  0.6776768
##   5     0.7043586  0.6512987  0.6888384
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# fit model with the tuned best option
# read mtry from the tuning result instead of hard-coding it: the CV
# search above selected mtry = 2, but the original code used mtry = 5,
# contradicting its own comment; also seed the fit for reproducibility
set.seed(100)
rfit <- randomForest(trainy ~ ., 
                     trainx, 
                     mtry = rfTune$bestTune$mtry, 
                     ntree = 2000)
# print relative importance of each variable
# NOTE(review): the transcript below was recorded from the original
# mtry = 5 run; exact numbers will differ slightly with the tuned mtry
importance(rfit)
##         MeanDecreaseGini
## Q1             16.792910
## Q2             13.078847
## Q3             11.002967
## Q4             25.252843
## Q5             13.279855
## Q6             16.504205
## Q7              5.927529
## Q8             14.672599
## Q9             14.882742
## Q10            13.727443
## segment        32.979268
# plot the relative importance of each variable
varImpPlot(rfit)

Gradient Boosted Tree

# setup tuning grid: tree depth x number of trees x learning rate,
# with the minimum node size held fixed
gbmGrid <- expand.grid(interaction.depth = c(1, 5, 9),
                       n.trees = c(6:10)*10,
                       shrinkage = c(.01, .1),
                       n.minobsinnode = 10)

set.seed(100)
# caret metric names are case-sensitive: "Accuracy", not "accuracy"
# (the lowercase spelling made caret warn and silently fall back to
# Accuracy -- see the transcript below, which confirms Accuracy was used)
gbmTune <- caret::train(x = trainx, 
                y = trainy,
                method = "gbm",
                tuneGrid = gbmGrid,
                metric = "Accuracy",
                verbose = FALSE,
                trControl = trainControl(method = "cv",
                           classProbs = TRUE,
                           savePredictions = TRUE))
gbmTune
## Stochastic Gradient Boosting 
## 
## 1000 samples
##   11 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 899, 901, 899, 900, 901, 901, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  Accuracy   Kappa    
##   0.01       1                   60      0.6628235  0.3275609
##   0.01       1                   70      0.6628235  0.3275609
##   0.01       1                   80      0.6688841  0.3418728
##   0.01       1                   90      0.6668639  0.3375871
##   0.01       1                  100      0.6688841  0.3418728
##   0.01       5                   60      0.6418522  0.2682534
##   0.01       5                   70      0.6518728  0.2914822
##   0.01       5                   80      0.6469128  0.2848269
##   0.01       5                   90      0.6449027  0.2827059
##   0.01       5                  100      0.6489229  0.2927973
##   0.01       9                   60      0.6539033  0.2971737
##   0.01       9                   70      0.6488928  0.2887013
##   0.01       9                   80      0.6468825  0.2855221
##   0.01       9                   90      0.6589637  0.3116482
##   0.01       9                  100      0.6649443  0.3259928
##   0.10       1                   60      0.6619037  0.3280496
##   0.10       1                   70      0.6639237  0.3335332
##   0.10       1                   80      0.6689540  0.3429249
##   0.10       1                   90      0.6639338  0.3288857
##   0.10       1                  100      0.6719647  0.3440548
##   0.10       5                   60      0.6689247  0.3377632
##   0.10       5                   70      0.6669546  0.3332671
##   0.10       5                   80      0.6689546  0.3375650
##   0.10       5                   90      0.6719647  0.3433815
##   0.10       5                  100      0.6639544  0.3267292
##   0.10       9                   60      0.6618536  0.3231605
##   0.10       9                   70      0.6618940  0.3233166
##   0.10       9                   80      0.6579134  0.3157208
##   0.10       9                   90      0.6648740  0.3289806
##   0.10       9                  100      0.6628938  0.3250712
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 90, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.

Comparing ROC Curves

Now, let us plot ROC curve for different methods in the same graph.

# ROC curves from each model's saved held-out predictions; for all of
# them, rev(levels(...)) makes "Male" the control and "Female" the case
treebagRoc <- pROC::roc(response = bagTune$pred$obs,
                        predictor = bagTune$pred$Female,
                        levels = rev(levels(bagTune$pred$obs)))
treebagRoc
## 
## Call:
## roc.default(response = bagTune$pred$obs, predictor = bagTune$pred$Female,     levels = rev(levels(bagTune$pred$obs)))
## 
## Data: bagTune$pred$Female in 446 controls (bagTune$pred$obs Male) < 554 cases (bagTune$pred$obs Female).
## Area under the curve: 0.6933
rfRoc <- pROC::roc(response = rfTune$pred$obs,
             predictor = rfTune$pred$Female,
             levels = rev(levels(rfTune$pred$obs)))
rfRoc
## 
## Call:
## roc.default(response = rfTune$pred$obs, predictor = rfTune$pred$Female,     levels = rev(levels(rfTune$pred$obs)))
## 
## Data: rfTune$pred$Female in 2230 controls (rfTune$pred$obs Male) < 2770 cases (rfTune$pred$obs Female).
## Area under the curve: 0.7072
gbmRoc <- pROC::roc(response = gbmTune$pred$obs,
              predictor = gbmTune$pred$Female,
              levels = rev(levels(gbmTune$pred$obs)))
gbmRoc
## 
## Call:
## roc.default(response = gbmTune$pred$obs, predictor = gbmTune$pred$Female,     levels = rev(levels(gbmTune$pred$obs)))
## 
## Data: gbmTune$pred$Female in 13380 controls (gbmTune$pred$obs Male) < 16620 cases (gbmTune$pred$obs Female).
## Area under the curve: 0.6963
# overlay all four curves; colors/symbols here match the legend below:
# single tree = black/16, bagged = red/3, RF = green/1, boosted = blue/10
plot.roc(rpartRoc, 
     type = "s", 
     print.thres = c(.5),
     print.thres.pch = 16,
     print.thres.pattern = "",
     print.thres.cex = 1.2,
     col = "black", legacy.axes = TRUE,
     print.thres.col = "black")

plot.roc(treebagRoc, 
     type = "s", 
     add = TRUE, print.thres = c(.5), 
     print.thres.pch = 3, legacy.axes = TRUE, print.thres.pattern = "", 
     print.thres.cex = 1.2,
     col = "red", print.thres.col = "red")

plot.roc(rfRoc, 
     type = "s", 
     add = TRUE, print.thres = c(.5), 
     print.thres.pch = 1, legacy.axes = TRUE, print.thres.pattern = "", 
     print.thres.cex = 1.2,
     col = "green", print.thres.col = "green")

plot.roc(gbmRoc, 
     type = "s", 
     add = TRUE, print.thres = c(.5), 
     print.thres.pch = 10, legacy.axes = TRUE, print.thres.pattern = "", 
     print.thres.cex = 1.2,
     col = "blue", print.thres.col = "blue")

legend("topleft", cex = 0.8,
       c("Single Tree", "Bagged Tree", "Random Forest", "Boosted Tree"),
       lwd = c(1, 1, 1, 1),
       col = c("black", "red", "green", "blue"),
       pch = c(16, 3, 1, 10))

Since the data here doesn't have many variables, we don't see a significant difference among the models. But you can still see that the ensemble methods are better than a single tree. In most real applications, ensemble methods perform much better, so random forests and boosted trees make good baseline models. Before exploring different models, you can quickly run a random forest to gauge performance and then try to improve on it. If the performance you get from the random forest is not much better than guessing, you should consider collecting more data or reframing the problem instead of trying other models, because it usually means the current data is not sufficient to solve the problem.