14.1 Customer Data for Clothing Company
The simulation is not very straightforward and we will break it into three parts:
- Define data structure: variable names, variable distribution, customer segment names, segment size
- Variable distribution parameters: mean and standard deviation
- Iterate across segments and variables. Simulate data according to specific parameters assigned
Organizing the code this way makes it easy for us to change specific parts of the simulation. For example, if we want to change the distribution of one variable, we can just change the corresponding part of the code.
Here is code to define data structure:
# ---- Data structure for the simulated clothing-company customers ----
# Defines the number of customers, the variable names, the distribution
# family used to simulate each variable, and the customer segments with
# their sizes.

# set a random number seed to make the process repeatable
set.seed(12345)

# define the number of observations (customers)
ncust <- 1000

# create a data frame for simulated data, with one factor id per customer
seg_dat <- data.frame(id = as.factor(seq_len(ncust)))

# assign the variable names
vars <- c(
  "age", "gender", "income", "house", "store_exp",
  "online_exp", "store_trans", "online_trans"
)

# assign the distribution for each variable (same order as `vars`):
# "norm" = normal, "binom" = binomial (0/1), "pois" = Poisson
vartype <- c(
  "norm", "binom", "norm", "binom", "norm", "norm",
  "pois", "pois"
)

# names of the 4 customer segments
group_name <- c("Price", "Conspicuous", "Quality", "Style")

# size of each segment (same order as `group_name`)
group_size <- c(250, 200, 200, 350)

# The definitions above must stay aligned with each other: one distribution
# per variable, one size per segment, and segment sizes adding up to ncust.
stopifnot(
  length(vars) == length(vartype),
  length(group_name) == length(group_size),
  sum(group_size) == ncust
)
The next step is to define the variable distribution parameters. There are 4 segments of customers and 8 variables, and each segment has its own parameter values. Let’s store the parameters in 4×8 matrices (one row per segment, one column per variable):
# matrix for mean
# One row per customer segment and one column per variable; the number of
# columns is taken from `vars`. Filled row by row (byrow = TRUE), so each
# line below is one segment.
mus <- matrix(
  c(
    # Price
    60, 0.5, 120000, 0.9, 500, 200, 5, 2,
    # Conspicuous
    40, 0.7, 200000, 0.9, 5000, 5000, 10, 10,
    # Quality
    36, 0.5, 70000, 0.4, 300, 2000, 2, 15,
    # Style
    25, 0.2, 90000, 0.2, 200, 2000, 2, 20
  ),
  ncol = length(vars), byrow = TRUE
)
# matrix for standard deviation
# Same layout as the mean matrix: one row per customer segment, one column
# per variable (column count taken from `vars`), filled row by row.
# Entries are NA for variables that have no standard deviation defined here.
sds <- matrix(
  c(
    # Price
    3, NA, 8000, NA, 100, 50, NA, NA,
    # Conspicuous
    5, NA, 50000, NA, 1000, 1500, NA, NA,
    # Quality
    7, NA, 10000, NA, 50, 200, NA, NA,
    # Style
    2, NA, 5000, NA, 10, 500, NA, NA
  ),
  ncol = length(vars), byrow = TRUE
)
Now we are ready to simulate data using the parameters defined above:
# simulate non-survey data
# For every segment, draw each variable from the distribution named in
# `vartype`, using the segment's row of `mus` (and `sds` for normal
# variables). The per-segment data frames are stacked into `sim.dat`.
set.seed(2016)

# collect one data frame per segment and bind them once at the end,
# instead of growing sim.dat with rbind() on every iteration
seg_list <- vector("list", length(group_name))

# loop on customer segment (i)
for (i in seq_along(group_name)) {
  # add this line in order to monitor the process
  cat(i, group_name[i], "\n")

  # create an empty data frame to store relevant data
  seg <- data.frame(matrix(NA, nrow = group_size[i], ncol = length(vars)))

  # simulate data within segment i, looping on every variable (j)
  for (j in seq_along(vars)) {
    if (vartype[j] == "norm") {
      # simulate normal distribution
      seg[, j] <- rnorm(group_size[i], mean = mus[i, j], sd = sds[i, j])
    } else if (vartype[j] == "pois") {
      # simulate poisson distribution
      seg[, j] <- rpois(group_size[i], lambda = mus[i, j])
    } else if (vartype[j] == "binom") {
      # simulate binomial distribution (single trial, so values are 0/1)
      seg[, j] <- rbinom(group_size[i], size = 1, prob = mus[i, j])
    } else {
      # if the distribution name is not one of the above, stop
      # and return a message
      stop("Don't have type:", vartype[j])
    }
  }

  seg_list[[i]] <- seg
}

# stack the segments in order; rows of segment 1 come first
sim.dat <- do.call(rbind, seg_list)
Now let’s edit the data we just simulated a little by adding tags to 0/1 binomial variables:
# ---- Label and tidy the simulated columns ----

# assign variable names
names(sim.dat) <- vars

# assign factor levels to segment variable; each segment name is repeated
# as many times as the segment has customers, matching the row order
sim.dat$segment <- factor(rep(group_name, times = group_size))

# recode gender and house variable: 0/1 become labelled factor levels
# (0 -> first label, 1 -> second label)
sim.dat$gender <- factor(sim.dat$gender, labels = c("Female", "Male"))
sim.dat$house <- factor(sim.dat$house, labels = c("No", "Yes"))

# store_trans and online_trans are at least 1
sim.dat$store_trans <- sim.dat$store_trans + 1
sim.dat$online_trans <- sim.dat$online_trans + 1

# age is integer
sim.dat$age <- floor(sim.dat$age)
In the real world, data always includes some noise, such as missing values and wrong imputations. So we will add some noise to the data:
# ---- Inject noise: missing values, wrong imputations and outliers ----

# add missing values: each customer's income is set to NA with
# probability age / 200, so older customers are more likely to be missing
idxm <- as.logical(rbinom(ncust, size = 1, prob = sim.dat$age / 200))
sim.dat$income[idxm] <- NA

# add wrong imputations and outliers on 5 randomly chosen customers
set.seed(123)
idx <- sample(seq_len(ncust), 5)
sim.dat$age[idx[1]] <- 300
sim.dat$store_exp[idx[2]] <- -500
sim.dat$store_exp[idx[3:5]] <- c(50000, 30000, 30000)
So far we have created part of the data. You can check it using summary(sim.dat). Next, we will move on to simulate survey data.
# number of survey questions
nq <- 10

# mean matrix for different segments: one row per segment, one column per
# survey question, filled row by row
mus2 <- matrix(
  c(
    5, 2, 1, 3, 1, 4, 1, 4, 2, 4, # Price
    1, 4, 5, 4, 4, 4, 4, 1, 4, 2, # Conspicuous
    5, 2, 3, 4, 3, 2, 4, 2, 3, 3, # Quality
    3, 1, 1, 2, 4, 1, 5, 3, 4, 2  # Style
  ),
  ncol = nq, byrow = TRUE
)
# ---- Simulate survey answers and attach them to sim.dat ----

# assume the standard deviation is 0.2 for all questions
# (the value is passed to rnorm() as `sd`)
sd2 <- 0.2
set.seed(1000)

# collect one data frame per segment and bind them once at the end,
# instead of growing sim.dat2 with rbind() on every iteration
seg_list2 <- vector("list", length(group_name))

# loop for customer segment (i)
for (i in seq_along(group_name)) {
  # the following line is used for checking the progress
  # cat (i, group_name[i],'\n')
  # create an empty data frame to store data
  seg <- data.frame(matrix(NA, nrow = group_size[i], ncol = nq))

  # simulate data within segment
  for (j in seq_len(nq)) {
    # simulate normal distribution
    res <- rnorm(group_size[i], mean = mus2[i, j], sd = sd2)
    # set upper and lower limit
    res[res > 5] <- 5
    res[res < 1] <- 1
    # convert continuous values to discrete integers
    seg[, j] <- floor(res)
  }

  seg_list2[[i]] <- seg
}

# stack the segments in order; rows of segment 1 come first
sim.dat2 <- do.call(rbind, seg_list2)

# name the question columns Q1, Q2, ... using nq rather than a fixed 10
names(sim.dat2) <- paste0("Q", seq_len(nq))

# append the survey answers to the customer data and (re)assign the
# segment label for every row
sim.dat <- cbind(sim.dat, sim.dat2)
sim.dat$segment <- factor(rep(group_name, times = group_size))