## 14.1 Customer Data for Clothing Company

The simulation is not very straightforward and we will break it into three parts:

1. Define data structure: variable names, variable distribution, customer segment names, segment size
2. Variable distribution parameters: mean and standard deviation
3. Iterate across segments and variables. Simulate data according to specific parameters assigned

By organizing the code this way, it is easy for us to change specific parts of the simulation. For example, if we want to change the distribution of one variable, we can just change the corresponding part of the code.

Here is code to define data structure:

# Fix the random number seed so that
# the whole process is repeatable
set.seed(12345)

# number of simulated customers (observations)
ncust <- 1000

# data frame holding one id per customer, stored as a factor
seg_dat <- data.frame(id = as.factor(seq_len(ncust)))

# names of the simulated variables, in column order
vars <- c(
  "age", "gender", "income", "house",
  "store_exp", "online_exp", "store_trans", "online_trans"
)

# distribution used for each variable (same order as `vars`):
# "norm" = normal, "binom" = binomial, "pois" = Poisson
vartype <- c(
  "norm", "binom", "norm", "binom",
  "norm", "norm", "pois", "pois"
)

# names of the 4 customer segments
group_name <- c("Price", "Conspicuous", "Quality", "Style")

# number of customers in each segment (same order as `group_name`)
group_size <- c(250, 200, 200, 350)

The next step is to define variable distribution parameters. There are 4 segments of customers and 8 variables, and each segment has its own set of parameters. Let’s store the parameters in 4×8 matrices, with one row per segment and one column per variable:

# Parameter matrices: one row per segment, one column per variable.
# Columns follow the order of `vars`:
#   age, gender, income, house, store_exp, online_exp,
#   store_trans, online_trans

# matrix for the first parameter of each distribution: the mean for
# normal variables, the success probability for binomial variables
# and the rate (lambda) for Poisson variables
mus <- matrix(
  unlist(
    list(
      Price       = c(60, 0.5, 120000, 0.9, 500, 200, 5, 2),
      Conspicuous = c(40, 0.7, 200000, 0.9, 5000, 5000, 10, 10),
      Quality     = c(36, 0.5, 70000, 0.4, 300, 2000, 2, 15),
      Style       = c(25, 0.2, 90000, 0.2, 200, 2000, 2, 20)
    ),
    use.names = FALSE
  ),
  ncol = length(vars), byrow = TRUE
)

# matrix for the standard deviation (it is passed to rnorm() as `sd`);
# NA for the binomial and Poisson variables, which do not use it
sds <- matrix(
  unlist(
    list(
      Price       = c(3, NA, 8000, NA, 100, 50, NA, NA),
      Conspicuous = c(5, NA, 50000, NA, 1000, 1500, NA, NA),
      Quality     = c(7, NA, 10000, NA, 50, 200, NA, NA),
      Style       = c(2, NA, 5000, NA, 10, 500, NA, NA)
    ),
    use.names = FALSE
  ),
  ncol = length(vars), byrow = TRUE
)

Now we are ready to simulate data using the parameters defined above:

# simulate non-survey data
# Every variable of every segment is drawn from the distribution named in
# `vartype`, using the parameters stored in `mus` and `sds`.
set.seed(2016)

# Fail fast: reject an unsupported distribution name before any random
# number is drawn, instead of stopping halfway through the simulation.
unsupported <- setdiff(vartype[seq_along(vars)], c("norm", "pois", "binom"))
if (length(unsupported) > 0) {
  stop("Don't have type: ", paste(unsupported, collapse = ", "),
       call. = FALSE)
}

# one data frame per segment; they are bound together once after the
# loop rather than growing `sim.dat` with rbind() on every iteration
seg_list <- vector("list", length(group_name))

# loop on customer segment (i)
for (i in seq_along(group_name)) {
  # print the current segment in order to monitor the process
  cat(i, group_name[i], "\n")

  # create an empty data frame to store the data of segment i
  seg <- data.frame(matrix(NA, nrow = group_size[i],
                           ncol = length(vars)))

  # loop on every variable (j) within segment i; switch() evaluates only
  # the matching branch, so exactly one random draw happens per variable
  for (j in seq_along(vars)) {
    seg[, j] <- switch(vartype[j],
      # simulate normal distribution
      norm = rnorm(group_size[i], mean = mus[i, j], sd = sds[i, j]),
      # simulate poisson distribution
      pois = rpois(group_size[i], lambda = mus[i, j]),
      # simulate binomial distribution (0/1 outcome)
      binom = rbinom(group_size[i], size = 1, prob = mus[i, j])
    )
  }
  seg_list[[i]] <- seg
}

# stack the segments in the order of `group_name`
sim.dat <- do.call(rbind, seg_list)

Now let’s edit the data we just simulated a little by adding tags to 0/1 binomial variables:

# assign variable names
names(sim.dat) <- vars
# assign factor levels to segment variable
sim.dat$segment <- factor(rep(group_name, times = group_size))
# recode gender and house variable (0/1 become labelled factor levels)
sim.dat$gender <- factor(sim.dat$gender, labels = c("Female", "Male"))
sim.dat$house <- factor(sim.dat$house, labels = c("No", "Yes"))
# store_trans and online_trans are at least 1
sim.dat$store_trans <- sim.dat$store_trans + 1
sim.dat$online_trans <- sim.dat$online_trans + 1
# age is integer
sim.dat$age <- floor(sim.dat$age)

In the real world, the data always includes some noise such as missing values and wrong imputations. So we will add some noise to the data:

# add missing values: income is set to missing with probability age / 200,
# so older customers are more likely to have a missing income
idxm <- as.logical(rbinom(ncust, size = 1, prob = sim.dat$age/200))
sim.dat$income[idxm] <- NA
# add wrong imputations and outliers to 5 randomly chosen customers
set.seed(123)
idx <- sample(1:ncust, 5)
sim.dat$age[idx] <- 300
sim.dat$store_exp[idx] <- -500
# the last three of the five get very large values instead of -500
sim.dat$store_exp[idx[3:5]] <- c(50000, 30000, 30000)

So far we have created part of the data. You can check it using summary(sim.dat). Next, we will move on to simulate survey data.

# number of survey questions
nq <- 10

# mean response per segment (rows) and survey question (columns)
mus2 <- matrix(
  unlist(
    list(
      Price       = c(5, 2, 1, 3, 1, 4, 1, 4, 2, 4),
      Conspicuous = c(1, 4, 5, 4, 4, 4, 4, 1, 4, 2),
      Quality     = c(5, 2, 3, 4, 3, 2, 4, 2, 3, 3),
      Style       = c(3, 1, 1, 2, 4, 1, 5, 3, 4, 2)
    ),
    use.names = FALSE
  ),
  ncol = nq, byrow = TRUE
)

# assume the standard deviation is 0.2 for every segment and question
sd2 <- 0.2
# simulate survey data: one integer answer between 1 and 5 per customer
# and question, drawn around the segment mean in `mus2`
set.seed(1000)

# one data frame per segment; they are bound together once after the
# loop rather than growing `sim.dat2` with rbind() on every iteration
survey_list <- vector("list", length(group_name))

# loop for customer segment (i)
for (i in seq_along(group_name)) {
  # uncomment the following line to check the progress
  # cat(i, group_name[i], "\n")

  # create an empty data frame to store data
  seg <- data.frame(matrix(NA, nrow = group_size[i],
                           ncol = nq))

  # simulate data within segment; seq_len() keeps the loop empty
  # when nq is 0, whereas 1:nq would iterate over c(1, 0)
  for (j in seq_len(nq)) {
    # simulate normal distribution
    res <- rnorm(group_size[i], mean = mus2[i, j], sd = sd2)
    # set upper and lower limit: clamp the draws to [1, 5]
    res <- pmin(pmax(res, 1), 5)
    # convert continuous values to discrete integers (rounds down)
    seg[, j] <- floor(res)
  }
  survey_list[[i]] <- seg
}

# stack the segments in the order of `group_name`
sim.dat2 <- do.call(rbind, survey_list)

# name the survey columns Q1, Q2, ... (one per column of sim.dat2)
names(sim.dat2) <- paste0("Q", seq_len(ncol(sim.dat2)))
# append the survey answers to the customer data
sim.dat <- cbind(sim.dat, sim.dat2)
# (re)assign the segment label of every customer
sim.dat$segment <- factor(rep(group_name, times = group_size))