14.1 Customer Data for Clothing Company

The simulation is not entirely straightforward, so we will break it into three parts:

  1. Define data structure: variable names, variable distribution, customer segment names, segment size
  2. Variable distribution parameters: mean and standard deviation
  3. Iterate across segments and variables. Simulate data according to specific parameters assigned

Organizing the code this way makes it easy to change specific parts of the simulation. For example, if we want to change the distribution of one variable, we only need to change the corresponding part of the code.

Here is the code to define the data structure:

# Fix the random number seed so that the process is repeatable
set.seed(12345)
# number of simulated customers (observations)
ncust <- 1000
# data frame with one row per customer, identified by a factor id
seg_dat <- data.frame(id = as.factor(seq_len(ncust)))
# names of the variables to simulate
vars <- c(
  "age", "gender", "income", "house",
  "store_exp", "online_exp", "store_trans", "online_trans"
)
# distribution used for each variable, in the same order as `vars`
vartype <- c(
  "norm", "binom", "norm", "binom",
  "norm", "norm", "pois", "pois"
)
# names of the 4 customer segments
group_name <- c("Price", "Conspicuous", "Quality", "Style")
# number of customers in each segment, in the same order as `group_name`
group_size <- c(250, 200, 200, 350)
# Fail early if the definitions above are edited inconsistently: every
# variable needs a distribution, every segment needs a size, and the segment
# sizes must add up to ncust because later steps use ncust to index the
# simulated rows.
stopifnot(
  length(vartype) == length(vars),
  length(group_size) == length(group_name),
  sum(group_size) == ncust
)

The next step is to define the variable distribution parameters. There are 4 customer segments and 8 variables, and each segment has its own parameters for each variable. Let’s store the means and the standard deviations in two 4×8 matrices, with one row per segment and one column per variable:

# Means: one row per segment, one column per variable (same order as `vars`).
# The simulation loop passes these as `mean` for "norm" variables, as
# `lambda` for "pois" variables and as `prob` for "binom" variables.
mus <- matrix(
  data = c(
    # Price
    60, 0.5, 120000, 0.9, 500, 200, 5, 2,
    # Conspicuous
    40, 0.7, 200000, 0.9, 5000, 5000, 10, 10,
    # Quality
    36, 0.5, 70000, 0.4, 300, 2000, 2, 15,
    # Style
    25, 0.2, 90000, 0.2, 200, 2000, 2, 20
  ),
  ncol = length(vars),
  byrow = TRUE
)
# Standard deviations, laid out like `mus`. They are passed to rnorm() as
# `sd`, so only the "norm" columns carry a value; the others are NA.
sds <- matrix(
  data = c(
    # Price
    3, NA, 8000, NA, 100, 50, NA, NA,
    # Conspicuous
    5, NA, 50000, NA, 1000, 1500, NA, NA,
    # Quality
    7, NA, 10000, NA, 50, 200, NA, NA,
    # Style
    2, NA, 5000, NA, 10, 500, NA, NA
  ),
  ncol = length(vars),
  byrow = TRUE
)

Now we are ready to simulate data using the parameters defined above:

# simulate non-survey data
set.seed(2016)
# Collect one data frame per segment and bind them once at the end, instead
# of growing sim.dat with rbind() on every iteration.
seg_list <- vector("list", length(group_name))
# loop on customer segment (i)
for (i in seq_along(group_name)) {
  # print the segment being processed in order to monitor progress
  cat(i, group_name[i], "\n")

  # all-NA data frame that will hold the variables of segment i
  seg <- data.frame(matrix(NA, nrow = group_size[i], ncol = length(vars)))

  # loop on every variable (j) within segment i; the draws happen in the
  # same order as the variables appear in `vars`
  for (j in seq_along(vars)) {
    seg[, j] <- switch(vartype[j],
      # normal distribution with segment-specific mean and sd
      norm = rnorm(group_size[i], mean = mus[i, j], sd = sds[i, j]),
      # Poisson distribution with segment-specific rate
      pois = rpois(group_size[i], lambda = mus[i, j]),
      # 0/1 outcome (binomial with a single trial)
      binom = rbinom(group_size[i], size = 1, prob = mus[i, j]),
      # any other distribution name is not supported: stop with a message
      stop("Don't have type: ", vartype[j], call. = FALSE)
    )
  }
  seg_list[[i]] <- seg
}
# stack the segments in the order of `group_name`
sim.dat <- do.call(rbind, seg_list)

Now let’s tidy up the data we just simulated by adding labels to the 0/1 binary variables:

# assign variable names
names(sim.dat) <- vars
# segment label for every row: group_name[k] repeated group_size[k] times
sim.dat$segment <- factor(rep(group_name, times = group_size))
# Recode the 0/1 gender and house variables. The levels are given
# explicitly so that the 0/1 -> label mapping does not depend on which
# values happened to be drawn (without `levels`, factor() fails when only
# one of the two values is present).
sim.dat$gender <- factor(
  sim.dat$gender,
  levels = c(0, 1),
  labels = c("Female", "Male")
)
sim.dat$house <- factor(
  sim.dat$house,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
# store_trans and online_trans are at least 1
sim.dat$store_trans <- sim.dat$store_trans + 1
sim.dat$online_trans <- sim.dat$online_trans + 1
# age is an integer: drop the fractional part
sim.dat$age <- floor(sim.dat$age)

In the real world, data almost always contains some noise, such as missing values and wrongly entered values. So we will add some noise to the data:

# Missing values: each customer's income is set to NA with probability
# age / 200. These draws are made before set.seed(123) below, so they
# continue from the random number generator state left by the earlier code.
idxm <- rbinom(ncust, size = 1, prob = sim.dat$age / 200) == 1
sim.dat$income <- replace(sim.dat$income, idxm, NA)
# Wrong entries and outliers: pick 5 customers at random (reproducibly) and
# overwrite one age and four store_exp values with implausible numbers.
set.seed(123)
idx <- sample.int(ncust, 5)
sim.dat$age <- replace(sim.dat$age, idx[1], 300)
sim.dat$store_exp <- replace(sim.dat$store_exp, idx[2], -500)
sim.dat$store_exp <- replace(
  sim.dat$store_exp,
  idx[3:5],
  c(50000, 30000, 30000)
)

So far we have created part of the data. You can check it using summary(sim.dat). Next, we will simulate the survey data.

# number of survey questions
nq <- 10

# mean response for each segment (rows) and each question (columns)
mus2 <- matrix(
  c(
    5, 2, 1, 3, 1, 4, 1, 4, 2, 4, # Price
    1, 4, 5, 4, 4, 4, 4, 1, 4, 2, # Conspicuous
    5, 2, 3, 4, 3, 2, 4, 2, 3, 3, # Quality
    3, 1, 1, 2, 4, 1, 5, 3, 4, 2 # Style
  ),
  ncol = nq, byrow = TRUE
)

# assume the standard deviation is 0.2 for all segments and questions
sd2 <- 0.2
set.seed(1000)
# Collect one data frame per segment and bind them once at the end, instead
# of growing sim.dat2 with rbind() on every iteration.
survey_list <- vector("list", length(group_name))
# loop for customer segment (i)
for (i in seq_along(group_name)) {
  # uncomment the following line to check the progress
  # cat(i, group_name[i], "\n")

  # all-NA data frame that will hold the answers of segment i
  seg <- data.frame(matrix(NA, nrow = group_size[i], ncol = nq))
  # simulate data within segment, one question (j) at a time
  for (j in seq_len(nq)) {
    # simulate normal distribution
    res <- rnorm(group_size[i], mean = mus2[i, j], sd = sd2)
    # set upper and lower limit
    res[res > 5] <- 5
    res[res < 1] <- 1
    # convert continuous values to discrete integers; floor() truncates,
    # so a draw just below an integer is mapped to the integer beneath it
    seg[, j] <- floor(res)
  }
  survey_list[[i]] <- seg
}
# stack the segments in the order of `group_name`
sim.dat2 <- do.call(rbind, survey_list)

# name the questions Q1, Q2, ... according to nq rather than a fixed 1:10
names(sim.dat2) <- paste0("Q", seq_len(nq))
# append the survey answers to the customer data
sim.dat <- cbind(sim.dat, sim.dat2)
sim.dat$segment <- factor(rep(group_name, times = group_size))