Load R Packages and Data

Summarize Data

dplyr Package

Display
Subset
Summarize
Create new variable
Merge

apply(), lapply() and sapply() in base R

Tidy and Reshape Data

This notebook illustrates how to perform standard data pre-processing, an essential step for any data science project.

Load R Packages and Data

# install packages from CRAN
p_needed <- c('dplyr','tidyr')
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
    install.packages(p_to_install)
}

lapply(p_needed, require, character.only = TRUE)

Summarize Data

`dplyr` Package

 # Read data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")

Display

We first use tibble::as_tibble() function to convert data into a tibble object. It offers better checking and printing capabilities than the traditional data frame. For example, the glimpse() function provides a quick view of the data.

tibble::as_tibble(sim.dat)

## # A tibble: 1,000 × 19
##      age gender income house store_exp online_exp store_trans online_trans    Q1
##    <int> <chr>   <dbl> <chr>     <dbl>      <dbl>       <int>        <int> <int>
##  1    57 Female 1.21e5 Yes        529.       304.           2            2     4
##  2    63 Female 1.22e5 Yes        478.       110.           4            2     4
##  3    59 Male   1.14e5 Yes        491.       279.           7            2     5
##  4    60 Male   1.14e5 Yes        348.       142.          10            2     5
##  5    51 Male   1.24e5 Yes        380.       112.           4            4     4
##  6    59 Male   1.08e5 Yes        338.       196.           4            5     4
##  7    57 Male   1.20e5 Yes        483.       285.           5            3     4
##  8    57 Male   1.11e5 Yes        341.       135.          11            5     5
##  9    61 Female 1.32e5 Yes        608.       143.           6            1     4
## 10    60 Male   1.05e5 Yes        470.       163.          12            1     4
## # … with 990 more rows, and 10 more variables: Q2 <int>, Q3 <int>, Q4 <int>,
## #   Q5 <int>, Q6 <int>, Q7 <int>, Q8 <int>, Q9 <int>, Q10 <int>, segment <chr>

glimpse(sim.dat)

## Rows: 1,000
## Columns: 19
## $ age          <int> 57, 63, 59, 60, 51, 59, 57, 57, 61, 60, 58, 59, 64, 57, 6…
## $ gender       <chr> "Female", "Female", "Male", "Male", "Male", "Male", "Male…
## $ income       <dbl> 120963.4, 122008.1, 114202.3, 113616.3, 124252.6, 107661.…
## $ house        <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
## $ store_exp    <dbl> 529.1344, 478.0058, 490.8107, 347.8090, 379.6259, 338.315…
## $ online_exp   <dbl> 303.5125, 109.5297, 279.2496, 141.6698, 112.2372, 195.687…
## $ store_trans  <int> 2, 4, 7, 10, 4, 4, 5, 11, 6, 12, 5, 6, 7, 7, 5, 5, 5, 5, …
## $ online_trans <int> 2, 2, 2, 2, 4, 5, 3, 5, 1, 1, 4, 2, 4, 3, 5, 1, 3, 2, 2, …
## $ Q1           <int> 4, 4, 5, 5, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 5, 5, 5, 4, …
## $ Q2           <int> 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ Q3           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q4           <int> 2, 2, 2, 3, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, …
## $ Q5           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q6           <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q7           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q8           <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q9           <int> 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, …
## $ Q10          <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ segment      <chr> "Price", "Price", "Price", "Price", "Price", "Price", "Pr…

Subset

filter(sim.dat, income >300000) %>% 
  tibble::as_tibble()

## # A tibble: 4 × 19
##     age gender  income house store_exp online_exp store_trans online_trans    Q1
##   <int> <chr>    <dbl> <chr>     <dbl>      <dbl>       <int>        <int> <int>
## 1    40 Male   301398. Yes       4840.      3618.          10           11     1
## 2    33 Male   319704. Yes       5998.      4396.           9           11     1
## 3    41 Male   317476. Yes       3030.      4180.          11           12     1
## 4    37 Female 315697. Yes       6549.      4284.          13           11     1
## # … with 10 more variables: Q2 <int>, Q3 <int>, Q4 <int>, Q5 <int>, Q6 <int>,
## #   Q7 <int>, Q8 <int>, Q9 <int>, Q10 <int>, segment <chr>

The example below shows the difference between using and not using %>%.

# don't use %>%
ave_exp <- filter(
  summarise(
    group_by(
      filter(
        sim.dat,
        !is.na(income)
      ),
    segment
  ),
  ave_online_exp = mean(online_exp),
  n = n()
  ),
  n > 200
)
 
ave_exp

## # A tibble: 1 × 3
##   segment ave_online_exp     n
##   <chr>            <dbl> <int>
## 1 Style            1961.   314

# use %>%
ave_exp <- sim.dat %>%
  filter(!is.na(income)) %>%
  group_by(segment) %>%
  summarise(
    ave_online_exp = mean(online_exp),
    n = n() ) %>%
  filter(n > 200)
 
ave_exp

## # A tibble: 1 × 3
##   segment ave_online_exp     n
##   <chr>            <dbl> <int>
## 1 Style            1961.   314

Let us check a few more functions in dplyr pacakge.

# remove duplicated rows
dplyr::distinct(sim.dat)

# sample a fraction or certain number of rows of data
print(dplyr::sample_frac(sim.dat, 0.01, replace = TRUE))

##    age gender    income house  store_exp online_exp store_trans online_trans Q1
## 1   22   Male  91553.21    No   200.7210  1777.4974           4           27  2
## 2   40   Male        NA    No   255.1990  2076.8281           4           18  4
## 3   34   Male 156882.12   Yes  4792.6922  6627.5537           9           10  1
## 4   25   Male  88151.69   Yes   201.8566  2675.7310           3           23  2
## 5   58   Male 111364.47   Yes   402.1077   179.1663           7            3  5
## 6   25 Female 100952.45    No   198.7989   746.7248           3           23  3
## 7   39   Male 259350.47   Yes 50000.0000  3172.2617          13           11  1
## 8   21 Female  95847.53   Yes   210.2998  1907.5692           3           31  3
## 9   31 Female  71550.06    No   310.9308  2166.5671           4           14  4
## 10  24 Female  91568.52    No   213.3846  2196.6703           5           23  2
##    Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10     segment
## 1   1  1  2  4  1  5  2  4   1       Style
## 2   1  2  4  2  2  4  2  3   2     Quality
## 3   4  4  4  4  4  4  1  4   2 Conspicuous
## 4   1  1  1  4  1  5  2  4   2       Style
## 5   2  1  3  1  4  1  4  1   4       Price
## 6   1  1  1  4  1  4  2  4   1       Style
## 7   4  4  4  4  4  4  1  4   1 Conspicuous
## 8   1  1  2  4  1  5  2  4   2       Style
## 9   1  2  4  2  1  4  2  3   2     Quality
## 10  1  1  1  4  1  4  3  4   1       Style

print(dplyr::sample_n(sim.dat, 10, replace = TRUE))

##    age gender    income house store_exp online_exp store_trans online_trans Q1
## 1   26 Female        NA    No  183.1205  2391.2305           3           27  3
## 2   31 Female  80539.01    No  225.8908  1975.7586           1           15  4
## 3   40   Male 258034.45   Yes 5459.8279  4437.2839          10            9  1
## 4   30   Male  65412.93   Yes  257.2869  1735.1010           4           13  5
## 5   22 Female  87477.37    No  195.3082  1636.8115           4           20  3
## 6   25 Female  85401.49    No  204.0966  1614.1424           4           20  2
## 7   40   Male 258034.45   Yes 5459.8279  4437.2839          10            9  1
## 8   62 Female 110510.45   Yes  473.6524   253.3283           7            1  5
## 9   64   Male 113301.48   Yes  459.9504   252.7724           2            3  5
## 10  38   Male        NA   Yes 5397.9469  2587.6464          10            9  1
##    Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10     segment
## 1   1  1  2  4  1  5  2  4   1       Style
## 2   2  3  4  2  2  4  1  2   3     Quality
## 3   5  5  4  4  4  4  1  4   2 Conspicuous
## 4   1  2  4  2  2  4  2  3   3     Quality
## 5   1  1  2  4  1  5  3  4   1       Style
## 6   1  1  2  4  1  5  2  4   2       Style
## 7   5  5  4  4  4  4  1  4   2 Conspicuous
## 8   1  1  2  1  4  1  4  2   4       Price
## 9   1  1  3  1  4  1  4  2   4       Price
## 10  4  4  4  4  4  4  1  4   1 Conspicuous

# subset rows by position
dplyr::slice(sim.dat, 10:15)

##   age gender   income house store_exp online_exp store_trans online_trans Q1 Q2
## 1  60   Male 105048.8   Yes  470.3190   163.4663          12            1  4  2
## 2  58   Male 107196.7   Yes  366.6393   170.1333           5            4  4  1
## 3  59   Male       NA   Yes  674.8829   310.2657           6            2  4  1
## 4  64   Male 119020.0   Yes  613.9490   160.8463           7            4  5  1
## 5  57 Female       NA   Yes  737.0497   224.5322           7            3  4  2
## 6  64   Male 114539.4   Yes  402.4901   241.8339           5            5  4  2
##   Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1  1  3  1  4  1  4  1   4   Price
## 2  1  3  1  4  1  4  2   4   Price
## 3  1  3  1  4  1  4  1   4   Price
## 4  1  3  1  4  1  4  2   4   Price
## 5  1  3  1  4  1  4  1   4   Price
## 6  1  2  1  4  1  4  1   4   Price

# select the order top n entries
dplyr::top_n(sim.dat,2,income)

##   age gender   income house store_exp online_exp store_trans online_trans Q1 Q2
## 1  33   Male 319704.3   Yes  5998.305   4395.923           9           11  1  4
## 2  41   Male 317476.2   Yes  3029.844   4179.671          11           12  1  4
##   Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10     segment
## 1  4  4  4  4  4  1  4   2 Conspicuous
## 2  5  4  4  4  4  1  4   2 Conspicuous

# select by column name
print(head(dplyr::select(sim.dat,income,age,store_exp)))

##     income age store_exp
## 1 120963.4  57  529.1344
## 2 122008.1  63  478.0058
## 3 114202.3  59  490.8107
## 4 113616.3  60  347.8090
## 5 124252.6  51  379.6259
## 6 107661.5  59  338.3154

# select columns whose name contains a character string
print(head(dplyr::select(sim.dat, contains("_"))))

##   store_exp online_exp store_trans online_trans
## 1  529.1344   303.5125           2            2
## 2  478.0058   109.5297           4            2
## 3  490.8107   279.2496           7            2
## 4  347.8090   141.6698          10            2
## 5  379.6259   112.2372           4            4
## 6  338.3154   195.6870           4            5

# select columns whose name ends with a character string
# similar there is "starts_with"
print(head(dplyr::select(sim.dat, ends_with("e"))))

##   age   income house
## 1  57 120963.4   Yes
## 2  63 122008.1   Yes
## 3  59 114202.3   Yes
## 4  60 113616.3   Yes
## 5  51 124252.6   Yes
## 6  59 107661.5   Yes

# select columns Q1,Q2,Q3,Q4 and Q5
print(head(select(sim.dat, num_range("Q", 1:5))))

##   Q1 Q2 Q3 Q4 Q5
## 1  4  2  1  2  1
## 2  4  1  1  2  1
## 3  5  2  1  2  1
## 4  5  2  1  3  1
## 5  4  1  1  3  1
## 6  4  2  1  2  1

# select columns whose names are in a group of names
print(head(dplyr::select(sim.dat, one_of(c("age", "income")))))

##   age   income
## 1  57 120963.4
## 2  63 122008.1
## 3  59 114202.3
## 4  60 113616.3
## 5  51 124252.6
## 6  59 107661.5

# select columns between age and online_exp
print(head(dplyr::select(sim.dat, age:online_exp)))

##   age gender   income house store_exp online_exp
## 1  57 Female 120963.4   Yes  529.1344   303.5125
## 2  63 Female 122008.1   Yes  478.0058   109.5297
## 3  59   Male 114202.3   Yes  490.8107   279.2496
## 4  60   Male 113616.3   Yes  347.8090   141.6698
## 5  51   Male 124252.6   Yes  379.6259   112.2372
## 6  59   Male 107661.5   Yes  338.3154   195.6870

# select all columns except for age
print(head(dplyr::select(sim.dat, -age)))

##   gender   income house store_exp online_exp store_trans online_trans Q1 Q2 Q3
## 1 Female 120963.4   Yes  529.1344   303.5125           2            2  4  2  1
## 2 Female 122008.1   Yes  478.0058   109.5297           4            2  4  1  1
## 3   Male 114202.3   Yes  490.8107   279.2496           7            2  5  2  1
## 4   Male 113616.3   Yes  347.8090   141.6698          10            2  5  2  1
## 5   Male 124252.6   Yes  379.6259   112.2372           4            4  4  1  1
## 6   Male 107661.5   Yes  338.3154   195.6870           4            5  4  2  1
##   Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1  2  1  4  1  4  2   4   Price
## 2  2  1  4  1  4  1   4   Price
## 3  2  1  4  1  4  1   4   Price
## 4  3  1  4  1  4  2   4   Price
## 5  3  1  4  1  4  2   4   Price
## 6  2  1  4  1  4  1   4   Price

Summarize

dat_summary <- sim.dat %>% 
  dplyr::group_by(segment) %>% 
  dplyr::summarise(Age = round(mean(na.omit(age)), 0), 
            FemalePct = round(mean(gender == "Female"), 2), 
            HouseYes = round(mean(house == "Yes"), 2), 
            store_exp = round(mean(na.omit(store_exp), 
                                   trim = 0.1), 0),
            online_exp = round(mean(online_exp), 0), 
            store_trans = round(mean(store_trans), 1), 
            online_trans = round(mean(online_trans), 1))

# transpose the data frame for showing purpose
# due to the limit of output width
cnames <- dat_summary$segment
dat_summary <- dplyr::select(dat_summary, - segment)
tdat_summary <- t(dat_summary) %>% data.frame()
names(tdat_summary) <- cnames
tdat_summary

##              Conspicuous  Price Quality   Style
## Age                42.00  60.00   35.00   24.00
## FemalePct           0.32   0.45    0.47    0.81
## HouseYes            0.86   0.94    0.34    0.27
## store_exp        4990.00 501.00  301.00  200.00
## online_exp       4898.00 205.00 2013.00 1962.00
## store_trans        10.90   6.10    2.90    3.00
## online_trans       11.10   3.00   16.00   21.10

# Another example using group_by() and summarise()
sim.dat %>%
  group_by(segment) %>%
  summarise(avg_online = round(sum(online_exp)/sum(online_trans), 2),
            avg_store = round(sum(store_exp)/sum(store_trans), 2))

## # A tibble: 4 × 3
##   segment     avg_online avg_store
##   <chr>            <dbl>     <dbl>
## 1 Conspicuous      442.      479. 
## 2 Price             69.3      81.3
## 3 Quality          126.      105. 
## 4 Style             92.8     121.

# apply function anyNA() to each column
# you can also assign a function vector such as: c("anyNA","is.factor")
dplyr::summarise_all(sim.dat, funs_(c("anyNA")))

## Warning: `funs_()` was deprecated in dplyr 0.7.0.
## Please use `funs()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

##     age gender income house store_exp online_exp store_trans online_trans    Q1
## 1 FALSE  FALSE   TRUE FALSE     FALSE      FALSE       FALSE        FALSE FALSE
##      Q2    Q3    Q4    Q5    Q6    Q7    Q8    Q9   Q10 segment
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE   FALSE

Create new variable

# create a new variable
head(dplyr::mutate(sim.dat, total_exp = store_exp + online_exp))

##   age gender   income house store_exp online_exp store_trans online_trans Q1 Q2
## 1  57 Female 120963.4   Yes  529.1344   303.5125           2            2  4  2
## 2  63 Female 122008.1   Yes  478.0058   109.5297           4            2  4  1
## 3  59   Male 114202.3   Yes  490.8107   279.2496           7            2  5  2
## 4  60   Male 113616.3   Yes  347.8090   141.6698          10            2  5  2
## 5  51   Male 124252.6   Yes  379.6259   112.2372           4            4  4  1
## 6  59   Male 107661.5   Yes  338.3154   195.6870           4            5  4  2
##   Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment total_exp
## 1  1  2  1  4  1  4  2   4   Price  832.6468
## 2  1  2  1  4  1  4  1   4   Price  587.5355
## 3  1  2  1  4  1  4  1   4   Price  770.0603
## 4  1  3  1  4  1  4  2   4   Price  489.4788
## 5  1  3  1  4  1  4  2   4   Price  491.8631
## 6  1  2  1  4  1  4  1   4   Price  534.0024

# create a new variable and only keep the new variable
head(dplyr::transmute(sim.dat, total_exp = store_exp + online_exp))

##   total_exp
## 1  832.6468
## 2  587.5355
## 3  770.0603
## 4  489.4788
## 5  491.8631
## 6  534.0024

Merge

(x <- data.frame(cbind(ID = c("A", "B", "C"), x1 = c(1, 2, 3))))

##   ID x1
## 1  A  1
## 2  B  2
## 3  C  3

(y <- data.frame(cbind(ID = c("B", "C", "D"), y1 = c(T, T, F))))

##   ID    y1
## 1  B  TRUE
## 2  C  TRUE
## 3  D FALSE

# join to the left
# keep all rows in x
left_join(x, y, by = "ID")

##   ID x1   y1
## 1  A  1 <NA>
## 2  B  2 TRUE
## 3  C  3 TRUE

# get rows matched in both data sets
inner_join(x, y, by = "ID")

##   ID x1   y1
## 1  B  2 TRUE
## 2  C  3 TRUE

# get rows in either data set
full_join(x, y, by = "ID")

##   ID   x1    y1
## 1  A    1  <NA>
## 2  B    2  TRUE
## 3  C    3  TRUE
## 4  D <NA> FALSE

# filter out rows in x that can be matched in y 
# it doesn't bring in any values from y 
semi_join(x, y, by = "ID")

# the opposite of  semi_join()
# it gets rows in x that cannot be matched in y
# it doesn't bring in any values from y
anti_join(x, y, by = "ID")

`apply()`, `lapply()` and `sapply()` in base R

Compute row and column sums for a matrix:

## simulate a matrix
x <- cbind(x1 =1:8, x2 = c(4:1, 2:5))
dimnames(x)[[1]] <- letters[1:8]
apply(x, 2, mean)

##  x1  x2 
## 4.5 3.0

col.sums <- apply(x, 2, sum)
row.sums <- apply(x, 1, sum)

You can also apply other functions:

ma <- matrix(c(1:4, 1, 6:8), nrow = 2)
ma

##      [,1] [,2] [,3] [,4]
## [1,]    1    3    1    7
## [2,]    2    4    6    8

apply(ma, 1, table)  #--> a list of length 2

## [[1]]
## 
## 1 3 7 
## 2 1 1 
## 
## [[2]]
## 
## 2 4 6 8 
## 1 1 1 1

apply(ma, 1, stats::quantile) # 5 x n matrix with rownames

##      [,1] [,2]
## 0%      1  2.0
## 25%     1  3.5
## 50%     2  5.0
## 75%     4  6.5
## 100%    7  8.0

Let’s use some data with context to help you better understand the functions.

Get the mean and standard deviation of all numerical variables in the dataset.

# Get numerical variables
sdat <- sim.dat[, lapply(sim.dat, class) %in% c("integer", "numeric")]
## Try the following code with apply() function apply(sim.dat,2,class)
## What is the problem?

The data frame sdat only includes numeric columns. Now we can go head and use apply() to get mean and standard deviation for each column:

apply(sdat, MARGIN = 2, function(x) mean(na.omit(x)))

##          age       income    store_exp   online_exp  store_trans online_trans 
##       38.840   113543.065     1356.851     2120.181        5.350       13.546 
##           Q1           Q2           Q3           Q4           Q5           Q6 
##        3.101        1.823        1.992        2.763        2.945        2.448 
##           Q7           Q8           Q9          Q10 
##        3.434        2.396        3.085        2.320

Run the following code and compare the results:

lapply(sdat, function(x) sd(na.omit(x)))
sapply(sdat, function(x) sd(na.omit(x)))
sapply(sdat, function(x) sd(na.omit(x)), simplify = FALSE)

Tidy and Reshape Data

Take a baby subset of our exemplary clothes consumers data to illustrate:

sdat<-sim.dat[1:5,1:6]
sdat

##   age gender   income house store_exp online_exp
## 1  57 Female 120963.4   Yes  529.1344   303.5125
## 2  63 Female 122008.1   Yes  478.0058   109.5297
## 3  59   Male 114202.3   Yes  490.8107   279.2496
## 4  60   Male 113616.3   Yes  347.8090   141.6698
## 5  51   Male 124252.6   Yes  379.6259   112.2372

Change data from “wide” to “long”.

dat_long <- tidyr::gather(sdat, "Channel","Expense", 
                       store_exp, online_exp)
dat_long

##    age gender   income house    Channel  Expense
## 1   57 Female 120963.4   Yes  store_exp 529.1344
## 2   63 Female 122008.1   Yes  store_exp 478.0058
## 3   59   Male 114202.3   Yes  store_exp 490.8107
## 4   60   Male 113616.3   Yes  store_exp 347.8090
## 5   51   Male 124252.6   Yes  store_exp 379.6259
## 6   57 Female 120963.4   Yes online_exp 303.5125
## 7   63 Female 122008.1   Yes online_exp 109.5297
## 8   59   Male 114202.3   Yes online_exp 279.2496
## 9   60   Male 113616.3   Yes online_exp 141.6698
## 10  51   Male 124252.6   Yes online_exp 112.2372

You can run a regression to study the effect of purchasing channel as follows:

# Here we use all observations from sim.dat
# Don't show result here

msdat <- tidyr::gather(sim.dat[, 1:6], "Channel","Expense", 
                       store_exp, online_exp)
fit <- lm(Expense ~ gender + house + income + Channel + age, 
          data = msdat)
summary(fit)

## 
## Call:
## lm(formula = Expense ~ gender + house + income + Channel + age, 
##     data = msdat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4208   -821   -275    533  44353 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -8.363e+01  1.560e+02  -0.536 0.591884    
## genderMale        3.572e+02  1.028e+02   3.475 0.000524 ***
## houseYes         -5.687e+01  1.138e+02  -0.500 0.617275    
## income            2.834e-02  1.079e-03  26.268  < 2e-16 ***
## Channelstore_exp -8.296e+02  9.772e+01  -8.489  < 2e-16 ***
## age              -2.793e+01  3.356e+00  -8.321  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1974 on 1626 degrees of freedom
##   (368 observations deleted due to missingness)
## Multiple R-squared:  0.348,  Adjusted R-squared:  0.346 
## F-statistic: 173.5 on 5 and 1626 DF,  p-value: < 2.2e-16

Change data from “long” to “wide”.

dat_wide = tidyr::spread(dat_long, Channel, Expense)
# you can check what dat_long is like
dat_wide %>% 
  dplyr::group_by(house, gender) %>% 
  dplyr::summarise(total_online_exp = sum(online_exp),
                   total_store_exp = sum(store_exp))

## # A tibble: 2 × 4
## # Groups:   house [1]
##   house gender total_online_exp total_store_exp
##   <chr> <chr>             <dbl>           <dbl>
## 1 Yes   Female             413.           1007.
## 2 Yes   Male               533.           1218.

Another pair of functions that do opposite manipulations are separate() and unite().

sepdat<- dat_long %>% 
  separate(Channel, c("Source", "Type"))
sepdat

##    age gender   income house Source Type  Expense
## 1   57 Female 120963.4   Yes  store  exp 529.1344
## 2   63 Female 122008.1   Yes  store  exp 478.0058
## 3   59   Male 114202.3   Yes  store  exp 490.8107
## 4   60   Male 113616.3   Yes  store  exp 347.8090
## 5   51   Male 124252.6   Yes  store  exp 379.6259
## 6   57 Female 120963.4   Yes online  exp 303.5125
## 7   63 Female 122008.1   Yes online  exp 109.5297
## 8   59   Male 114202.3   Yes online  exp 279.2496
## 9   60   Male 113616.3   Yes online  exp 141.6698
## 10  51   Male 124252.6   Yes online  exp 112.2372

The unite() function will do the opposite: combining two columns. It is the generalization of paste() to a data frame.

sepdat %>% 
  unite("Channel", Source, Type, sep = "_")

##    age gender   income house    Channel  Expense
## 1   57 Female 120963.4   Yes  store_exp 529.1344
## 2   63 Female 122008.1   Yes  store_exp 478.0058
## 3   59   Male 114202.3   Yes  store_exp 490.8107
## 4   60   Male 113616.3   Yes  store_exp 347.8090
## 5   51   Male 124252.6   Yes  store_exp 379.6259
## 6   57 Female 120963.4   Yes online_exp 303.5125
## 7   63 Female 122008.1   Yes online_exp 109.5297
## 8   59   Male 114202.3   Yes online_exp 279.2496
## 9   60   Male 113616.3   Yes online_exp 141.6698
## 10  51   Male 124252.6   Yes online_exp 112.2372

Chapter 6 Data Wrangling

2022-04-23

Load R Packages and Data

Summarize Data

`dplyr` Package

Display

Subset

Summarize

Create new variable

Merge

`apply()`, `lapply()` and `sapply()` in base R

Tidy and Reshape Data

Chapter 6 Data Wrangling

2022-04-23

Load R Packages and Data

Summarize Data

dplyr Package

Display

Subset

Summarize

Create new variable

Merge

apply(), lapply() and sapply() in base R

Tidy and Reshape Data

`dplyr` Package

`apply()`, `lapply()` and `sapply()` in base R