dplyr
Packageapply()
, lapply()
and
sapply()
in base RThis notebook illustrates how to perform standard data pre-processing, an essential step for any data science project.
# install packages from CRAN
p_needed <- c('dplyr','tidyr')
packages <- rownames(installed.packages())
p_to_install <- p_needed[!(p_needed %in% packages)]
if (length(p_to_install) > 0) {
install.packages(p_to_install)
}
lapply(p_needed, require, character.only = TRUE)
dplyr
Package # Read data
sim.dat <- read.csv("http://bit.ly/2P5gTw4")
We first use tibble::as_tibble()
function to convert
data into a tibble
object. It offers better checking and
printing capabilities than the traditional data frame. For example, the
glimpse()
function provides a quick view of the data.
tibble::as_tibble(sim.dat)
## # A tibble: 1,000 × 19
## age gender income house store_exp online_exp store_trans online_trans Q1
## <int> <chr> <dbl> <chr> <dbl> <dbl> <int> <int> <int>
## 1 57 Female 1.21e5 Yes 529. 304. 2 2 4
## 2 63 Female 1.22e5 Yes 478. 110. 4 2 4
## 3 59 Male 1.14e5 Yes 491. 279. 7 2 5
## 4 60 Male 1.14e5 Yes 348. 142. 10 2 5
## 5 51 Male 1.24e5 Yes 380. 112. 4 4 4
## 6 59 Male 1.08e5 Yes 338. 196. 4 5 4
## 7 57 Male 1.20e5 Yes 483. 285. 5 3 4
## 8 57 Male 1.11e5 Yes 341. 135. 11 5 5
## 9 61 Female 1.32e5 Yes 608. 143. 6 1 4
## 10 60 Male 1.05e5 Yes 470. 163. 12 1 4
## # … with 990 more rows, and 10 more variables: Q2 <int>, Q3 <int>, Q4 <int>,
## # Q5 <int>, Q6 <int>, Q7 <int>, Q8 <int>, Q9 <int>, Q10 <int>, segment <chr>
glimpse(sim.dat)
## Rows: 1,000
## Columns: 19
## $ age <int> 57, 63, 59, 60, 51, 59, 57, 57, 61, 60, 58, 59, 64, 57, 6…
## $ gender <chr> "Female", "Female", "Male", "Male", "Male", "Male", "Male…
## $ income <dbl> 120963.4, 122008.1, 114202.3, 113616.3, 124252.6, 107661.…
## $ house <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "…
## $ store_exp <dbl> 529.1344, 478.0058, 490.8107, 347.8090, 379.6259, 338.315…
## $ online_exp <dbl> 303.5125, 109.5297, 279.2496, 141.6698, 112.2372, 195.687…
## $ store_trans <int> 2, 4, 7, 10, 4, 4, 5, 11, 6, 12, 5, 6, 7, 7, 5, 5, 5, 5, …
## $ online_trans <int> 2, 2, 2, 2, 4, 5, 3, 5, 1, 1, 4, 2, 4, 3, 5, 1, 3, 2, 2, …
## $ Q1 <int> 4, 4, 5, 5, 4, 4, 4, 5, 4, 4, 4, 4, 5, 4, 4, 5, 5, 5, 4, …
## $ Q2 <int> 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ Q3 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q4 <int> 2, 2, 2, 3, 3, 2, 2, 3, 2, 3, 3, 3, 3, 3, 2, 2, 2, 3, 3, …
## $ Q5 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q6 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q7 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Q8 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ Q9 <int> 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2, …
## $ Q10 <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, …
## $ segment <chr> "Price", "Price", "Price", "Price", "Price", "Price", "Pr…
filter(sim.dat, income >300000) %>%
tibble::as_tibble()
## # A tibble: 4 × 19
## age gender income house store_exp online_exp store_trans online_trans Q1
## <int> <chr> <dbl> <chr> <dbl> <dbl> <int> <int> <int>
## 1 40 Male 301398. Yes 4840. 3618. 10 11 1
## 2 33 Male 319704. Yes 5998. 4396. 9 11 1
## 3 41 Male 317476. Yes 3030. 4180. 11 12 1
## 4 37 Female 315697. Yes 6549. 4284. 13 11 1
## # … with 10 more variables: Q2 <int>, Q3 <int>, Q4 <int>, Q5 <int>, Q6 <int>,
## # Q7 <int>, Q8 <int>, Q9 <int>, Q10 <int>, segment <chr>
The example below shows the difference between using and not using
%>%
.
# don't use %>%
ave_exp <- filter(
summarise(
group_by(
filter(
sim.dat,
!is.na(income)
),
segment
),
ave_online_exp = mean(online_exp),
n = n()
),
n > 200
)
ave_exp
## # A tibble: 1 × 3
## segment ave_online_exp n
## <chr> <dbl> <int>
## 1 Style 1961. 314
# use %>%
ave_exp <- sim.dat %>%
filter(!is.na(income)) %>%
group_by(segment) %>%
summarise(
ave_online_exp = mean(online_exp),
n = n() ) %>%
filter(n > 200)
ave_exp
## # A tibble: 1 × 3
## segment ave_online_exp n
## <chr> <dbl> <int>
## 1 Style 1961. 314
Let us check a few more functions in dplyr
pacakge.
# remove duplicated rows
dplyr::distinct(sim.dat)
# sample a fraction or certain number of rows of data
print(dplyr::sample_frac(sim.dat, 0.01, replace = TRUE))
## age gender income house store_exp online_exp store_trans online_trans Q1
## 1 22 Male 91553.21 No 200.7210 1777.4974 4 27 2
## 2 40 Male NA No 255.1990 2076.8281 4 18 4
## 3 34 Male 156882.12 Yes 4792.6922 6627.5537 9 10 1
## 4 25 Male 88151.69 Yes 201.8566 2675.7310 3 23 2
## 5 58 Male 111364.47 Yes 402.1077 179.1663 7 3 5
## 6 25 Female 100952.45 No 198.7989 746.7248 3 23 3
## 7 39 Male 259350.47 Yes 50000.0000 3172.2617 13 11 1
## 8 21 Female 95847.53 Yes 210.2998 1907.5692 3 31 3
## 9 31 Female 71550.06 No 310.9308 2166.5671 4 14 4
## 10 24 Female 91568.52 No 213.3846 2196.6703 5 23 2
## Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 1 1 2 4 1 5 2 4 1 Style
## 2 1 2 4 2 2 4 2 3 2 Quality
## 3 4 4 4 4 4 4 1 4 2 Conspicuous
## 4 1 1 1 4 1 5 2 4 2 Style
## 5 2 1 3 1 4 1 4 1 4 Price
## 6 1 1 1 4 1 4 2 4 1 Style
## 7 4 4 4 4 4 4 1 4 1 Conspicuous
## 8 1 1 2 4 1 5 2 4 2 Style
## 9 1 2 4 2 1 4 2 3 2 Quality
## 10 1 1 1 4 1 4 3 4 1 Style
print(dplyr::sample_n(sim.dat, 10, replace = TRUE))
## age gender income house store_exp online_exp store_trans online_trans Q1
## 1 26 Female NA No 183.1205 2391.2305 3 27 3
## 2 31 Female 80539.01 No 225.8908 1975.7586 1 15 4
## 3 40 Male 258034.45 Yes 5459.8279 4437.2839 10 9 1
## 4 30 Male 65412.93 Yes 257.2869 1735.1010 4 13 5
## 5 22 Female 87477.37 No 195.3082 1636.8115 4 20 3
## 6 25 Female 85401.49 No 204.0966 1614.1424 4 20 2
## 7 40 Male 258034.45 Yes 5459.8279 4437.2839 10 9 1
## 8 62 Female 110510.45 Yes 473.6524 253.3283 7 1 5
## 9 64 Male 113301.48 Yes 459.9504 252.7724 2 3 5
## 10 38 Male NA Yes 5397.9469 2587.6464 10 9 1
## Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 1 1 2 4 1 5 2 4 1 Style
## 2 2 3 4 2 2 4 1 2 3 Quality
## 3 5 5 4 4 4 4 1 4 2 Conspicuous
## 4 1 2 4 2 2 4 2 3 3 Quality
## 5 1 1 2 4 1 5 3 4 1 Style
## 6 1 1 2 4 1 5 2 4 2 Style
## 7 5 5 4 4 4 4 1 4 2 Conspicuous
## 8 1 1 2 1 4 1 4 2 4 Price
## 9 1 1 3 1 4 1 4 2 4 Price
## 10 4 4 4 4 4 4 1 4 1 Conspicuous
# subset rows by position
dplyr::slice(sim.dat, 10:15)
## age gender income house store_exp online_exp store_trans online_trans Q1 Q2
## 1 60 Male 105048.8 Yes 470.3190 163.4663 12 1 4 2
## 2 58 Male 107196.7 Yes 366.6393 170.1333 5 4 4 1
## 3 59 Male NA Yes 674.8829 310.2657 6 2 4 1
## 4 64 Male 119020.0 Yes 613.9490 160.8463 7 4 5 1
## 5 57 Female NA Yes 737.0497 224.5322 7 3 4 2
## 6 64 Male 114539.4 Yes 402.4901 241.8339 5 5 4 2
## Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 1 3 1 4 1 4 1 4 Price
## 2 1 3 1 4 1 4 2 4 Price
## 3 1 3 1 4 1 4 1 4 Price
## 4 1 3 1 4 1 4 2 4 Price
## 5 1 3 1 4 1 4 1 4 Price
## 6 1 2 1 4 1 4 1 4 Price
# select the order top n entries
dplyr::top_n(sim.dat,2,income)
## age gender income house store_exp online_exp store_trans online_trans Q1 Q2
## 1 33 Male 319704.3 Yes 5998.305 4395.923 9 11 1 4
## 2 41 Male 317476.2 Yes 3029.844 4179.671 11 12 1 4
## Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 4 4 4 4 4 1 4 2 Conspicuous
## 2 5 4 4 4 4 1 4 2 Conspicuous
# select by column name
print(head(dplyr::select(sim.dat,income,age,store_exp)))
## income age store_exp
## 1 120963.4 57 529.1344
## 2 122008.1 63 478.0058
## 3 114202.3 59 490.8107
## 4 113616.3 60 347.8090
## 5 124252.6 51 379.6259
## 6 107661.5 59 338.3154
# select columns whose name contains a character string
print(head(dplyr::select(sim.dat, contains("_"))))
## store_exp online_exp store_trans online_trans
## 1 529.1344 303.5125 2 2
## 2 478.0058 109.5297 4 2
## 3 490.8107 279.2496 7 2
## 4 347.8090 141.6698 10 2
## 5 379.6259 112.2372 4 4
## 6 338.3154 195.6870 4 5
# select columns whose name ends with a character string
# similar there is "starts_with"
print(head(dplyr::select(sim.dat, ends_with("e"))))
## age income house
## 1 57 120963.4 Yes
## 2 63 122008.1 Yes
## 3 59 114202.3 Yes
## 4 60 113616.3 Yes
## 5 51 124252.6 Yes
## 6 59 107661.5 Yes
# select columns Q1,Q2,Q3,Q4 and Q5
print(head(select(sim.dat, num_range("Q", 1:5))))
## Q1 Q2 Q3 Q4 Q5
## 1 4 2 1 2 1
## 2 4 1 1 2 1
## 3 5 2 1 2 1
## 4 5 2 1 3 1
## 5 4 1 1 3 1
## 6 4 2 1 2 1
# select columns whose names are in a group of names
print(head(dplyr::select(sim.dat, one_of(c("age", "income")))))
## age income
## 1 57 120963.4
## 2 63 122008.1
## 3 59 114202.3
## 4 60 113616.3
## 5 51 124252.6
## 6 59 107661.5
# select columns between age and online_exp
print(head(dplyr::select(sim.dat, age:online_exp)))
## age gender income house store_exp online_exp
## 1 57 Female 120963.4 Yes 529.1344 303.5125
## 2 63 Female 122008.1 Yes 478.0058 109.5297
## 3 59 Male 114202.3 Yes 490.8107 279.2496
## 4 60 Male 113616.3 Yes 347.8090 141.6698
## 5 51 Male 124252.6 Yes 379.6259 112.2372
## 6 59 Male 107661.5 Yes 338.3154 195.6870
# select all columns except for age
print(head(dplyr::select(sim.dat, -age)))
## gender income house store_exp online_exp store_trans online_trans Q1 Q2 Q3
## 1 Female 120963.4 Yes 529.1344 303.5125 2 2 4 2 1
## 2 Female 122008.1 Yes 478.0058 109.5297 4 2 4 1 1
## 3 Male 114202.3 Yes 490.8107 279.2496 7 2 5 2 1
## 4 Male 113616.3 Yes 347.8090 141.6698 10 2 5 2 1
## 5 Male 124252.6 Yes 379.6259 112.2372 4 4 4 1 1
## 6 Male 107661.5 Yes 338.3154 195.6870 4 5 4 2 1
## Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 2 1 4 1 4 2 4 Price
## 2 2 1 4 1 4 1 4 Price
## 3 2 1 4 1 4 1 4 Price
## 4 3 1 4 1 4 2 4 Price
## 5 3 1 4 1 4 2 4 Price
## 6 2 1 4 1 4 1 4 Price
dat_summary <- sim.dat %>%
dplyr::group_by(segment) %>%
dplyr::summarise(Age = round(mean(na.omit(age)), 0),
FemalePct = round(mean(gender == "Female"), 2),
HouseYes = round(mean(house == "Yes"), 2),
store_exp = round(mean(na.omit(store_exp),
trim = 0.1), 0),
online_exp = round(mean(online_exp), 0),
store_trans = round(mean(store_trans), 1),
online_trans = round(mean(online_trans), 1))
# transpose the data frame for showing purpose
# due to the limit of output width
cnames <- dat_summary$segment
dat_summary <- dplyr::select(dat_summary, - segment)
tdat_summary <- t(dat_summary) %>% data.frame()
names(tdat_summary) <- cnames
tdat_summary
## Conspicuous Price Quality Style
## Age 42.00 60.00 35.00 24.00
## FemalePct 0.32 0.45 0.47 0.81
## HouseYes 0.86 0.94 0.34 0.27
## store_exp 4990.00 501.00 301.00 200.00
## online_exp 4898.00 205.00 2013.00 1962.00
## store_trans 10.90 6.10 2.90 3.00
## online_trans 11.10 3.00 16.00 21.10
# Another example using group_by() and summarise()
sim.dat %>%
group_by(segment) %>%
summarise(avg_online = round(sum(online_exp)/sum(online_trans), 2),
avg_store = round(sum(store_exp)/sum(store_trans), 2))
## # A tibble: 4 × 3
## segment avg_online avg_store
## <chr> <dbl> <dbl>
## 1 Conspicuous 442. 479.
## 2 Price 69.3 81.3
## 3 Quality 126. 105.
## 4 Style 92.8 121.
# apply function anyNA() to each column
# you can also assign a function vector such as: c("anyNA","is.factor")
dplyr::summarise_all(sim.dat, funs_(c("anyNA")))
## Warning: `funs_()` was deprecated in dplyr 0.7.0.
## Please use `funs()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## age gender income house store_exp online_exp store_trans online_trans Q1
## 1 FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# create a new variable
head(dplyr::mutate(sim.dat, total_exp = store_exp + online_exp))
## age gender income house store_exp online_exp store_trans online_trans Q1 Q2
## 1 57 Female 120963.4 Yes 529.1344 303.5125 2 2 4 2
## 2 63 Female 122008.1 Yes 478.0058 109.5297 4 2 4 1
## 3 59 Male 114202.3 Yes 490.8107 279.2496 7 2 5 2
## 4 60 Male 113616.3 Yes 347.8090 141.6698 10 2 5 2
## 5 51 Male 124252.6 Yes 379.6259 112.2372 4 4 4 1
## 6 59 Male 107661.5 Yes 338.3154 195.6870 4 5 4 2
## Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 segment total_exp
## 1 1 2 1 4 1 4 2 4 Price 832.6468
## 2 1 2 1 4 1 4 1 4 Price 587.5355
## 3 1 2 1 4 1 4 1 4 Price 770.0603
## 4 1 3 1 4 1 4 2 4 Price 489.4788
## 5 1 3 1 4 1 4 2 4 Price 491.8631
## 6 1 2 1 4 1 4 1 4 Price 534.0024
# create a new variable and only keep the new variable
head(dplyr::transmute(sim.dat, total_exp = store_exp + online_exp))
## total_exp
## 1 832.6468
## 2 587.5355
## 3 770.0603
## 4 489.4788
## 5 491.8631
## 6 534.0024
(x <- data.frame(cbind(ID = c("A", "B", "C"), x1 = c(1, 2, 3))))
## ID x1
## 1 A 1
## 2 B 2
## 3 C 3
(y <- data.frame(cbind(ID = c("B", "C", "D"), y1 = c(T, T, F))))
## ID y1
## 1 B TRUE
## 2 C TRUE
## 3 D FALSE
# join to the left
# keep all rows in x
left_join(x, y, by = "ID")
## ID x1 y1
## 1 A 1 <NA>
## 2 B 2 TRUE
## 3 C 3 TRUE
# get rows matched in both data sets
inner_join(x, y, by = "ID")
## ID x1 y1
## 1 B 2 TRUE
## 2 C 3 TRUE
# get rows in either data set
full_join(x, y, by = "ID")
## ID x1 y1
## 1 A 1 <NA>
## 2 B 2 TRUE
## 3 C 3 TRUE
## 4 D <NA> FALSE
# filter out rows in x that can be matched in y
# it doesn't bring in any values from y
semi_join(x, y, by = "ID")
# the opposite of semi_join()
# it gets rows in x that cannot be matched in y
# it doesn't bring in any values from y
anti_join(x, y, by = "ID")
apply()
, lapply()
and
sapply()
in base RCompute row and column sums for a matrix:
## simulate a matrix
x <- cbind(x1 =1:8, x2 = c(4:1, 2:5))
dimnames(x)[[1]] <- letters[1:8]
apply(x, 2, mean)
## x1 x2
## 4.5 3.0
col.sums <- apply(x, 2, sum)
row.sums <- apply(x, 1, sum)
You can also apply other functions:
ma <- matrix(c(1:4, 1, 6:8), nrow = 2)
ma
## [,1] [,2] [,3] [,4]
## [1,] 1 3 1 7
## [2,] 2 4 6 8
apply(ma, 1, table) #--> a list of length 2
## [[1]]
##
## 1 3 7
## 2 1 1
##
## [[2]]
##
## 2 4 6 8
## 1 1 1 1
apply(ma, 1, stats::quantile) # 5 x n matrix with rownames
## [,1] [,2]
## 0% 1 2.0
## 25% 1 3.5
## 50% 2 5.0
## 75% 4 6.5
## 100% 7 8.0
Let’s use some data with context to help you better understand the functions.
# Get numerical variables
sdat <- sim.dat[, lapply(sim.dat, class) %in% c("integer", "numeric")]
## Try the following code with apply() function apply(sim.dat,2,class)
## What is the problem?
The data frame sdat
only includes numeric columns. Now
we can go head and use apply()
to get mean and standard
deviation for each column:
apply(sdat, MARGIN = 2, function(x) mean(na.omit(x)))
## age income store_exp online_exp store_trans online_trans
## 38.840 113543.065 1356.851 2120.181 5.350 13.546
## Q1 Q2 Q3 Q4 Q5 Q6
## 3.101 1.823 1.992 2.763 2.945 2.448
## Q7 Q8 Q9 Q10
## 3.434 2.396 3.085 2.320
Run the following code and compare the results:
lapply(sdat, function(x) sd(na.omit(x)))
sapply(sdat, function(x) sd(na.omit(x)))
sapply(sdat, function(x) sd(na.omit(x)), simplify = FALSE)
Take a baby subset of our exemplary clothes consumers data to illustrate:
sdat<-sim.dat[1:5,1:6]
sdat
## age gender income house store_exp online_exp
## 1 57 Female 120963.4 Yes 529.1344 303.5125
## 2 63 Female 122008.1 Yes 478.0058 109.5297
## 3 59 Male 114202.3 Yes 490.8107 279.2496
## 4 60 Male 113616.3 Yes 347.8090 141.6698
## 5 51 Male 124252.6 Yes 379.6259 112.2372
Change data from “wide” to “long”.
dat_long <- tidyr::gather(sdat, "Channel","Expense",
store_exp, online_exp)
dat_long
## age gender income house Channel Expense
## 1 57 Female 120963.4 Yes store_exp 529.1344
## 2 63 Female 122008.1 Yes store_exp 478.0058
## 3 59 Male 114202.3 Yes store_exp 490.8107
## 4 60 Male 113616.3 Yes store_exp 347.8090
## 5 51 Male 124252.6 Yes store_exp 379.6259
## 6 57 Female 120963.4 Yes online_exp 303.5125
## 7 63 Female 122008.1 Yes online_exp 109.5297
## 8 59 Male 114202.3 Yes online_exp 279.2496
## 9 60 Male 113616.3 Yes online_exp 141.6698
## 10 51 Male 124252.6 Yes online_exp 112.2372
You can run a regression to study the effect of purchasing channel as follows:
# Here we use all observations from sim.dat
# Don't show result here
msdat <- tidyr::gather(sim.dat[, 1:6], "Channel","Expense",
store_exp, online_exp)
fit <- lm(Expense ~ gender + house + income + Channel + age,
data = msdat)
summary(fit)
##
## Call:
## lm(formula = Expense ~ gender + house + income + Channel + age,
## data = msdat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4208 -821 -275 533 44353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.363e+01 1.560e+02 -0.536 0.591884
## genderMale 3.572e+02 1.028e+02 3.475 0.000524 ***
## houseYes -5.687e+01 1.138e+02 -0.500 0.617275
## income 2.834e-02 1.079e-03 26.268 < 2e-16 ***
## Channelstore_exp -8.296e+02 9.772e+01 -8.489 < 2e-16 ***
## age -2.793e+01 3.356e+00 -8.321 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1974 on 1626 degrees of freedom
## (368 observations deleted due to missingness)
## Multiple R-squared: 0.348, Adjusted R-squared: 0.346
## F-statistic: 173.5 on 5 and 1626 DF, p-value: < 2.2e-16
Change data from “long” to “wide”.
dat_wide = tidyr::spread(dat_long, Channel, Expense)
# you can check what dat_long is like
dat_wide %>%
dplyr::group_by(house, gender) %>%
dplyr::summarise(total_online_exp = sum(online_exp),
total_store_exp = sum(store_exp))
## # A tibble: 2 × 4
## # Groups: house [1]
## house gender total_online_exp total_store_exp
## <chr> <chr> <dbl> <dbl>
## 1 Yes Female 413. 1007.
## 2 Yes Male 533. 1218.
Another pair of functions that do opposite manipulations are
separate()
and unite()
.
sepdat<- dat_long %>%
separate(Channel, c("Source", "Type"))
sepdat
## age gender income house Source Type Expense
## 1 57 Female 120963.4 Yes store exp 529.1344
## 2 63 Female 122008.1 Yes store exp 478.0058
## 3 59 Male 114202.3 Yes store exp 490.8107
## 4 60 Male 113616.3 Yes store exp 347.8090
## 5 51 Male 124252.6 Yes store exp 379.6259
## 6 57 Female 120963.4 Yes online exp 303.5125
## 7 63 Female 122008.1 Yes online exp 109.5297
## 8 59 Male 114202.3 Yes online exp 279.2496
## 9 60 Male 113616.3 Yes online exp 141.6698
## 10 51 Male 124252.6 Yes online exp 112.2372
The unite()
function will do the opposite: combining two
columns. It is the generalization of paste()
to a data
frame.
sepdat %>%
unite("Channel", Source, Type, sep = "_")
## age gender income house Channel Expense
## 1 57 Female 120963.4 Yes store_exp 529.1344
## 2 63 Female 122008.1 Yes store_exp 478.0058
## 3 59 Male 114202.3 Yes store_exp 490.8107
## 4 60 Male 113616.3 Yes store_exp 347.8090
## 5 51 Male 124252.6 Yes store_exp 379.6259
## 6 57 Female 120963.4 Yes online_exp 303.5125
## 7 63 Female 122008.1 Yes online_exp 109.5297
## 8 59 Male 114202.3 Yes online_exp 279.2496
## 9 60 Male 113616.3 Yes online_exp 141.6698
## 10 51 Male 124252.6 Yes online_exp 112.2372