dplyr
dplyr is a powerful R-package to transform and summarize tabular data with rows and columns.
dplyr is faster than plyr
install and load packages
#install.packages("dplyr")
suppressPackageStartupMessages(library(dplyr))
dim(ChickWeight) #Use the default ChickWeight dataset as example
## [1] 578 4
head(ChickWeight)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
?ChickWeight #disply the explanation of ChickWeight dataset
count(): tally observations based on a group
It is slightly similar to the table function in the base package.
count(ChickWeight, Diet)
## # A tibble: 4 x 2
## Diet n
## <fctr> <int>
## 1 1 220
## 2 2 120
## 3 3 120
## 4 4 118
summarise(): summarise values
create summary statistics for a given column or multiple columns in the data frame. Like basic function aggregate()
It is very powerful when used in conjunction with the other functions in the dplyr package.
There are many summary statistics you could consider such as mean
, sd()
, min()
, max()
, median()
, sum()
, n()
(returns the length of vector), first()
(returns first value in vector), last()
(returns last value in vector) and n_distinct()
(number of distinct values in vector).
summarise(ChickWeight, mean(weight, na.rm = T)) #conbine with na.rm = TRUE, remove all NA values while calculating
## mean(weight, na.rm = T)
## 1 121.8183
#Calculate multiple statistics
summarise(ChickWeight, mean(Time, na.rm = T), min(weight))
## mean(Time, na.rm = T) min(weight)
## 1 10.71799 35
group_by(): group data by one or more variables
group_by
is related to concept of “split-apply-combine”. We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output. Usually used with summarise()
function
G1 <- group_by(ChickWeight, Diet) #group by Diet
head(G1)
## # A tibble: 6 x 4
## # Groups: Diet [1]
## weight Time Chick Diet
## <dbl> <dbl> <ord> <fctr>
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
summarise(group_by(ChickWeight, Diet), mean(Time, na.rm = T), mean(weight), sum(weight), total=n())
## # A tibble: 4 x 5
## Diet `mean(Time, na.rm = T)` `mean(weight)` `sum(weight)` total
## <fctr> <dbl> <dbl> <dbl> <int>
## 1 1 10.48182 102.6455 22582 220
## 2 2 10.91667 122.6167 14714 120
## 3 3 10.91667 142.9500 17154 120
## 4 4 10.75424 135.2627 15961 118
sample(): select random rows
sample_n(ChickWeight, size = 6) #randomly selects n rows from the dataset
## weight Time Chick Diet
## 439 61 4 39 3
## 374 146 18 33 3
## 371 137 12 33 3
## 234 55 2 22 2
## 48 157 21 4 1
## 368 77 6 33 3
sample_frac(ChickWeight, size = 0.01) #randomly selects 1% of the rows from the dataset.
## weight Time Chick Diet
## 458 262 18 40 3
## 198 48 2 19 1
## 463 66 4 41 4
## 200 62 6 19 1
## 336 122 14 30 2
## 496 200 21 43 4
arrange(): re-order or arrange rows by variables
To arrange (or re-order) rows by a particular column such as the taxonomic order. Similar to the basic order()
function.
A1 <- arrange(ChickWeight, weight, Diet)
A2 <- arrange(ChickWeight, desc(weight), Diet) #show decreasing order with desc() function
filter(): select rows
The filter()
function will return all the rows that satisfy given conditions. Like basic function subset()
Boolean operators:
==, >, >=, <, <=, !=, %in%
&, |, !, xor()
is.na()
between(), near()
#For numeric conditions
F1 <- filter(ChickWeight, weight > 50, Time < 3)
F1 <- filter(ChickWeight, weight > 50&Time < 3) #The conditions can be seperated by "&" or ","
#For factor or character conditions
F2 <- filter(ChickWeight, Chick == "1") #For factor or characters, "==" should be used
F3 <- filter(ChickWeight, Chick %in% c("1", "3")) #To filter with multiple variables, use "c()" with "%in%"
F4 <- filter(ChickWeight, Chick == "2", Diet == "1")
# Combine numeric, factor and character conditions
F5 <- filter(ChickWeight, weight > 50, Chick %in% c("1", "3"))
select(): select columns
Select and exclude columns based on specific criteria:
starts_with()
= Select columns that start with a character string
ends_with()
= Select columns that end with a character string
contains()
= Select columns that contain a character string
matches()
= Select columns that match a regular expression
one_of()
= Select columns names that are from a group of names
everything()
= select all columns
S1 <- select(ChickWeight, Time, Diet) #select the second and fourth columns by names
S2 <- select(ChickWeight, -Time, -Diet) #exclude columns by using "-" (subtraction/negative indexing) operator
S3 <- select(ChickWeight, weight:Diet) #select a range of columns by name, use the ":" (colon) operator
S4 <- select(ChickWeight, starts_with("Ch")) #select all columns that start with the character string "Ch", use the function starts_with()
S5 <- select(ChickWeight, ends_with("t")) # #select all columns that end with the character string "t", use the function ends_with()
S6 <- select(ChickWeight, contains("ic")) # #select all columns that contain the character string "ic", use the function contains()
S7 <- select(ChickWeight, matches(".e.")) # #select all columns that match with the character string ".e.", use the function matches()
var <- c("Diet", "Time")
S8 <- select(ChickWeight, one_of(var)) # #select all columns that match with the character vector, use the function one_of()
S9 <- select(ChickWeight, everything()) # #select all columns, use the function everything()
mutate(): create new columns
The mutate()
function will add new columns to the data frame (separated by commas). Like basic function transform()
M1 <- mutate(ChickWeight, Diff = weight - mean(weight), Ratio = weight/mean(weight))
join(): join two datasets together
inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
left_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
right_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)
semi_join(x, y, by = NULL, copy = FALSE, ...)
anti_join(x, y, by = NULL, copy = FALSE, ...)
by: a character vector of variables to join by. Natural join using all variables with common names across the two tables by default.
To join by different variables on x and y use a named vector. For example, by = c("a" = "b") will match x.a to y.b.
x <- data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio", 3)))
y <- data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))
x; y
## CustomerId Product
## 1 1 Toaster
## 2 2 Toaster
## 3 3 Toaster
## 4 4 Radio
## 5 5 Radio
## 6 6 Radio
## CustomerId State
## 1 2 Alabama
## 2 4 Alabama
## 3 6 Ohio
inner_join()
: Include only rows in both x and y that have a matching value
inner_join(x, y)
## CustomerId Product State
## 1 2 Toaster Alabama
## 2 4 Radio Alabama
## 3 6 Radio Ohio
left_join()
: Include all of x, and matching rows of y
left_join(x, y) #the same as right_join(y, x), except column position
## CustomerId Product State
## 1 1 Toaster <NA>
## 2 2 Toaster Alabama
## 3 3 Toaster <NA>
## 4 4 Radio Alabama
## 5 5 Radio <NA>
## 6 6 Radio Ohio
right_join()
: Include all of y, and matching rows of x
right_join(x, y) #the same as left_join(y, x), except column position
## CustomerId Product State
## 1 2 Toaster Alabama
## 2 4 Radio Alabama
## 3 6 Radio Ohio
semi_join()
: Include rows of x that match y but only keep the columns from x
anti_join()
: Include rows of x that not match y. Opposite of semi_join
semi_join(x, y)
## CustomerId Product
## 1 2 Toaster
## 2 4 Radio
## 3 6 Radio
anti_join(x, y)
## CustomerId Product
## 1 1 Toaster
## 2 3 Toaster
## 3 5 Radio
full_join()
: return all rows and all columns from both x and y. NA was used for not matching value.
full_join(x, y)
## CustomerId Product State
## 1 1 Toaster <NA>
## 2 2 Toaster Alabama
## 3 3 Toaster <NA>
## 4 4 Radio Alabama
## 5 5 Radio <NA>
## 6 6 Radio Ohio
pipe operator(%>%): chain code together
This operator allows you to pipe the output from one function to the input of another function. Instead of nesting functions (reading from the inside to the outside), the idea of of piping is to read the functions from left to right.
P1 <- ChickWeight %>%
filter(Chick != c(15, 18, 20, 27)) %>%
group_by(Diet) %>%
summarise(mean(Time, na.rm = T))
P2 <- ChickWeight %>%
filter(weight >= 50, Diet != 1) %>%
mutate(DiffTime = Time - mean(Time, na.rm =T)) %>%
select(Diet:DiffTime) %>%
head