dplyr

dplyr is a powerful R-package to transform and summarize tabular data with rows and columns.
dplyr is faster than plyr

install and load packages

#install.packages("dplyr")
suppressPackageStartupMessages(library(dplyr))

dim(ChickWeight) #Use the default ChickWeight dataset as example
## [1] 578   4
head(ChickWeight)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
?ChickWeight  #disply the explanation of ChickWeight dataset

count(): tally observations based on a group

It is slightly similar to the table function in the base package.

count(ChickWeight, Diet)
## # A tibble: 4 x 2
##     Diet     n
##   <fctr> <int>
## 1      1   220
## 2      2   120
## 3      3   120
## 4      4   118

summarise(): summarise values

create summary statistics for a given column or multiple columns in the data frame. Like basic function aggregate() It is very powerful when used in conjunction with the other functions in the dplyr package.

There are many summary statistics you could consider such as mean, sd(), min(), max(), median(), sum(), n() (returns the length of vector), first() (returns first value in vector), last() (returns last value in vector) and n_distinct() (number of distinct values in vector).

summarise(ChickWeight, mean(weight, na.rm = T)) #conbine with na.rm = TRUE, remove all NA values while calculating
##   mean(weight, na.rm = T)
## 1                121.8183
#Calculate multiple statistics
summarise(ChickWeight, mean(Time, na.rm = T), min(weight))
##   mean(Time, na.rm = T) min(weight)
## 1              10.71799          35

group_by(): group data by one or more variables

group_by is related to concept of “split-apply-combine”. We literally want to split the data frame by some variable (e.g. taxonomic order), apply a function to the individual data frames and then combine the output. Usually used with summarise() function

G1 <- group_by(ChickWeight, Diet) #group by Diet
head(G1)
## # A tibble: 6 x 4
## # Groups:   Diet [1]
##   weight  Time Chick   Diet
##    <dbl> <dbl> <ord> <fctr>
## 1     42     0     1      1
## 2     51     2     1      1
## 3     59     4     1      1
## 4     64     6     1      1
## 5     76     8     1      1
## 6     93    10     1      1
summarise(group_by(ChickWeight, Diet), mean(Time, na.rm = T), mean(weight), sum(weight), total=n())
## # A tibble: 4 x 5
##     Diet `mean(Time, na.rm = T)` `mean(weight)` `sum(weight)` total
##   <fctr>                   <dbl>          <dbl>         <dbl> <int>
## 1      1                10.48182       102.6455         22582   220
## 2      2                10.91667       122.6167         14714   120
## 3      3                10.91667       142.9500         17154   120
## 4      4                10.75424       135.2627         15961   118

sample(): select random rows

sample_n(ChickWeight, size = 6) #randomly selects n rows from the dataset
##     weight Time Chick Diet
## 439     61    4    39    3
## 374    146   18    33    3
## 371    137   12    33    3
## 234     55    2    22    2
## 48     157   21     4    1
## 368     77    6    33    3
sample_frac(ChickWeight, size = 0.01) #randomly selects 1% of the rows from the dataset.
##     weight Time Chick Diet
## 458    262   18    40    3
## 198     48    2    19    1
## 463     66    4    41    4
## 200     62    6    19    1
## 336    122   14    30    2
## 496    200   21    43    4

arrange(): re-order or arrange rows by variables

To arrange (or re-order) rows by a particular column such as the taxonomic order. Similar to the basic order() function.

A1 <- arrange(ChickWeight, weight, Diet)
A2 <- arrange(ChickWeight, desc(weight), Diet) #show decreasing order with desc() function

filter(): select rows

The filter() function will return all the rows that satisfy given conditions. Like basic function subset()
Boolean operators: ==, >, >=, <, <=, !=, %in%
&, |, !, xor()
is.na()
between(), near()

#For numeric conditions
F1 <- filter(ChickWeight, weight > 50, Time < 3)
F1 <- filter(ChickWeight, weight > 50&Time < 3) #The conditions can be seperated by "&" or ","

#For factor or character conditions
F2 <- filter(ChickWeight, Chick == "1") #For factor or characters, "==" should be used
F3 <- filter(ChickWeight, Chick %in% c("1", "3")) #To filter with multiple variables, use "c()" with "%in%"
F4 <- filter(ChickWeight, Chick == "2", Diet == "1")

# Combine numeric, factor and character conditions
F5 <- filter(ChickWeight, weight > 50, Chick %in% c("1", "3"))

select(): select columns

Select and exclude columns based on specific criteria:
starts_with() = Select columns that start with a character string
ends_with() = Select columns that end with a character string
contains() = Select columns that contain a character string
matches() = Select columns that match a regular expression
one_of() = Select columns names that are from a group of names
everything() = select all columns

S1 <- select(ChickWeight, Time, Diet) #select the second and fourth columns by names
S2 <- select(ChickWeight, -Time, -Diet) #exclude columns by using "-" (subtraction/negative indexing) operator
S3 <- select(ChickWeight, weight:Diet) #select a range of columns by name, use the ":" (colon) operator
S4 <- select(ChickWeight, starts_with("Ch")) #select all columns that start with the character string "Ch", use the function starts_with()
S5 <- select(ChickWeight, ends_with("t")) # #select all columns that end with the character string "t", use the function ends_with()
S6 <- select(ChickWeight, contains("ic")) # #select all columns that contain the character string "ic", use the function contains()
S7 <- select(ChickWeight, matches(".e.")) # #select all columns that match with the character string ".e.", use the function matches()
var <- c("Diet", "Time")
S8 <- select(ChickWeight, one_of(var)) # #select all columns that match with the character vector, use the function one_of()
S9 <- select(ChickWeight, everything()) # #select all columns, use the function everything()

mutate(): create new columns

The mutate() function will add new columns to the data frame (separated by commas). Like basic function transform()

M1 <- mutate(ChickWeight, Diff = weight - mean(weight), Ratio = weight/mean(weight))

join(): join two datasets together

inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)   
left_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)     
right_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)   
full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)     
semi_join(x, y, by = NULL, copy = FALSE, ...)       
anti_join(x, y, by = NULL, copy = FALSE, ...)   

by: a character vector of variables to join by. Natural join using all variables with common names across the two tables by default.  
	To join by different variables on x and y use a named vector. For example, by = c("a" = "b") will match x.a to y.b.
x <- data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio", 3)))
y <- data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))
x; y
##   CustomerId Product
## 1          1 Toaster
## 2          2 Toaster
## 3          3 Toaster
## 4          4   Radio
## 5          5   Radio
## 6          6   Radio
##   CustomerId   State
## 1          2 Alabama
## 2          4 Alabama
## 3          6    Ohio

inner_join(): Include only rows in both x and y that have a matching value

inner_join(x, y)
##   CustomerId Product   State
## 1          2 Toaster Alabama
## 2          4   Radio Alabama
## 3          6   Radio    Ohio

left_join(): Include all of x, and matching rows of y

left_join(x, y) #the same as right_join(y, x), except column position
##   CustomerId Product   State
## 1          1 Toaster    <NA>
## 2          2 Toaster Alabama
## 3          3 Toaster    <NA>
## 4          4   Radio Alabama
## 5          5   Radio    <NA>
## 6          6   Radio    Ohio

right_join(): Include all of y, and matching rows of x

right_join(x, y) #the same as left_join(y, x), except column position
##   CustomerId Product   State
## 1          2 Toaster Alabama
## 2          4   Radio Alabama
## 3          6   Radio    Ohio

semi_join(): Include rows of x that match y but only keep the columns from x

anti_join(): Include rows of x that not match y. Opposite of semi_join

semi_join(x, y)
##   CustomerId Product
## 1          2 Toaster
## 2          4   Radio
## 3          6   Radio
anti_join(x, y)
##   CustomerId Product
## 1          1 Toaster
## 2          3 Toaster
## 3          5   Radio

full_join(): return all rows and all columns from both x and y. NA was used for not matching value.

full_join(x, y)
##   CustomerId Product   State
## 1          1 Toaster    <NA>
## 2          2 Toaster Alabama
## 3          3 Toaster    <NA>
## 4          4   Radio Alabama
## 5          5   Radio    <NA>
## 6          6   Radio    Ohio

pipe operator(%>%): chain code together

This operator allows you to pipe the output from one function to the input of another function. Instead of nesting functions (reading from the inside to the outside), the idea of of piping is to read the functions from left to right.

P1 <- ChickWeight %>%
  filter(Chick != c(15, 18, 20, 27)) %>%
  group_by(Diet) %>%
  summarise(mean(Time, na.rm = T))
P2 <- ChickWeight %>%
  filter(weight >= 50, Diet != 1) %>%
  mutate(DiffTime = Time - mean(Time, na.rm =T)) %>%
  select(Diet:DiffTime) %>%
  head

References

dplyr tutorial
Data Processing with dplyr & tidyr

CHENYUAN

CHENYUAN

CHENYUAN
Pursuing the dream and the best future

CHENYUAN Blog Homepage

因为不想遗忘! 在这个信息大爆炸的年代,最重要的是对知识的消化-吸收-重铸。每天学了很多东西,但是理解的多少,以及能够运用多少是日后成功的关键。作为一个PhD,大脑中充斥了太多的东西,同时随着年龄的增长,难免会忘掉很多事情。所以只是为了在众多教程中写一个自己用到的,与自己...… Continue reading