What is dplyr?

  • a new(er) package that provides a set of tools for manipulating datasets in R
  • part of the tidyverse: an opinionated collection of R packages which share philosophy, grammar, and data structure
  • specifically written to be fast!
  • has individual functions that correspond to the most common operations
  • makes it easier for you to figure out what to do with your data

The core ‘verbs’

  1. filter()
  2. arrange()
  3. select()
  4. summarize() and group_by()
  5. mutate()

Start with a built-in dataset

library(tidyverse)
data(starwars)
class(starwars)
## [1] "tbl_df"     "tbl"        "data.frame"

What is a tibble?

  • “a modern take on data frames”
  • said to keep the great aspects of data frames and drops the frustrating ones (i.e. changing variable names, changing an input type)
glimpse(starwars) #more effective than str() in this case   
## Rows: 87
## Columns: 14
## $ name       <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "Leia Or…
## $ height     <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, 180, 2…
## $ mass       <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.0, 77.…
## $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "brown", N…
## $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "light", "…
## $ eye_color  <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "blue",…
## $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, 57.0, …
## $ sex        <chr> "male", "none", "none", "male", "female", "male", "female",…
## $ gender     <chr> "masculine", "masculine", "masculine", "masculine", "femini…
## $ homeworld  <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaan", "T…
## $ species    <chr> "Human", "Droid", "Droid", "Human", "Human", "Human", "Huma…
## $ films      <list> <"The Empire Strikes Back", "Revenge of the Sith", "Return…
## $ vehicles   <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <>, "Imp…
## $ starships  <list> <"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanced x1",…
head(starwars) #looks a little different since it's a tibble (mentions dimensions and also states variable type)
## # A tibble: 6 × 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
## 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
## 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
## 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
## 5 Leia Org…    150    49 brown      light      brown           19   fema… femin…
## 6 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>

Cleaning up our data

# complete.cases is not part of dplyr, but it is very useful!
#dataClean <- starwars[complete.cases(starwars),] # removes all rows with NA values but doesn't work with lists

# to get complete cases for just certain columns 
starwarsClean <-starwars[complete.cases(starwars[,1:10]),]


# Check for NAs
is.na(starwarsClean[1,1]) #useful for only a few observations since it returns a list of True/False
##       name
## [1,] FALSE
anyNA(starwarsClean)
## [1] FALSE
anyNA(starwars[,1:10]) # compared to our original dataset
## [1] TRUE
# What does our data look like now?
glimpse(starwarsClean) # the list columns are still there but no NAs in columns 1-10
## Rows: 29
## Columns: 14
## $ name       <chr> "Luke Skywalker", "Darth Vader", "Leia Organa", "Owen Lars"…
## $ height     <int> 172, 202, 150, 178, 165, 183, 182, 188, 228, 180, 170, 170,…
## $ mass       <dbl> 77.0, 136.0, 49.0, 120.0, 75.0, 84.0, 77.0, 84.0, 112.0, 80…
## $ hair_color <chr> "blond", "none", "brown", "brown, grey", "brown", "black", …
## $ skin_color <chr> "fair", "white", "light", "light", "light", "light", "fair"…
## $ eye_color  <chr> "blue", "yellow", "brown", "blue", "blue", "brown", "blue-g…
## $ birth_year <dbl> 19.0, 41.9, 19.0, 52.0, 47.0, 24.0, 57.0, 41.9, 200.0, 29.0…
## $ sex        <chr> "male", "male", "female", "male", "female", "male", "male",…
## $ gender     <chr> "masculine", "masculine", "feminine", "masculine", "feminin…
## $ homeworld  <chr> "Tatooine", "Tatooine", "Alderaan", "Tatooine", "Tatooine",…
## $ species    <chr> "Human", "Human", "Human", "Human", "Human", "Human", "Huma…
## $ films      <list> <"The Empire Strikes Back", "Revenge of the Sith", "Return…
## $ vehicles   <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, "Imperial Sp…
## $ starships  <list> <"X-wing", "Imperial shuttle">, "TIE Advanced x1", <>, <>,…
head(starwarsClean)
## # A tibble: 6 × 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
## 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
## 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
## 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
## 5 Beru Whi…    165    75 brown      light      blue            47   fema… femin…
## 6 Biggs Da…    183    84 black      light      brown           24   male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>

filter(): Pick/subset observations by their values

### uses >, >=, <, <=,!=, == (not just one!) for comparisons   
### Logical operators: & | !

# filter automatically excludes NAs, have to ask for them specifically 

filter(starwarsClean, gender == "masculine" & height < 180) # you can also use commas in place of &   
## # A tibble: 8 × 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue              19 male  mascu…
## 2 Owen Lars    178   120 brown, gr… light      blue              52 male  mascu…
## 3 Wedge An…    170    77 brown      fair       hazel             21 male  mascu…
## 4 Palpatine    170    75 grey       pale       yellow            82 male  mascu…
## 5 Lando Ca…    177    79 black      dark       brown             31 male  mascu…
## 6 Lobot        175    79 none       light      blue              37 male  mascu…
## 7 Wicket S…     88    20 brown      brown      brown              8 male  mascu…
## 8 Darth Ma…    175    80 none       red        yellow            54 male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
filter(starwars, gender == "masculine", height <180, height > 100) # can add multiple conditions for the same variable   
## # A tibble: 17 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue              19 male  mascu…
##  2 C-3PO       167    75 <NA>       gold       yellow           112 none  mascu…
##  3 Owen La…    178   120 brown, gr… light      blue              52 male  mascu…
##  4 Greedo      173    74 <NA>       green      black             44 male  mascu…
##  5 Jabba D…    175  1358 <NA>       green-tan… orange           600 herm… mascu…
##  6 Wedge A…    170    77 brown      fair       hazel             21 male  mascu…
##  7 Palpati…    170    75 grey       pale       yellow            82 male  mascu…
##  8 Lando C…    177    79 black      dark       brown             31 male  mascu…
##  9 Lobot       175    79 none       light      blue              37 male  mascu…
## 10 Nien Nu…    160    68 none       grey       black             NA male  mascu…
## 11 Finis V…    170    NA blond      fair       blue              91 male  mascu…
## 12 Watto       137    NA black      blue, grey yellow            NA male  mascu…
## 13 Sebulba     112    40 none       grey, red  orange            NA male  mascu…
## 14 Darth M…    175    80 none       red        yellow            54 male  mascu…
## 15 Gasgano     122    NA none       white, bl… black             NA male  mascu…
## 16 Ben Qua…    163    65 none       grey, gre… orange            NA male  mascu…
## 17 Eeth Ko…    171    NA black      brown      brown             NA male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
filter(starwars, eye_color %in% c("blue", "brown")) # we can use %in% for a few conditions, similar to ==
## # A tibble: 40 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  3 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  4 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  5 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  6 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  7 Wilhuff…    180    NA auburn, g… fair       blue            64   male  mascu…
##  8 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
##  9 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## 10 Jek Ton…    180   110 brown      fair       blue            NA   male  mascu…
## # … with 30 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>

arrange(): Reorder rows

arrange(starwarsClean, by = height) #default is ascending order   
## # A tibble: 29 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Wicket …     88  20   brown      brown      brown              8 male  mascu…
##  2 Leia Or…    150  49   brown      light      brown             19 fema… femin…
##  3 Beru Wh…    165  75   brown      light      blue              47 fema… femin…
##  4 Padmé A…    165  45   brown      light      brown             46 fema… femin…
##  5 Barriss…    166  50   black      yellow     blue              40 fema… femin…
##  6 Wedge A…    170  77   brown      fair       hazel             21 male  mascu…
##  7 Palpati…    170  75   grey       pale       yellow            82 male  mascu…
##  8 Luminar…    170  56.2 black      yellow     blue              58 fema… femin…
##  9 Luke Sk…    172  77   blond      fair       blue              19 male  mascu…
## 10 Lobot       175  79   none       light      blue              37 male  mascu…
## # … with 19 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
arrange(starwarsClean, by = desc(height)) #we can use desc() to change that
## # A tibble: 29 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Ki-Adi-…    198    82 white      pale       yellow          92   male  mascu…
##  4 Jar Jar…    196    66 none       orange     orange          52   male  mascu…
##  5 Dooku       193    80 white      fair       brown          102   male  mascu…
##  6 Bossk       190   113 none       green      red             53   male  mascu…
##  7 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  8 Mace Wi…    188    84 none       dark       brown           72   male  mascu…
##  9 Plo Koon    188    80 none       orange     black           22   male  mascu…
## 10 Biggs D…    183    84 black      light      brown           24   male  mascu…
## # … with 19 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
arrange(starwarsClean, height, desc(mass)) # each additional column used to break ties with the preceding column
## # A tibble: 29 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Wicket …     88  20   brown      brown      brown              8 male  mascu…
##  2 Leia Or…    150  49   brown      light      brown             19 fema… femin…
##  3 Beru Wh…    165  75   brown      light      blue              47 fema… femin…
##  4 Padmé A…    165  45   brown      light      brown             46 fema… femin…
##  5 Barriss…    166  50   black      yellow     blue              40 fema… femin…
##  6 Wedge A…    170  77   brown      fair       hazel             21 male  mascu…
##  7 Palpati…    170  75   grey       pale       yellow            82 male  mascu…
##  8 Luminar…    170  56.2 black      yellow     blue              58 fema… femin…
##  9 Luke Sk…    172  77   blond      fair       blue              19 male  mascu…
## 10 Darth M…    175  80   none       red        yellow            54 male  mascu…
## # … with 19 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>
starwars1<- arrange(starwars, height) # missing values are at the end; note we haven't been assigning anything to a variable, just printing (until now) 
tail(starwars1)
## # A tibble: 6 × 14
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Arvel Cr…     NA    NA brown      fair       brown             NA male  mascu…
## 2 Finn          NA    NA black      dark       dark              NA male  mascu…
## 3 Rey           NA    NA brown      light      hazel             NA fema… femin…
## 4 Poe Dame…     NA    NA brown      light      brown             NA male  mascu…
## 5 BB8           NA    NA none       none       black             NA none  mascu…
## 6 Captain …     NA    NA unknown    unknown    unknown           NA <NA>  <NA>  
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>

select(): Choose variables by their names

## All of these do the same thing (subset)
starwarsClean[1:10,] # using base R
## # A tibble: 10 × 14
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
select(starwarsClean, 1:10) # you can use numbers to subset
## # A tibble: 29 × 10
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # … with 19 more rows, and 1 more variable: homeworld <chr>
select(starwarsClean, name:species) # you can use variables names too
## # A tibble: 29 × 11
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # … with 19 more rows, and 2 more variables: homeworld <chr>, species <chr>
select(starwarsClean, -(films:starships)) # you can subset everything except particular variables
## # A tibble: 29 × 11
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # … with 19 more rows, and 2 more variables: homeworld <chr>, species <chr>
## Rearrange columns
select(starwarsClean, name, gender, species, everything()) # using the everything() helper function is useful if you have a few variables to move to the beginning
## # A tibble: 29 × 14
##    name   gender species height  mass hair_color skin_color eye_color birth_year
##    <chr>  <chr>  <chr>    <int> <dbl> <chr>      <chr>      <chr>          <dbl>
##  1 Luke … mascu… Human      172    77 blond      fair       blue            19  
##  2 Darth… mascu… Human      202   136 none       white      yellow          41.9
##  3 Leia … femin… Human      150    49 brown      light      brown           19  
##  4 Owen … mascu… Human      178   120 brown, gr… light      blue            52  
##  5 Beru … femin… Human      165    75 brown      light      blue            47  
##  6 Biggs… mascu… Human      183    84 black      light      brown           24  
##  7 Obi-W… mascu… Human      182    77 auburn, w… fair       blue-gray       57  
##  8 Anaki… mascu… Human      188    84 blond      fair       blue            41.9
##  9 Chewb… mascu… Wookiee    228   112 brown      unknown    blue           200  
## 10 Han S… mascu… Human      180    80 brown      fair       brown           29  
## # … with 19 more rows, and 5 more variables: sex <chr>, homeworld <chr>,
## #   films <list>, vehicles <list>, starships <list>
select(starwarsClean, contains("color")) ## other helpers include: ends_with, starts_with, matches (reg ex), num_range
## # A tibble: 29 × 3
##    hair_color    skin_color eye_color
##    <chr>         <chr>      <chr>    
##  1 blond         fair       blue     
##  2 none          white      yellow   
##  3 brown         light      brown    
##  4 brown, grey   light      blue     
##  5 brown         light      blue     
##  6 black         light      brown    
##  7 auburn, white fair       blue-gray
##  8 blond         fair       blue     
##  9 brown         unknown    blue     
## 10 brown         fair       brown    
## # … with 19 more rows
## Renaming columns
select(starwars, haircolor = hair_color) # select can rename but only keeps specified variable; actual name of the column comes after stating new name
## # A tibble: 87 × 1
##    haircolor    
##    <chr>        
##  1 blond        
##  2 <NA>         
##  3 <NA>         
##  4 none         
##  5 brown        
##  6 brown, grey  
##  7 brown        
##  8 <NA>         
##  9 black        
## 10 auburn, white
## # … with 77 more rows
rename(starwarsClean,  haircolor = hair_color) # keeps all the variables
## # A tibble: 29 × 14
##    name      height  mass haircolor skin_color eye_color birth_year sex   gender
##    <chr>      <int> <dbl> <chr>     <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sky…    172    77 blond     fair       blue            19   male  mascu…
##  2 Darth Va…    202   136 none      white      yellow          41.9 male  mascu…
##  3 Leia Org…    150    49 brown     light      brown           19   fema… femin…
##  4 Owen Lars    178   120 brown, g… light      blue            52   male  mascu…
##  5 Beru Whi…    165    75 brown     light      blue            47   fema… femin…
##  6 Biggs Da…    183    84 black     light      brown           24   male  mascu…
##  7 Obi-Wan …    182    77 auburn, … fair       blue-gray       57   male  mascu…
##  8 Anakin S…    188    84 blond     fair       blue            41.9 male  mascu…
##  9 Chewbacca    228   112 brown     unknown    blue           200   male  mascu…
## 10 Han Solo     180    80 brown     fair       brown           29   male  mascu…
## # … with 19 more rows, and 5 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>

mutate(): Create new variables with functions of existing variables

# Let's create a new column that is a height divided by mass
mutate(starwarsClean, ratio = height/mass) # note we use arithmetic operators 
## # A tibble: 29 × 15
##    name     height  mass hair_color skin_color eye_color birth_year sex   gender
##    <chr>     <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
##  1 Luke Sk…    172    77 blond      fair       blue            19   male  mascu…
##  2 Darth V…    202   136 none       white      yellow          41.9 male  mascu…
##  3 Leia Or…    150    49 brown      light      brown           19   fema… femin…
##  4 Owen La…    178   120 brown, gr… light      blue            52   male  mascu…
##  5 Beru Wh…    165    75 brown      light      blue            47   fema… femin…
##  6 Biggs D…    183    84 black      light      brown           24   male  mascu…
##  7 Obi-Wan…    182    77 auburn, w… fair       blue-gray       57   male  mascu…
##  8 Anakin …    188    84 blond      fair       blue            41.9 male  mascu…
##  9 Chewbac…    228   112 brown      unknown    blue           200   male  mascu…
## 10 Han Solo    180    80 brown      fair       brown           29   male  mascu…
## # … with 19 more rows, and 6 more variables: homeworld <chr>, species <chr>,
## #   films <list>, vehicles <list>, starships <list>, ratio <dbl>
starwars_lbs <- mutate(starwarsClean, mass_lbs=mass*2.2) # now let's convert kg to lbs
head(starwars_lbs) # mass_lbs is now the last variable of the tibble
## # A tibble: 6 × 15
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
## 2 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
## 3 Leia Org…    150    49 brown      light      brown           19   fema… femin…
## 4 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
## 5 Beru Whi…    165    75 brown      light      blue            47   fema… femin…
## 6 Biggs Da…    183    84 black      light      brown           24   male  mascu…
## # … with 6 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>, mass_lbs <dbl>
select(starwars_lbs, 1:3, mass_lbs, everything()) # we can bring it to the beginning using select
## # A tibble: 29 × 15
##    name   height  mass mass_lbs hair_color skin_color eye_color birth_year sex  
##    <chr>   <int> <dbl>    <dbl> <chr>      <chr>      <chr>          <dbl> <chr>
##  1 Luke …    172    77     169. blond      fair       blue            19   male 
##  2 Darth…    202   136     299. none       white      yellow          41.9 male 
##  3 Leia …    150    49     108. brown      light      brown           19   fema…
##  4 Owen …    178   120     264  brown, gr… light      blue            52   male 
##  5 Beru …    165    75     165  brown      light      blue            47   fema…
##  6 Biggs…    183    84     185. black      light      brown           24   male 
##  7 Obi-W…    182    77     169. auburn, w… fair       blue-gray       57   male 
##  8 Anaki…    188    84     185. blond      fair       blue            41.9 male 
##  9 Chewb…    228   112     246. brown      unknown    blue           200   male 
## 10 Han S…    180    80     176  brown      fair       brown           29   male 
## # … with 19 more rows, and 6 more variables: gender <chr>, homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
# If we only wanted the new variable
transmute(starwarsClean, mass_lbs=mass*2.2) # just keep the new variable
## # A tibble: 29 × 1
##    mass_lbs
##       <dbl>
##  1     169.
##  2     299.
##  3     108.
##  4     264 
##  5     165 
##  6     185.
##  7     169.
##  8     185.
##  9     246.
## 10     176 
## # … with 19 more rows
transmute(starwarsClean, mass, mass_lbs=mass*2.2) # you can mention variables you want to keep in the new dataset
## # A tibble: 29 × 2
##     mass mass_lbs
##    <dbl>    <dbl>
##  1    77     169.
##  2   136     299.
##  3    49     108.
##  4   120     264 
##  5    75     165 
##  6    84     185.
##  7    77     169.
##  8    84     185.
##  9   112     246.
## 10    80     176 
## # … with 19 more rows

summarize() and group_by(): Collapse many values down to a single summary

summarize(starwarsClean, meanHeight = mean(height)) # gives summary statistics for entire tibble
## # A tibble: 1 × 1
##   meanHeight
##        <dbl>
## 1       178.
# working with NAs
summarize(starwars, meanHeight = mean(height)) # does not calculate mean if NAs are present
## # A tibble: 1 × 1
##   meanHeight
##        <dbl>
## 1         NA
summarize(starwars, meanHeight = mean(height, na.rm=TRUE), TotalNumber = n()) # n() is a useful function in determining sample size
## # A tibble: 1 × 2
##   meanHeight TotalNumber
##        <dbl>       <int>
## 1       174.          87
summarize(starwarsClean, meanHeight = mean(height), number = n() ) #na.rm not necessary since we are working with a clean dataset
## # A tibble: 1 × 2
##   meanHeight number
##        <dbl>  <int>
## 1       178.     29
# use group_by for maximum usefulness
starwarsGenders<- group_by(starwars, gender)
head(starwarsGenders) # now we see the groups mentioned at the top of the tibble
## # A tibble: 6 × 14
## # Groups:   gender [2]
##   name      height  mass hair_color skin_color eye_color birth_year sex   gender
##   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
## 1 Luke Sky…    172    77 blond      fair       blue            19   male  mascu…
## 2 C-3PO        167    75 <NA>       gold       yellow         112   none  mascu…
## 3 R2-D2         96    32 <NA>       white, bl… red             33   none  mascu…
## 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
## 5 Leia Org…    150    49 brown      light      brown           19   fema… femin…
## 6 Owen Lars    178   120 brown, gr… light      blue            52   male  mascu…
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## #   vehicles <list>, starships <list>
summarize(starwarsGenders, meanHeight = mean(height, na.rm = TRUE), number = n() ) # now we can compare height and sample size between groups
## # A tibble: 3 × 3
##   gender    meanHeight number
##   <chr>          <dbl>  <int>
## 1 feminine        165.     17
## 2 masculine       177.     66
## 3 <NA>            181.      4

What is piping?

  • used to emphasize a sequence of actions
  • it lets you pass an intermediate result onto the next function (it takes the output of one statement and makes it the input of the next statement)
  • avoid when you need to manipulate more than one object at a time or there are meaningful intermediate objects
  • formatting: should always have a space before it and usually followed by a new line (usually automatic indent)
starwarsClean %>%
    group_by(gender) %>%
    summarize(meanHeight = mean(height, na.rm=T), number=n()) # so much cleaner! notice we lost a bunch of groups with the clean dataset
## # A tibble: 2 × 3
##   gender    meanHeight number
##   <chr>          <dbl>  <int>
## 1 feminine        166.      6
## 2 masculine       181.     23
## case_when() is useful for multiple ifelse statements


starwarsClean %>%
  mutate(sp = case_when(species == "Human" ~ "Human", TRUE ~ "Non-Human")) %>%
  select(name, sp, everything())
## # A tibble: 29 × 15
##    name      sp    height  mass hair_color skin_color eye_color birth_year sex  
##    <chr>     <chr>  <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr>
##  1 Luke Sky… Human    172    77 blond      fair       blue            19   male 
##  2 Darth Va… Human    202   136 none       white      yellow          41.9 male 
##  3 Leia Org… Human    150    49 brown      light      brown           19   fema…
##  4 Owen Lars Human    178   120 brown, gr… light      blue            52   male 
##  5 Beru Whi… Human    165    75 brown      light      blue            47   fema…
##  6 Biggs Da… Human    183    84 black      light      brown           24   male 
##  7 Obi-Wan … Human    182    77 auburn, w… fair       blue-gray       57   male 
##  8 Anakin S… Human    188    84 blond      fair       blue            41.9 male 
##  9 Chewbacca Non-…    228   112 brown      unknown    blue           200   male 
## 10 Han Solo  Human    180    80 brown      fair       brown           29   male 
## # … with 19 more rows, and 6 more variables: gender <chr>, homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
starwarsClean %>%
  group_by(films) %>%
  mutate(
    sp = case_when(species == "Human" ~ "Human", TRUE ~ "Non-Human"),
    status = case_when(str_detect(films, "A New Hope") ~ "OG", TRUE ~ "Later")) %>%
  select(name,sp, status, everything()) %>%
  arrange(status) %>%
  {starwarsOGstatus <<- .}
## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing

## Warning in stri_detect_regex(string, pattern, negate = negate, opts_regex =
## opts(pattern)): argument is not an atomic vector; coercing
tail(starwarsOGstatus)
## # A tibble: 6 × 16
## # Groups:   films [6]
##   name      sp    status height  mass hair_color skin_color eye_color birth_year
##   <chr>     <chr> <chr>   <int> <dbl> <chr>      <chr>      <chr>          <dbl>
## 1 Beru Whi… Human OG        165    75 brown      light      blue              47
## 2 Biggs Da… Human OG        183    84 black      light      brown             24
## 3 Obi-Wan … Human OG        182    77 auburn, w… fair       blue-gray         57
## 4 Chewbacca Non-… OG        228   112 brown      unknown    blue             200
## 5 Han Solo  Human OG        180    80 brown      fair       brown             29
## 6 Wedge An… Human OG        170    77 brown      fair       hazel             21
## # … with 7 more variables: sex <chr>, gender <chr>, homeworld <chr>,
## #   species <chr>, films <list>, vehicles <list>, starships <list>
## long to wide format and vice-versa
glimpse(starwarsClean)
## Rows: 29
## Columns: 14
## $ name       <chr> "Luke Skywalker", "Darth Vader", "Leia Organa", "Owen Lars"…
## $ height     <int> 172, 202, 150, 178, 165, 183, 182, 188, 228, 180, 170, 170,…
## $ mass       <dbl> 77.0, 136.0, 49.0, 120.0, 75.0, 84.0, 77.0, 84.0, 112.0, 80…
## $ hair_color <chr> "blond", "none", "brown", "brown, grey", "brown", "black", …
## $ skin_color <chr> "fair", "white", "light", "light", "light", "light", "fair"…
## $ eye_color  <chr> "blue", "yellow", "brown", "blue", "blue", "brown", "blue-g…
## $ birth_year <dbl> 19.0, 41.9, 19.0, 52.0, 47.0, 24.0, 57.0, 41.9, 200.0, 29.0…
## $ sex        <chr> "male", "male", "female", "male", "female", "male", "male",…
## $ gender     <chr> "masculine", "masculine", "feminine", "masculine", "feminin…
## $ homeworld  <chr> "Tatooine", "Tatooine", "Alderaan", "Tatooine", "Tatooine",…
## $ species    <chr> "Human", "Human", "Human", "Human", "Human", "Human", "Huma…
## $ films      <list> <"The Empire Strikes Back", "Revenge of the Sith", "Return…
## $ vehicles   <list> <"Snowspeeder", "Imperial Speeder Bike">, <>, "Imperial Sp…
## $ starships  <list> <"X-wing", "Imperial shuttle">, "TIE Advanced x1", <>, <>,…
# make data set wider
wideSW <- starwarsClean %>% 
  select(name,sex, height) %>%
  pivot_wider(names_from = sex, values_from = height, values_fill = NA)
wideSW
## # A tibble: 29 × 3
##    name                male female
##    <chr>              <int>  <int>
##  1 Luke Skywalker       172     NA
##  2 Darth Vader          202     NA
##  3 Leia Organa           NA    150
##  4 Owen Lars            178     NA
##  5 Beru Whitesun lars    NA    165
##  6 Biggs Darklighter    183     NA
##  7 Obi-Wan Kenobi       182     NA
##  8 Anakin Skywalker     188     NA
##  9 Chewbacca            228     NA
## 10 Han Solo             180     NA
## # … with 19 more rows
starwars %>%
  select(name, homeworld) %>%
  group_by(homeworld) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = homeworld, values_from = name) %>%
  select(-rn)
## # A tibble: 11 × 49
##    Tatooine    Naboo Alderaan Stewjon Eriadu Kashyyyk Corellia Rodia `Nal Hutta`
##    <chr>       <chr> <chr>    <chr>   <chr>  <chr>    <chr>    <chr> <chr>      
##  1 Luke Skywa… R2-D2 Leia Or… Obi-Wa… Wilhu… Chewbac… Han Solo Gree… Jabba Desi…
##  2 C-3PO       Palp… Bail Pr… <NA>    <NA>   Tarfful  Wedge A… <NA>  <NA>       
##  3 Darth Vader Jar … Raymus … <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  4 Owen Lars   Roos… <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  5 Beru White… Rugo… <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  6 R5-D4       Ric … <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  7 Biggs Dark… Quar… <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  8 Anakin Sky… Greg… <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
##  9 Shmi Skywa… Cordé <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
## 10 Cliegg Lars Dormé <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
## 11 <NA>        Padm… <NA>     <NA>    <NA>   <NA>     <NA>     <NA>  <NA>       
## # … with 40 more variables: `Bestine IV` <chr>, `NA` <chr>, Kamino <chr>,
## #   Trandosha <chr>, Socorro <chr>, Bespin <chr>, `Mon Cala` <chr>,
## #   Chandrila <chr>, Endor <chr>, Sullust <chr>, `Cato Neimoidia` <chr>,
## #   Coruscant <chr>, Toydaria <chr>, Malastare <chr>, Dathomir <chr>,
## #   Ryloth <chr>, Vulpter <chr>, Troiken <chr>, Tund <chr>, `Haruun Kal` <chr>,
## #   Cerea <chr>, `Glee Anselm` <chr>, Iridonia <chr>, Iktotch <chr>,
## #   Quermia <chr>, Dorin <chr>, Champala <chr>, Geonosis <chr>, Mirial <chr>, …
## make data set longer
glimpse(wideSW)
## Rows: 29
## Columns: 3
## $ name   <chr> "Luke Skywalker", "Darth Vader", "Leia Organa", "Owen Lars", "B…
## $ male   <int> 172, 202, NA, 178, NA, 183, 182, 188, 228, 180, 170, 170, 183, …
## $ female <int> NA, NA, 150, NA, 165, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
wideSW %>%
  pivot_longer(cols= male:female,
               names_to = "sex",
               values_to= "height",
               values_drop_na = T)
## # A tibble: 29 × 3
##    name               sex    height
##    <chr>              <chr>   <int>
##  1 Luke Skywalker     male      172
##  2 Darth Vader        male      202
##  3 Leia Organa        female    150
##  4 Owen Lars          male      178
##  5 Beru Whitesun lars female    165
##  6 Biggs Darklighter  male      183
##  7 Obi-Wan Kenobi     male      182
##  8 Anakin Skywalker   male      188
##  9 Chewbacca          male      228
## 10 Han Solo           male      180
## # … with 19 more rows