Data Wrangling1 / 21

`dplyr`

Funciones para las tareas más comunes:

filter
select
arrange
mutate
summarise
group_by

2 / 21

Datos

library(nycflights13)
suppressPackageStartupMessages(library(dplyr))
flights

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

3 / 21

Filter4 / 21

Filter

Seleccionar las filas de enero

dplyr::filter(flights, month == 1)

## # A tibble: 27,004 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 26,994 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

4 / 21

Seleccionar las filas de el primer dia de enero

filter(flights, month == 1, day == 1)

## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

5 / 21

Seleccionar las filas de el primer dia de enero

filter(flights, month == 1, day == 1)

## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Si paso varias condiciones, se combinan como AND

5 / 21

Condiciones más complejas

Puedo usar operadores lógicos:

filter(flights, month == 1 & day == 1)

## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

6 / 21

Condiciones más complejas (2)

filter(flights, month == 1 | month == 2)

## # A tibble: 51,955 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 51,945 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# Es lo mismo que
filter(flights, month %in% c(1, 2))

## # A tibble: 51,955 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 51,945 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Cuidado! && es parecido a &, == no tiene nada que ver con =.

7 / 21

Ejercicios:

Encontrar los vuelos que:
- Tuvieron un atraso en la llegada de dos horas o más
- Volaron a Houston (IAH o HOU)
- Son de United, American, or Delta
- Salieron en invierno (Junio, Julio, Agosto)
- Salieron entre medianoche y las 6am

Investigar cómo funciona between. Usarla para alguna de las preguntas anteriores.

8 / 21

`select`

Selecciona columnas.

msleep <- read.csv("./data/msleep_ggplot2.csv")
head(msleep)

##                         name      genus  vore        order conservation
## 1                    Cheetah   Acinonyx carni    Carnivora           lc
## 2                 Owl monkey      Aotus  omni     Primates         <NA>
## 3            Mountain beaver Aplodontia herbi     Rodentia           nt
## 4 Greater short-tailed shrew    Blarina  omni Soricomorpha           lc
## 5                        Cow        Bos herbi Artiodactyla domesticated
## 6           Three-toed sloth   Bradypus herbi       Pilosa         <NA>
##   sleep_total sleep_rem sleep_cycle awake brainwt  bodywt
## 1        12.1        NA          NA  11.9      NA  50.000
## 2        17.0       1.8          NA   7.0 0.01550   0.480
## 3        14.4       2.4          NA   9.6      NA   1.350
## 4        14.9       2.3   0.1333333   9.1 0.00029   0.019
## 5         4.0       0.7   0.6666667  20.0 0.42300 600.000
## 6        14.4       2.2   0.7666667   9.6      NA   3.850

9 / 21

head(select(msleep, name, sleep_total))

##                         name sleep_total
## 1                    Cheetah        12.1
## 2                 Owl monkey        17.0
## 3            Mountain beaver        14.4
## 4 Greater short-tailed shrew        14.9
## 5                        Cow         4.0
## 6           Three-toed sloth        14.4

10 / 21

`select` (2)

head(select(msleep, -name))

##        genus  vore        order conservation sleep_total sleep_rem
## 1   Acinonyx carni    Carnivora           lc        12.1        NA
## 2      Aotus  omni     Primates         <NA>        17.0       1.8
## 3 Aplodontia herbi     Rodentia           nt        14.4       2.4
## 4    Blarina  omni Soricomorpha           lc        14.9       2.3
## 5        Bos herbi Artiodactyla domesticated         4.0       0.7
## 6   Bradypus herbi       Pilosa         <NA>        14.4       2.2
##   sleep_cycle awake brainwt  bodywt
## 1          NA  11.9      NA  50.000
## 2          NA   7.0 0.01550   0.480
## 3          NA   9.6      NA   1.350
## 4   0.1333333   9.1 0.00029   0.019
## 5   0.6666667  20.0 0.42300 600.000
## 6   0.7666667   9.6      NA   3.850

11 / 21

`select` (3)

head(select(msleep, starts_with("sl")))

##   sleep_total sleep_rem sleep_cycle
## 1        12.1        NA          NA
## 2        17.0       1.8          NA
## 3        14.4       2.4          NA
## 4        14.9       2.3   0.1333333
## 5         4.0       0.7   0.6666667
## 6        14.4       2.2   0.7666667

12 / 21

Otros helpers:ends_with
contains
matches
one_of
13 / 21

Ejercicios

Todas las formas posibles de seleccionar dep_time, dep_delay, arr_time, y arr_delay de flights.
Investigar que hace one_of(), y como lo usarían con un vector:

vars <- c("year", "month", "day", "dep_delay", "arr_delay")

14 / 21

Mutate

flights_sml <- select(flights, 
  year:day, 
  ends_with("delay"), 
  distance, 
  air_time
)

15 / 21

Mutate

mutate(flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9  370.
##  2  2013     1     1         4        20     1416      227   -16  374.
##  3  2013     1     1         2        33     1089      160   -31  408.
##  4  2013     1     1        -1       -18     1576      183    17  517.
##  5  2013     1     1        -6       -25      762      116    19  394.
##  6  2013     1     1        -4        12      719      150   -16  288.
##  7  2013     1     1        -5        19     1065      158   -24  404.
##  8  2013     1     1        -3       -14      229       53    11  259.
##  9  2013     1     1        -3        -8      944      140     5  405.
## 10  2013     1     1        -2         8      733      138   -10  319.
## # … with 336,766 more rows

16 / 21

Mutate (2)

Las funciones que usamos dentro de mutate deben estar vectorizadas: toman un vector como argumento y devuelven un vector del mismo tamaño como resultado:

mutate(flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)

## # A tibble: 336,776 x 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9  370.
##  2  2013     1     1         4        20     1416      227   -16  374.
##  3  2013     1     1         2        33     1089      160   -31  408.
##  4  2013     1     1        -1       -18     1576      183    17  517.
##  5  2013     1     1        -6       -25      762      116    19  394.
##  6  2013     1     1        -4        12      719      150   -16  288.
##  7  2013     1     1        -5        19     1065      158   -24  404.
##  8  2013     1     1        -3       -14      229       53    11  259.
##  9  2013     1     1        -3        -8      944      140     5  405.
## 10  2013     1     1        -2         8      733      138   -10  319.
## # … with 336,766 more rows

dep_delay y arr_delay son vectores
¿air_time * 60?

17 / 21

Funciones Útileslead()
lag() 
cumsum()
18 / 21

Funciones Útiles

lead()
lag()
cumsum()
Mutate & family

Mutate calcula columnas nuevas

library(dplyr)
msleep <- read.csv("./data/msleep_ggplot2.csv")
msleep %>%
  mutate(sleep_new=sleep_total + sleep_rem) %>%
  head()

##                         name      genus  vore        order conservation
## 1                    Cheetah   Acinonyx carni    Carnivora           lc
## 2                 Owl monkey      Aotus  omni     Primates         <NA>
## 3            Mountain beaver Aplodontia herbi     Rodentia           nt
## 4 Greater short-tailed shrew    Blarina  omni Soricomorpha           lc
## 5                        Cow        Bos herbi Artiodactyla domesticated
## 6           Three-toed sloth   Bradypus herbi       Pilosa         <NA>
##   sleep_total sleep_rem sleep_cycle awake brainwt  bodywt sleep_new
## 1        12.1        NA          NA  11.9      NA  50.000        NA
## 2        17.0       1.8          NA   7.0 0.01550   0.480      18.8
## 3        14.4       2.4          NA   9.6      NA   1.350      16.8
## 4        14.9       2.3   0.1333333   9.1 0.00029   0.019      17.2
## 5         4.0       0.7   0.6666667  20.0 0.42300 600.000       4.7
## 6        14.4       2.2   0.7666667   9.6      NA   3.850      16.6

Transmute:

Igual que mutate pero elimina las columnas no especificadas

msleep %>% 
  transmute(sleep_new=sleep_rem+sleep_total) %>% 
  head

##   sleep_new
## 1        NA
## 2      18.8
## 3      16.8
## 4      17.2
## 5       4.7
## 6      16.6

Mutate_if

Aplica funcione a todas las columnas que cumplen un criterio.

msleep %>% 
mutate_if(is.numeric, log2)%>% 
head()

##                         name      genus  vore        order conservation
## 1                    Cheetah   Acinonyx carni    Carnivora           lc
## 2                 Owl monkey      Aotus  omni     Primates         <NA>
## 3            Mountain beaver Aplodontia herbi     Rodentia           nt
## 4 Greater short-tailed shrew    Blarina  omni Soricomorpha           lc
## 5                        Cow        Bos herbi Artiodactyla domesticated
## 6           Three-toed sloth   Bradypus herbi       Pilosa         <NA>
##   sleep_total  sleep_rem sleep_cycle    awake    brainwt     bodywt
## 1    3.596935         NA          NA 3.572890         NA  5.6438562
## 2    4.087463  0.8479969          NA 2.807355  -6.011588 -1.0588937
## 3    3.847997  1.2630344          NA 3.263034         NA  0.4329594
## 4    3.897240  1.2016339  -2.9068906 3.185867 -11.751659 -5.7178568
## 5    2.000000 -0.5145732  -0.5849625 4.321928  -1.241270  9.2288187
## 6    3.847997  1.1375035  -0.3833286 3.263034         NA  1.9448584

18 / 21

Mutate_all:

Aplica funciones a todas las columnas

msleep %>% 
  select(sleep_total, sleep_rem) %>%
  mutate_all(funs(log(.),log2(.))) %>%  
  names()

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.

## [1] "sleep_total"      "sleep_rem"        "sleep_total_log" 
## [4] "sleep_rem_log"    "sleep_total_log2" "sleep_rem_log2"

19 / 21

Mutate_at:

Aplica funciones a columnas.

msleep %>%
mutate_at(vars(sleep_total),funs(log(.),log2(.))) %>%
  names()

##  [1] "name"         "genus"        "vore"         "order"       
##  [5] "conservation" "sleep_total"  "sleep_rem"    "sleep_cycle" 
##  [9] "awake"        "brainwt"      "bodywt"       "log"         
## [13] "log2"

20 / 21

Bibliografía

↑, ←, Pg Up, k	Go to previous slide
↓, →, Pg Dn, Space, j	Go to next slide
Home	Go to first slide
End	Go to last slide
Number + Return	Go to specific slide
b / m / f	Toggle blackout / mirrored / fullscreen mode
c	Clone slideshow
p	Toggle presenter mode
t	Restart the presentation timer
?, h	Toggle this help

Data Wrangling

dplyr

Datos

Filter

Filter

Seleccionar las filas de el primer dia de enero

Seleccionar las filas de el primer dia de enero

Condiciones más complejas

Condiciones más complejas (2)

Ejercicios:

select

select (2)

select (3)

Otros helpers:

Ejercicios

Mutate

Mutate

Mutate (2)

Funciones Útiles

Funciones Útiles

Mutate & family

Transmute:

Mutate_if

Mutate_all:

Mutate_at:

Bibliografía

dplyr

Help

`dplyr`

`select`

`select` (2)

`select` (3)

`dplyr`