dplyr
Funciones para las tareas más comunes:
filter
select
arrange
mutate
summarise
group_by
library(nycflights13)suppressPackageStartupMessages(library(dplyr))flights
## # A tibble: 336,776 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
Seleccionar las filas de enero
dplyr::filter(flights, month == 1)
## # A tibble: 27,004 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 26,994 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
filter(flights, month == 1, day == 1)
## # A tibble: 842 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
AND
Puedo usar operadores lógicos:
filter(flights, month == 1 & day == 1)
## # A tibble: 842 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 832 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
filter(flights, month == 1 | month == 2)
## # A tibble: 51,955 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 51,945 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
# Es lo mismo quefilter(flights, month %in% c(1, 2))
## # A tibble: 51,955 x 19## year month day dep_time sched_dep_time dep_delay arr_time## <int> <int> <int> <int> <int> <dbl> <int>## 1 2013 1 1 517 515 2 830## 2 2013 1 1 533 529 4 850## 3 2013 1 1 542 540 2 923## 4 2013 1 1 544 545 -1 1004## 5 2013 1 1 554 600 -6 812## 6 2013 1 1 554 558 -4 740## 7 2013 1 1 555 600 -5 913## 8 2013 1 1 557 600 -3 709## 9 2013 1 1 557 600 -3 838## 10 2013 1 1 558 600 -2 753## # … with 51,945 more rows, and 12 more variables: sched_arr_time <int>,## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,## # minute <dbl>, time_hour <dttm>
&&
es parecido a &
, ==
no tiene nada que ver con =
.Encontrar los vuelos que:
between
. Usarla para alguna de las preguntas anteriores.select
Selecciona columnas.
msleep <- read.csv("./data/msleep_ggplot2.csv")head(msleep)
## name genus vore order conservation## 1 Cheetah Acinonyx carni Carnivora lc## 2 Owl monkey Aotus omni Primates <NA>## 3 Mountain beaver Aplodontia herbi Rodentia nt## 4 Greater short-tailed shrew Blarina omni Soricomorpha lc## 5 Cow Bos herbi Artiodactyla domesticated## 6 Three-toed sloth Bradypus herbi Pilosa <NA>## sleep_total sleep_rem sleep_cycle awake brainwt bodywt## 1 12.1 NA NA 11.9 NA 50.000## 2 17.0 1.8 NA 7.0 0.01550 0.480## 3 14.4 2.4 NA 9.6 NA 1.350## 4 14.9 2.3 0.1333333 9.1 0.00029 0.019## 5 4.0 0.7 0.6666667 20.0 0.42300 600.000## 6 14.4 2.2 0.7666667 9.6 NA 3.850
head(select(msleep, name, sleep_total))
## name sleep_total## 1 Cheetah 12.1## 2 Owl monkey 17.0## 3 Mountain beaver 14.4## 4 Greater short-tailed shrew 14.9## 5 Cow 4.0## 6 Three-toed sloth 14.4
select
(2)head(select(msleep, -name))
## genus vore order conservation sleep_total sleep_rem## 1 Acinonyx carni Carnivora lc 12.1 NA## 2 Aotus omni Primates <NA> 17.0 1.8## 3 Aplodontia herbi Rodentia nt 14.4 2.4## 4 Blarina omni Soricomorpha lc 14.9 2.3## 5 Bos herbi Artiodactyla domesticated 4.0 0.7## 6 Bradypus herbi Pilosa <NA> 14.4 2.2## sleep_cycle awake brainwt bodywt## 1 NA 11.9 NA 50.000## 2 NA 7.0 0.01550 0.480## 3 NA 9.6 NA 1.350## 4 0.1333333 9.1 0.00029 0.019## 5 0.6666667 20.0 0.42300 600.000## 6 0.7666667 9.6 NA 3.850
select
(3)head(select(msleep, starts_with("sl")))
## sleep_total sleep_rem sleep_cycle## 1 12.1 NA NA## 2 17.0 1.8 NA## 3 14.4 2.4 NA## 4 14.9 2.3 0.1333333## 5 4.0 0.7 0.6666667## 6 14.4 2.2 0.7666667
ends_with
contains
matches
one_of
dep_time
, dep_delay
, arr_time
, y arr_delay
de flights
.vars <- c("year", "month", "day", "dep_delay", "arr_delay")
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time)
mutate(flights_sml, gain = dep_delay - arr_delay, speed = distance / air_time * 60)
## # A tibble: 336,776 x 9## year month day dep_delay arr_delay distance air_time gain speed## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>## 1 2013 1 1 2 11 1400 227 -9 370.## 2 2013 1 1 4 20 1416 227 -16 374.## 3 2013 1 1 2 33 1089 160 -31 408.## 4 2013 1 1 -1 -18 1576 183 17 517.## 5 2013 1 1 -6 -25 762 116 19 394.## 6 2013 1 1 -4 12 719 150 -16 288.## 7 2013 1 1 -5 19 1065 158 -24 404.## 8 2013 1 1 -3 -14 229 53 11 259.## 9 2013 1 1 -3 -8 944 140 5 405.## 10 2013 1 1 -2 8 733 138 -10 319.## # … with 336,766 more rows
Las funciones que usamos dentro de mutate
deben estar vectorizadas: toman un vector como argumento y devuelven un vector del mismo tamaño como resultado:
mutate(flights_sml, gain = dep_delay - arr_delay, speed = distance / air_time * 60)
## # A tibble: 336,776 x 9## year month day dep_delay arr_delay distance air_time gain speed## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>## 1 2013 1 1 2 11 1400 227 -9 370.## 2 2013 1 1 4 20 1416 227 -16 374.## 3 2013 1 1 2 33 1089 160 -31 408.## 4 2013 1 1 -1 -18 1576 183 17 517.## 5 2013 1 1 -6 -25 762 116 19 394.## 6 2013 1 1 -4 12 719 150 -16 288.## 7 2013 1 1 -5 19 1065 158 -24 404.## 8 2013 1 1 -3 -14 229 53 11 259.## 9 2013 1 1 -3 -8 944 140 5 405.## 10 2013 1 1 -2 8 733 138 -10 319.## # … with 336,766 more rows
dep_delay
y arr_delay
son vectoresair_time * 60
?lead()
lag()
cumsum()
lead()
lag()
cumsum()
Mutate calcula columnas nuevas
library(dplyr)msleep <- read.csv("./data/msleep_ggplot2.csv")msleep %>% mutate(sleep_new=sleep_total + sleep_rem) %>% head()
## name genus vore order conservation## 1 Cheetah Acinonyx carni Carnivora lc## 2 Owl monkey Aotus omni Primates <NA>## 3 Mountain beaver Aplodontia herbi Rodentia nt## 4 Greater short-tailed shrew Blarina omni Soricomorpha lc## 5 Cow Bos herbi Artiodactyla domesticated## 6 Three-toed sloth Bradypus herbi Pilosa <NA>## sleep_total sleep_rem sleep_cycle awake brainwt bodywt sleep_new## 1 12.1 NA NA 11.9 NA 50.000 NA## 2 17.0 1.8 NA 7.0 0.01550 0.480 18.8## 3 14.4 2.4 NA 9.6 NA 1.350 16.8## 4 14.9 2.3 0.1333333 9.1 0.00029 0.019 17.2## 5 4.0 0.7 0.6666667 20.0 0.42300 600.000 4.7## 6 14.4 2.2 0.7666667 9.6 NA 3.850 16.6
Igual que mutate pero elimina las columnas no especificadas
msleep %>% transmute(sleep_new=sleep_rem+sleep_total) %>% head
## sleep_new## 1 NA## 2 18.8## 3 16.8## 4 17.2## 5 4.7## 6 16.6
Aplica funcione a todas las columnas que cumplen un criterio.
msleep %>% mutate_if(is.numeric, log2)%>% head()
## name genus vore order conservation## 1 Cheetah Acinonyx carni Carnivora lc## 2 Owl monkey Aotus omni Primates <NA>## 3 Mountain beaver Aplodontia herbi Rodentia nt## 4 Greater short-tailed shrew Blarina omni Soricomorpha lc## 5 Cow Bos herbi Artiodactyla domesticated## 6 Three-toed sloth Bradypus herbi Pilosa <NA>## sleep_total sleep_rem sleep_cycle awake brainwt bodywt## 1 3.596935 NA NA 3.572890 NA 5.6438562## 2 4.087463 0.8479969 NA 2.807355 -6.011588 -1.0588937## 3 3.847997 1.2630344 NA 3.263034 NA 0.4329594## 4 3.897240 1.2016339 -2.9068906 3.185867 -11.751659 -5.7178568## 5 2.000000 -0.5145732 -0.5849625 4.321928 -1.241270 9.2288187## 6 3.847997 1.1375035 -0.3833286 3.263034 NA 1.9448584
Aplica funciones a todas las columnas
msleep %>% select(sleep_total, sleep_rem) %>% mutate_all(funs(log(.),log2(.))) %>% names()
## Warning: funs() is soft deprecated as of dplyr 0.8.0## please use list() instead## ## # Before:## funs(name = f(.)## ## # After: ## list(name = ~f(.))## This warning is displayed once per session.
## [1] "sleep_total" "sleep_rem" "sleep_total_log" ## [4] "sleep_rem_log" "sleep_total_log2" "sleep_rem_log2"
Aplica funciones a columnas.
msleep %>%mutate_at(vars(sleep_total),funs(log(.),log2(.))) %>% names()
## [1] "name" "genus" "vore" "order" ## [5] "conservation" "sleep_total" "sleep_rem" "sleep_cycle" ## [9] "awake" "brainwt" "bodywt" "log" ## [13] "log2"
dplyr
Funciones para las tareas más comunes:
filter
select
arrange
mutate
summarise
group_by
Keyboard shortcuts
↑, ←, Pg Up, k | Go to previous slide |
↓, →, Pg Dn, Space, j | Go to next slide |
Home | Go to first slide |
End | Go to last slide |
Number + Return | Go to specific slide |
b / m / f | Toggle blackout / mirrored / fullscreen mode |
c | Clone slideshow |
p | Toggle presenter mode |
t | Restart the presentation timer |
?, h | Toggle this help |
Esc | Back to slideshow |