Convert from %>% to |>

parent 1b0c50894a
commit da0fbd50d5

EDA.Rmd | 36
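For reference, the mechanical change this commit applies throughout is `lhs %>% rhs(...)` becoming `lhs |> rhs(...)`. A minimal sketch of the equivalence, and the two caveats the conversion has to respect (an editorial illustration, not part of the commit):

```r
library(dplyr)

mtcars %>% filter(cyl == 4)  # before: magrittr pipe
mtcars |> filter(cyl == 4)   # after: native pipe (R >= 4.1)

# Two caveats the conversion has to respect:
# 1. The native pipe requires an explicit call: `x %>% names` works
#    with magrittr, but it must be written `x |> names()`.
# 2. |> has no `.` placeholder; magrittr code that uses `.` needs an
#    anonymous function (or `_` with a named argument, R >= 4.2).
```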
@@ -99,7 +99,7 @@ The height of the bars displays how many observations occurred with each x value
 You can compute these values manually with `dplyr::count()`:

 ```{r}
-diamonds %>%
+diamonds |>
   count(cut)
 ```

@@ -115,7 +115,7 @@ ggplot(data = diamonds) +
 You can compute this by hand by combining `dplyr::count()` and `ggplot2::cut_width()`:

 ```{r}
-diamonds %>%
+diamonds |>
   count(cut_width(carat, 0.5))
 ```

@@ -127,7 +127,7 @@ You should always explore a variety of binwidths when working with histograms, a
 For example, here is how the graph above looks when we zoom into just the diamonds with a size of less than three carats and choose a smaller binwidth.

 ```{r}
-smaller <- diamonds %>%
+smaller <- diamonds |>
   filter(carat < 3)

 ggplot(data = smaller, mapping = aes(x = carat)) +
@@ -232,9 +232,9 @@ old <- options(tibble.print_max = 10, tibble.print_min = 10)
 ```

 ```{r}
-unusual <- diamonds %>%
-  filter(y < 3 | y > 20) %>%
-  select(price, x, y, z) %>%
+unusual <- diamonds |>
+  filter(y < 3 | y > 20) |>
+  select(price, x, y, z) |>
   arrange(y)
 unusual
 ```
@@ -277,7 +277,7 @@ If you've encountered unusual values in your dataset, and simply want to move on
 1.  Drop the entire row with the strange values:

     ```{r, eval = FALSE}
-    diamonds2 <- diamonds %>%
+    diamonds2 <- diamonds |>
       filter(between(y, 3, 20))
     ```

@@ -289,7 +289,7 @@ If you've encountered unusual values in your dataset, and simply want to move on
     You can use the `ifelse()` function to replace unusual values with `NA`:

     ```{r}
-    diamonds2 <- diamonds %>%
+    diamonds2 <- diamonds |>
       mutate(y = ifelse(y < 3 | y > 20, NA, y))
     ```

@@ -320,13 +320,13 @@ So you might want to compare the scheduled departure times for cancelled and non
 You can do this by making a new variable with `is.na()`.

 ```{r}
-nycflights13::flights %>%
+nycflights13::flights |>
   mutate(
     cancelled = is.na(dep_time),
     sched_hour = sched_dep_time %/% 100,
     sched_min = sched_dep_time %% 100,
     sched_dep_time = sched_hour + sched_min / 60
-  ) %>%
+  ) |>
   ggplot(mapping = aes(sched_dep_time)) +
     geom_freqpoly(mapping = aes(colour = cancelled), binwidth = 1/4)
 ```
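The `%/%` (integer division) and `%%` (modulo) calls in the hunk above decode clock times stored as plain numbers; a worked example of the arithmetic (editorial illustration, not part of the commit):

```r
# sched_dep_time stores 2:30pm as the number 1430, so:
1430 %/% 100   # 14   -> the hour
1430 %% 100    # 30   -> the minute
14 + 30 / 60   # 14.5 -> the departure time as a fractional hour
```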
@@ -476,15 +476,15 @@ Covariation will appear as a strong correlation between specific x values and sp
 Another approach is to compute the count with dplyr:

 ```{r}
-diamonds %>%
+diamonds |>
   count(color, cut)
 ```

 Then visualise with `geom_tile()` and the fill aesthetic:

 ```{r}
-diamonds %>%
-  count(color, cut) %>%
+diamonds |>
+  count(color, cut) |>
   ggplot(mapping = aes(x = color, y = cut)) +
     geom_tile(mapping = aes(fill = n))
 ```
@@ -628,8 +628,8 @@ library(modelr)

 mod <- lm(log(price) ~ log(carat), data = diamonds)

-diamonds2 <- diamonds %>%
-  add_residuals(mod) %>%
+diamonds2 <- diamonds |>
+  add_residuals(mod) |>
   mutate(resid = exp(resid))

 ggplot(data = diamonds2) +
@@ -669,12 +669,12 @@ ggplot(faithful, aes(eruptions)) +
 ```

 Sometimes we'll turn the end of a pipeline of data transformation into a plot.
-Watch for the transition from `%>%` to `+`.
+Watch for the transition from `|>` to `+`.
 I wish this transition wasn't necessary but unfortunately ggplot2 was created before the pipe was discovered.

 ```{r, eval = FALSE}
-diamonds %>%
-  count(cut, clarity) %>%
+diamonds |>
+  count(cut, clarity) |>
   ggplot(aes(clarity, cut, fill = n)) +
   geom_tile()
 ```

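The "transition" the prose warns about is the one place the conversion must stop: data-transformation steps compose with the pipe, but ggplot2 layers compose with `+`. A sketch of the two spellings (editorial illustration, not part of the commit):

```r
library(ggplot2)
library(dplyr)

# Correct: pipe *into* ggplot(), then switch to + for the layers.
diamonds |>
  count(cut, clarity) |>
  ggplot(aes(clarity, cut, fill = n)) +
  geom_tile()

# Broken: continuing with |> after ggplot() would hand the whole plot
# object to geom_tile() as its first argument (`mapping`), which
# geom_tile() expects to come from aes(), so the plot fails to build.
```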
@@ -115,8 +115,8 @@ First, you might have a tibble that provides labels.
 The plot below isn't terribly useful, but it illustrates a useful approach: pull out the most efficient car in each class with dplyr, and then label it on the plot:

 ```{r}
-best_in_class <- mpg %>%
-  group_by(class) %>%
+best_in_class <- mpg |>
+  group_by(class) |>
   filter(row_number(desc(hwy)) == 1)

 ggplot(mpg, aes(displ, hwy)) +
@@ -154,8 +154,8 @@ It's not wonderful for this plot, but it isn't too bad.
 (`theme(legend.position = "none")` turns the legend off --- we'll talk about it more shortly.)

 ```{r}
-class_avg <- mpg %>%
-  group_by(class) %>%
+class_avg <- mpg |>
+  group_by(class) |>
   summarise(
     displ = median(displ),
     hwy = median(hwy)
@@ -176,7 +176,7 @@ Alternatively, you might just want to add a single label to the plot, but you'll
 Often, you want the label in the corner of the plot, so it's convenient to create a new data frame using `summarise()` to compute the maximum values of x and y.

 ```{r}
-label <- mpg %>%
+label <- mpg |>
   summarise(
     displ = max(displ),
     hwy = max(hwy),
@@ -207,8 +207,8 @@ In these examples, I manually broke the label up into lines using `"\n"`.
 Another approach is to use `stringr::str_wrap()` to automatically add line breaks, given the number of characters you want per line:

 ```{r}
-"Increasing engine size is related to decreasing fuel economy." %>%
-  stringr::str_wrap(width = 40) %>%
+"Increasing engine size is related to decreasing fuel economy." |>
+  stringr::str_wrap(width = 40) |>
   writeLines()
 ```

@@ -219,7 +219,7 @@ Figure \@ref(fig:just) shows all nine possible combinations.
 vjust <- c(bottom = 0, center = 0.5, top = 1)
 hjust <- c(left = 0, center = 0.5, right = 1)

-df <- tidyr::crossing(hj = names(hjust), vj = names(vjust)) %>%
+df <- tidyr::crossing(hj = names(hjust), vj = names(vjust)) |>
   mutate(
     y = vjust[vj],
     x = hjust[hj],
@@ -332,8 +332,8 @@ Another use of `breaks` is when you have relatively few data points and want to
 For example, take this plot that shows when each US president started and ended their term.

 ```{r}
-presidential %>%
-  mutate(id = 33 + row_number()) %>%
+presidential |>
+  mutate(id = 33 + row_number()) |>
   ggplot(aes(start, id)) +
     geom_point() +
     geom_segment(aes(xend = end, yend = id)) +
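The `33 + row_number()` offset here (and in the matching hunk below) works because `row_number()` numbers the rows 1 to n, and ggplot2's `presidential` dataset starts with Eisenhower, the 34th president. A quick check (editorial illustration, not part of the commit):

```r
library(dplyr)
library(ggplot2)  # provides the presidential dataset

presidential |>
  mutate(id = 33 + row_number()) |>  # Eisenhower -> 34, Kennedy -> 35, ...
  select(name, id)
```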
@@ -445,8 +445,8 @@ When you have a predefined mapping between values and colours, use `scale_colour
 For example, if we map presidential party to colour, we want to use the standard mapping of red for Republicans and blue for Democrats:

 ```{r}
-presidential %>%
-  mutate(id = 33 + row_number()) %>%
+presidential |>
+  mutate(id = 33 + row_number()) |>
   ggplot(aes(start, id, colour = party)) +
     geom_point() +
     geom_segment(aes(xend = end, yend = id)) +
@@ -526,8 +526,8 @@ ggplot(mpg, mapping = aes(displ, hwy)) +
   geom_smooth() +
   coord_cartesian(xlim = c(5, 7), ylim = c(10, 30))

-mpg %>%
-  filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) %>%
+mpg |>
+  filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) |>
   ggplot(aes(displ, hwy)) +
   geom_point(aes(color = class)) +
   geom_smooth()
@@ -539,8 +539,8 @@ It is generally more useful if you want *expand* the limits, for example, to mat
 For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the colour aesthetic) have different ranges.

 ```{r out.width = "50%", fig.align = "default", fig.width = 4}
-suv <- mpg %>% filter(class == "suv")
-compact <- mpg %>% filter(class == "compact")
+suv <- mpg |> filter(class == "suv")
+compact <- mpg |> filter(class == "compact")

 ggplot(suv, aes(displ, hwy, colour = drv)) +
   geom_point()

@@ -43,14 +43,14 @@ Not only are csv files one of the most common forms of data storage, but once yo
 Here is what a simple CSV file with a row for column names (also commonly referred to as the header row) and six rows of data looks like.

 ```{r echo = FALSE, message = FALSE}
-read_lines("data/students.csv") %>% cat(sep = "\n")
+read_lines("data/students.csv") |> cat(sep = "\n")
 ```

 Note that the `,`s separate the columns.
 Table \@ref(tab:students-table) shows a representation of the same data as a table.

 ```{r students-table, echo = FALSE, message = FALSE}
-read_csv("data/students.csv") %>%
+read_csv("data/students.csv") |>
   knitr::kable(caption = "Data from the students.csv file as a table.")
 ```

@@ -132,11 +132,11 @@ For example, the column names in the `students` file we read in are formatted in
 You might consider renaming them one by one with `dplyr::rename()` or you might use the `janitor::clean_names()` function to turn them all into snake case at once.[^data-import-1]
 This function takes in a data frame and returns a data frame with variable names converted to snake case.

-[^data-import-1]: The [janitor](http://sfirke.github.io/janitor/) package is not part of the tidyverse, but it offers handy functions for data cleaning and works well within data pipelines that use `%>%`.
+[^data-import-1]: The [janitor](http://sfirke.github.io/janitor/) package is not part of the tidyverse, but it offers handy functions for data cleaning and works well within data pipelines that use `|>`.

 ```{r message = FALSE}
 library(janitor)
-students %>%
+students |>
   clean_names()
 ```

@@ -147,8 +147,8 @@ We can convert this variable to a factor using the `factor()` function.
 You'll learn more about factors in Chapter \@ref(factors).

 ```{r}
-students <- students %>%
-  clean_names() %>%
+students <- students |>
+  clean_names() |>
   mutate(meal_plan = factor(meal_plan))

 students

@@ -79,11 +79,11 @@ Here are a couple of small examples showing how you might work with `table1`.

 ```{r fig.width = 5, fig.alt = "This figure shows the numbers of cases in 1999 and 2000 for Afghanistan, Brazil, and China, with year on the x-axis and number of cases on the y-axis. Each point on the plot represents the number of cases in a given country in a given year. The points for each country are differentiated from others by color and shape and connected with a line, resulting in three, non-parallel, non-intersecting lines. The numbers of cases in China are highest for both 1999 and 2000, with values above 200,000 for both years. The number of cases in Brazil is approximately 40,000 in 1999 and approximately 75,000 in 2000. The numbers of cases in Afghanistan are lowest for both 1999 and 2000, with values that appear to be very close to 0 on this scale."}
 # Compute rate per 10,000
-table1 %>%
+table1 |>
   mutate(rate = cases / population * 10000)

 # Compute cases per year
-table1 %>%
+table1 |>
   count(year, wt = cases)

 # Visualise changes over time
@@ -147,13 +147,13 @@ table4a
 And you want to create the following visualisation where each line represents a `country`, `year` is on the x-axis, `cases` are on the y-axis, and you automatically get the legend that indicates which line represents which country.

 ```{r tidy-pivot-longer-plot-lines, fig.width = 5, echo = FALSE, fig.cap = "Number of cases over the years for each country.", fig.alt = "This figure shows the numbers of cases in 1999 and 2000 for Afghanistan, Brazil, and China, with year on the x-axis and number of cases on the y-axis. Each point on the plot represents the number of cases in a given country in a given year. The points for each country are differentiated from others by color and shape and connected with a line, resulting in three, non-parallel, non-intersecting lines. The numbers of cases in China are highest for both 1999 and 2000, with values above 200,000 for both years. The number of cases in Brazil is approximately 40,000 in 1999 and approximately 75,000 in 2000. The numbers of cases in Afghanistan are lowest for both 1999 and 2000, with values that appear to be very close to 0 on this scale."}
-table4a %>%
+table4a |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "cases",
-  ) %>%
-  mutate(year = parse_integer(year)) %>%
+  ) |>
+  mutate(year = parse_integer(year)) |>
   ggplot(aes(x = year, y = cases)) +
   geom_line(aes(group = country), colour = "grey50") +
   geom_point(aes(colour = country, shape = country)) +
@@ -163,12 +163,12 @@ table4a %>%
 It's most straight-forward to do this starting with a data frame where `country`, `year`, and `cases` are the columns and each row represents a record from a country for a particular year.

 ```{r echo = FALSE}
-table4a %>%
+table4a |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "cases"
-  ) %>%
+  ) |>
   mutate(year = parse_integer(year))
 ```

@@ -187,7 +187,7 @@ To describe that operation we need three parameters:
 Together those parameters generate the call to `pivot_longer()`:

 ```{r}
-table4a %>%
+table4a |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
@@ -218,12 +218,12 @@ We can add a new step to our pipeline using `dplyr::mutate()` to parse this vari
 You can refer back to Section \@ref(parsing-a-vector) for functions for parsing other types of vectors.

 ```{r}
-table4a %>%
+table4a |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "cases"
-  ) %>%
+  ) |>
   mutate(year = parse_integer(year))
 ```

@@ -240,31 +240,31 @@ We can use `pivot_longer()` to tidy `table4b` in a similar fashion.
 The only difference is the variable stored in the cell values:

 ```{r}
-table4b %>%
+table4b |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "population"
-  ) %>%
+  ) |>
   mutate(year = parse_integer(year))
 ```

 To combine the tidied versions of `table4a` and `table4b` into a single tibble, we need to use `dplyr::left_join()`, which you'll learn about in Chapter \@ref(relational-data).

 ```{r}
-tidy4a <- table4b %>%
+tidy4a <- table4b |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "cases"
-  ) %>%
+  ) |>
   mutate(year = parse_integer(year))
-tidy4b <- table4b %>%
+tidy4b <- table4b |>
   pivot_longer(
     cols = c(`1999`, `2000`),
     names_to = "year",
     values_to = "population"
-  ) %>%
+  ) |>
   mutate(year = parse_integer(year))
 left_join(tidy4a, tidy4b)
 ```
@@ -282,8 +282,8 @@ table2
 Suppose you'd like to calculate the `rate` (number of `cases` divided by `population`) for each country in a given year, and record it as a new column, resulting in the following data frame.

 ```{r tidy-pivot-wider-case-ratio, echo = FALSE}
-table2 %>%
-  pivot_wider(names_from = type, values_from = count) %>%
+table2 |>
+  pivot_wider(names_from = type, values_from = count) |>
   mutate(rate = cases / population)
 ```

@@ -298,7 +298,7 @@ This time, however, we only need two parameters:
 We can use `pivot_wider()`, as shown programmatically below, and visually in Figure \@ref(fig:tidy-pivot-wider).

 ```{r}
-table2 %>%
+table2 |>
   pivot_wider(names_from = type, values_from = count)
 ```

@@ -314,9 +314,9 @@ Once we have our data in this wider format, we can create the data frame that mo
 Earlier we visualised case counts over the years, and this representation can be useful for visualising case rates, for example.

 ```{r, fig.alt = "This figure shows the case rate in 1999 and 2000 for Afghanistan, Brazil, and China, with year on the x-axis and number of cases on the y-axis. Each point on the plot represents the case rate in a given country in a given year. The points for each country are differentiated from others by color and shape and connected with a line, resulting in three, non-parallel, non-intersecting lines. The case rates in Brazil are highest for both 1999 and 2000; approximately 0.0002 in 1999 and approximately 0.00045 in 2000. The case rates in China are slightly below 0.0002 in both 1999 and 2000. The case rates in Afghanistan are lowest for both 1999 and 2000; pretty close to 0 in 1999 and approximately 0.0001 in 2000."}
-table2 %>%
-  pivot_wider(names_from = type, values_from = count) %>%
-  mutate(rate = cases / population) %>%
+table2 |>
+  pivot_wider(names_from = type, values_from = count) |>
+  mutate(rate = cases / population) |>
   ggplot(aes(x = year, y = rate)) +
   geom_line(aes(group = country), colour = "grey50") +
   geom_point(aes(colour = country, shape = country)) +
@@ -326,13 +326,13 @@ table2 %>%
 Now let's go one step further and widen the data to record `cases`, `population`, and `rate` for 1999 and 2000 in separate columns, such as the following.

 ```{r tidy-pivot-even-wider-case-ratio, echo = FALSE}
-table2 %>%
-  pivot_wider(names_from = type, values_from = count) %>%
-  mutate(rate = cases / population) %>%
+table2 |>
+  pivot_wider(names_from = type, values_from = count) |>
+  mutate(rate = cases / population) |>
   pivot_wider(
     names_from = year,
     values_from = c(cases, population, rate)
-  ) %>%
+  ) |>
   relocate(country, contains("1999"))
 ```

@@ -349,9 +349,9 @@ To do so, we'll take advantage of the fact that the pivot functions can operate
 The first three lines of the following code chunk are what we've already done in the previous step, and we add on to the pipeline another `pivot_wider()` step where the values for the added columns come from `cases`, `population`, and `rate` and the column names are automatically suffixed with values from the `year` variable.

 ```{r}
-table2 %>%
-  pivot_wider(names_from = type, values_from = count) %>%
-  mutate(rate = cases / population) %>%
+table2 |>
+  pivot_wider(names_from = type, values_from = count) |>
+  mutate(rate = cases / population) |>
   pivot_wider(
     names_from = year,
     values_from = c(cases, population, rate)
@@ -378,8 +378,8 @@ As you might have guessed from their names, `pivot_wider()` and `pivot_longer()`
   half = c( 1, 2, 1, 2),
   return = c(1.88, 0.59, 0.92, 0.17)
 )
-stocks %>%
-  pivot_wider(names_from = year, values_from = return) %>%
+stocks |>
+  pivot_wider(names_from = year, values_from = return) |>
   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
 ```

@@ -391,7 +391,7 @@ As you might have guessed from their names, `pivot_wider()` and `pivot_longer()`
 2.  Why does this code fail?

     ```{r, error = TRUE}
-    table4a %>%
+    table4a |>
       pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")
     ```

@@ -427,7 +427,7 @@ As you might have guessed from their names, `pivot_wider()` and `pivot_longer()`
 5.  One way of summarising the distribution of one categorical variable based on the levels of another is using `dplyr::count()`, e.g. the following gives the distribution of `drv` (type of drive train) for each level of `cyl` (number of cylinders) for cars in the `mpg` dataset.

     ```{r}
-    mpg %>%
+    mpg |>
       count(cyl, drv)
     ```

@@ -435,8 +435,8 @@ As you might have guessed from their names, `pivot_wider()` and `pivot_longer()`
     Use one of the pivoting functions to construct the contingency table shown below based on the output above.

     ```{r echo = FALSE}
-    mpg %>%
-      count(cyl, drv) %>%
+    mpg |>
+      count(cyl, drv) |>
       pivot_wider(names_from = drv, values_from = n)
     ```
@@ -504,7 +504,7 @@ In constructing the appropriate regular expression we need to keep in mind a few
 The regular expression that will capture all of these inconsistencies and extract the three groups of information we need is `new_?(.*)_(.)(.*)`.

 ```{r}
-who %>%
+who |>
   pivot_longer(
     cols = new_sp_m014:newrel_f65,
     names_to = c("diagnosis", "gender", "age"),
@@ -518,7 +518,7 @@ First, we're seeing lots of `NA`s in the `cases` column.
 We can drop these observations by setting `values_drop_na` to `TRUE`.

 ```{r}
-who %>%
+who |>
   pivot_longer(
     cols = new_sp_m014:newrel_f65,
     names_to = c("diagnosis", "gender", "age"),
@@ -532,14 +532,14 @@ Second, `diagnosis` and `gender` are characters by default, however it's a good
 We'll use the `parse_factor()` function from readr to make the conversion in a `mutate()` step we add to the pipeline.

 ```{r}
-who %>%
+who |>
   pivot_longer(
     cols = new_sp_m014:newrel_f65,
     names_to = c("diagnosis", "gender", "age"),
     names_pattern = "new_?(.*)_(.)(.*)",
     values_to = "cases",
     values_drop_na = TRUE
-  ) %>%
+  ) |>
   mutate(
     gender = parse_factor(gender, levels = c("f", "m")),
     age = parse_factor(
@@ -554,14 +554,14 @@ Finally, we might want to recode the `age` variable with level names that are a
 We'll do this within the `mutate()` step of our pipeline using `forcats::fct_recode()` that you'll learn more about in Chapter \@ref(factors).

 ```{r}
-who_tidy <- who %>%
+who_tidy <- who |>
   pivot_longer(
     cols = new_sp_m014:newrel_f65,
     names_to = c("diagnosis", "gender", "age"),
     names_pattern = "new_?(.*)_(.)(.*)",
     values_to = "cases",
     values_drop_na = TRUE
-  ) %>%
+  ) |>
   mutate(
     gender = parse_factor(gender, levels = c("f", "m")),
     age = parse_factor(
@@ -587,10 +587,10 @@ This tidy data frame allows us to explore the data with more ease than the origi
 For example, we can easily filter for a particular type of TB for a given country and sum over the number of cases to see how case numbers for this type of TB have evolved over the years.

 ```{r fig.alt = "A scatterplot of number of smear positive pulmonary TB cases in the US over the years, with year on the x-axis ranging from 1995 to 2013 and yearly total number of cases on the y-axis ranging from 3000 to 8000. The points on the scatterplot are overlaid with a smooth curve, which shows a strong, negative association between the two variables."}
-who_tidy %>%
-  filter(diagnosis == "sp", country == "United States of America") %>%
-  group_by(year) %>%
-  summarise(cases_total = sum(cases)) %>%
+who_tidy |>
+  filter(diagnosis == "sp", country == "United States of America") |>
+  group_by(year) |>
+  summarise(cases_total = sum(cases)) |>
   ggplot(aes(x = year, y = cases_total)) +
   geom_point() +
   geom_smooth() +

@@ -339,7 +339,7 @@ flights |>

 ```{r, eval = FALSE, echo = FALSE}
 # For data checking, not used in results shown in book
-flights <- flights %>% mutate(
+flights <- flights |> mutate(
   dep_time = hour * 60 + minute,
   arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100),
   airtime2 = arr_time - dep_time,
@@ -481,7 +481,7 @@ In hindsight, this wasn't great way to make this function work, but it's difficu
 To make it obvious what's happening, dplyr displays a message that tells you how you can change this behavior:

 ```{r}
-daily_flights <- daily %>%
+daily_flights <- daily |>
   summarise(
     n = n()
   )
@@ -490,7 +490,7 @@ daily_flights <- daily %>%
 If you're happy with this behavior, you can explicitly request it in order to suppress the message:

 ```{r, results = FALSE}
-daily_flights <- daily %>%
+daily_flights <- daily |>
   summarise(
     n = n(),
     .groups = "drop_last"
@@ -505,8 +505,8 @@ You might also want to remove grouping outside of `summarise()`.
 You can do this with `ungroup()`.

 ```{r}
-daily %>%
-  ungroup() %>%
+daily |>
+  ungroup() |>
   summarise(
     delay = mean(dep_delay, na.rm = TRUE),
     flights = n()
@@ -598,8 +598,8 @@ There's another common variation on this pattern that we can see in some data ab
 The following code uses data from the **Lahman** package to compare what proportion of times a player hits the ball vs. the number of attempts they take:

 ```{r}
-batters <- Lahman::Batting %>%
-  group_by(playerID) %>%
+batters <- Lahman::Batting |>
+  group_by(playerID) |>
   summarise(
     perf = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
     n = sum(AB, na.rm = TRUE)
@@ -620,8 +620,8 @@ When we plot the skill of the batter (measured by the batting average, `ba`) aga
 #| from 0.2 at when n is 1 to 0.25 when n is ~1000. Average performance
 #| continues to increase linearly at a much shallower slope reaching
 #| ~0.3 when n is ~15,000.
-batters %>%
-  filter(n > 100) %>%
+batters |>
+  filter(n > 100) |>
   ggplot(aes(n, perf)) +
   geom_point(alpha = 1 / 10) +
   geom_smooth(se = FALSE)
@@ -631,7 +631,7 @@ This also has important implications for ranking.
 If you naively sort on `desc(ba)`, the people with the best batting averages are clearly lucky, not skilled:

 ```{r}
-batters %>%
+batters |>
   arrange(desc(perf))
 ```

@@ -112,15 +112,15 @@ Instead of a single string, sometimes you'll have the individual components of t
 This is what we have in the flights data:

 ```{r}
-flights %>%
+flights |>
   select(year, month, day, hour, minute)
 ```

 To create a date/time from this sort of input, use `make_date()` for dates, or `make_datetime()` for date-times:

 ```{r}
-flights %>%
-  select(year, month, day, hour, minute) %>%
+flights |>
+  select(year, month, day, hour, minute) |>
   mutate(departure = make_datetime(year, month, day, hour, minute))
 ```

@@ -133,14 +133,14 @@ make_datetime_100 <- function(year, month, day, time) {
   make_datetime(year, month, day, time %/% 100, time %% 100)
 }

-flights_dt <- flights %>%
-  filter(!is.na(dep_time), !is.na(arr_time)) %>%
+flights_dt <- flights |>
+  filter(!is.na(dep_time), !is.na(arr_time)) |>
   mutate(
     dep_time = make_datetime_100(year, month, day, dep_time),
     arr_time = make_datetime_100(year, month, day, arr_time),
     sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
     sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
-  ) %>%
+  ) |>
   select(origin, dest, ends_with("delay"), ends_with("time"))

 flights_dt
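As a quick sanity check of the helper defined in the hunk above (editorial illustration, not part of the commit): `time %/% 100` recovers the hour and `time %% 100` the minute from times stored like `517` (5:17am).

```r
# Equivalent to make_datetime(2013, 1, 1, 5, 17):
make_datetime_100(2013, 1, 1, 517)
```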
@@ -149,7 +149,7 @@ flights_dt
 With this data, I can visualise the distribution of departure times across the year:

 ```{r}
-flights_dt %>%
+flights_dt |>
   ggplot(aes(dep_time)) +
   geom_freqpoly(binwidth = 86400) # 86400 seconds = 1 day
 ```
@@ -157,8 +157,8 @@ flights_dt %>%
 Or within a single day:

 ```{r}
-flights_dt %>%
-  filter(dep_time < ymd(20130102)) %>%
+flights_dt |>
+  filter(dep_time < ymd(20130102)) |>
   ggplot(aes(dep_time)) +
   geom_freqpoly(binwidth = 600) # 600 s = 10 minutes
 ```
@@ -237,8 +237,8 @@ wday(datetime, label = TRUE, abbr = FALSE)
 We can use `wday()` to see that more flights depart during the week than on the weekend:

 ```{r}
-flights_dt %>%
-  mutate(wday = wday(dep_time, label = TRUE)) %>%
+flights_dt |>
+  mutate(wday = wday(dep_time, label = TRUE)) |>
   ggplot(aes(x = wday)) +
   geom_bar()
 ```
@@ -247,12 +247,12 @@ There's an interesting pattern if we look at the average departure delay by minu
 It looks like flights leaving in minutes 20-30 and 50-60 have much lower delays than the rest of the hour!

 ```{r}
-flights_dt %>%
-  mutate(minute = minute(dep_time)) %>%
-  group_by(minute) %>%
+flights_dt |>
+  mutate(minute = minute(dep_time)) |>
+  group_by(minute) |>
   summarise(
     avg_delay = mean(dep_delay, na.rm = TRUE),
-    n = n()) %>%
+    n = n()) |>
   ggplot(aes(minute, avg_delay)) +
   geom_line()
 ```
@@ -260,9 +260,9 @@ flights_dt %>%
 Interestingly, if we look at the *scheduled* departure time we don't see such a strong pattern:

 ```{r}
-sched_dep <- flights_dt %>%
-  mutate(minute = minute(sched_dep_time)) %>%
-  group_by(minute) %>%
+sched_dep <- flights_dt |>
+  mutate(minute = minute(sched_dep_time)) |>
+  group_by(minute) |>
   summarise(
     avg_delay = mean(arr_delay, na.rm = TRUE),
     n = n())
@@ -287,8 +287,8 @@ Each function takes a vector of dates to adjust and then the name of the unit ro
 This, for example, allows us to plot the number of flights per week:

 ```{r}
-flights_dt %>%
-  count(week = floor_date(dep_time, "week")) %>%
+flights_dt |>
+  count(week = floor_date(dep_time, "week")) |>
   ggplot(aes(week, n)) +
   geom_line()
 ```
@@ -320,17 +320,17 @@ update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
 If values are too big, they will roll-over:

 ```{r}
-ymd("2015-02-01") %>%
+ymd("2015-02-01") |>
   update(mday = 30)
-ymd("2015-02-01") %>%
+ymd("2015-02-01") |>
   update(hour = 400)
 ```

 You can use `update()` to show the distribution of flights across the course of the day for every day of the year:

 ```{r}
-flights_dt %>%
-  mutate(dep_hour = update(dep_time, yday = 1)) %>%
+flights_dt |>
+  mutate(dep_hour = update(dep_time, yday = 1)) |>
   ggplot(aes(dep_hour)) +
   geom_freqpoly(binwidth = 300)
 ```
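The roll-over in the hunk above is plain calendar arithmetic; a quick check of the two cases (editorial illustration, relying on lubridate's documented roll-over behaviour):

```r
library(lubridate)

# February 2015 has 28 days, so mday = 30 rolls 2 days into March:
ymd("2015-02-01") |> update(mday = 30)   # "2015-03-02"

# hour = 400 is 16 full days plus 16 hours past midnight:
400 %/% 24   # 16
400 %% 24    # 16
ymd("2015-02-01") |> update(hour = 400)  # "2015-02-17 16:00:00 UTC"
```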
@@ -474,7 +474,7 @@ Let's use periods to fix an oddity related to our flight dates.
 Some planes appear to have arrived at their destination *before* they departed from New York City.

 ```{r}
-flights_dt %>%
+flights_dt |>
   filter(arr_time < dep_time)
 ```

@@ -483,7 +483,7 @@ We used the same date information for both the departure and the arrival times,
 We can fix this by adding `days(1)` to the arrival time of each overnight flight.

 ```{r}
-flights_dt <- flights_dt %>%
+flights_dt <- flights_dt |>
   mutate(
     overnight = arr_time < dep_time,
     arr_time = arr_time + days(ifelse(overnight, 0, 1)),
@@ -494,7 +494,7 @@ flights_dt <- flights_dt %>%
 Now all of our flights obey the laws of physics.

 ```{r}
-flights_dt %>%
+flights_dt |>
   filter(overnight, arr_time < dep_time)
 ```

@@ -51,9 +51,9 @@ To use hierarchical clustering in R, begin by selecting the numeric columns from
 ```{r}
 small_iris <- sample_n(iris, 50)

-iris_hclust <- small_iris %>%
-  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
-  dist() %>%
+iris_hclust <- small_iris |>
+  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
+  dist() |>
   hclust(method = "complete")
 ```

@@ -92,10 +92,10 @@ knitr::include_graphics("images/EDA-linkage.png")


 ```{r fig.height = 4}
-small_iris %>%
-  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
-  dist() %>%
-  hclust(method = "single") %>%
+small_iris |>
+  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
+  dist() |>
+  hclust(method = "single") |>
   plot(labels = small_iris$Species)
 ```

@@ -116,8 +116,8 @@ knitr::include_graphics("images/EDA-kmeans.png")
 Use `kmeans()` to perform k means clustering with R. As with hierarchical clustering, you can only apply k means clustering to numerical data. Pass your numerical data to the `kmeans()` function, then set `center` to the number of clusters to search for ($k$) and `nstart` to the number of simulations to run. Since the results of k means clustering depend on the initial assignment of points to groups, which is random, R will run `nstart` simulations and then return the best results (as measured by the minimum sum of squared distances between each point and the centroid of the group it is assigned to). Finally, set the maximum number of iterations to let each simulation run in case the simulation cannot quickly find a stable grouping.

 ```{r}
-iris_kmeans <- small_iris %>%
-  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
+iris_kmeans <- small_iris |>
+  select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
   kmeans(centers = 3, nstart = 20, iter.max = 50)

 iris_kmeans$cluster
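Besides `$cluster`, the `kmeans()` fit built above carries a few other fields that help judge the chosen k (an editorial pointer, not part of the commit):

```r
iris_kmeans$centers   # one centroid per cluster, in the original units
iris_kmeans$withinss  # within-cluster sum of squares, one per cluster
iris_kmeans$size      # number of points assigned to each cluster
```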
@@ -129,8 +129,8 @@ Unlike `hclust()`, the k means algorithm does not provide an intuitive visual in
 ggplot(small_iris, aes(x = Sepal.Width, y = Sepal.Length)) +
   geom_point(aes(color = factor(iris_kmeans$cluster)))

-small_iris %>%
-  group_by(iris_kmeans$cluster) %>%
+small_iris |>
+  group_by(iris_kmeans$cluster) |>
   summarise(n_obs = n(), avg_width = mean(Sepal.Width), avg_length = mean(Sepal.Length))
 ```

@@ -24,7 +24,7 @@ First, let's address a distraction: the data is censored in an odd way. The y va

 ```{r}
 n <- nrow(heights)
-heights <- heights %>% filter(income < 150000)
+heights <- heights |> filter(income < 150000)
 nrow(heights) / n
 ```

@@ -33,7 +33,7 @@ I'm going to record the original number of observations in `n`. We'll come back
 Also, you can see that heights have been rounded to the nearest inch so using boxplots will make it easier to see the pattern. We'll also remove the very tall and very short people so we can focus on the most typical heights:

 ```{r}
-heights <- heights %>% filter(between(height, 59, 78))
+heights <- heights |> filter(between(height, 59, 78))
 nrow(heights) / n

 ggplot(heights, aes(height, income, group = height)) +
@@ -107,8 +107,8 @@ What happens if we also include `sex` in the model?

 ```{r}
 h2 <- lm(income ~ height * sex, data = heights)
-grid <- heights %>%
-  expand(height, sex) %>%
+grid <- heights |>
+  expand(height, sex) |>
   add_predictions(h2, "income")

 ggplot(heights, aes(height, income)) +
@@ -123,8 +123,8 @@ Need to commment about predictions for tall women and short men - there is not a

 ```{r}
 h3 <- lm(income ~ height + sex, data = heights)
-grid <- heights %>%
-  expand(height, sex) %>%
+grid <- heights |>
+  expand(height, sex) |>
   gather_predictions(h2, h3)

 ggplot(grid, aes(height, pred, colour = sex)) +
@@ -140,7 +140,7 @@ But before we add a variable to our model, we need to do a little EDA + cleaning

 ```{r}
 ggplot(heights, aes(education)) + geom_bar()
-heights_ed <- heights %>% filter(education >= 12)
+heights_ed <- heights |> filter(education >= 12)
 nrow(heights) / n
 ```

@@ -154,8 +154,8 @@ he2 <- lm(income ~ height * education, data = heights_ed)
 How can we visualise the results of this model? One way to think about it is as a surface: we have a 2d grid of height and education, and each point on that grid gets a predicted income.

 ```{r}
-grid <- heights_ed %>%
-  expand(height, education) %>%
+grid <- heights_ed |>
+  expand(height, education) |>
   gather_predictions(he1, he2)

 ggplot(grid, aes(height, education, fill = pred)) +
@@ -177,21 +177,21 @@ ggplot(grid, aes(education, pred, group = height)) +
 One of the big advantages to `+` instead of `*` is that because the terms are independent we display them using two simple plots instead of one complex plot:

 ```{r}
-heights_ed %>%
+heights_ed |>
   expand(
     height = seq_range(height, 10),
     education = mean(education, na.rm = TRUE)
-  ) %>%
-  add_predictions(he1, "income") %>%
+  ) |>
+  add_predictions(he1, "income") |>
   ggplot(aes(height, income)) +
   geom_line()

-heights_ed %>%
+heights_ed |>
   expand(
     height = mean(height, na.rm = TRUE),
     education = seq_range(education, 10)
-  ) %>%
-  add_predictions(he1, "income") %>%
+  ) |>
+  add_predictions(he1, "income") |>
   ggplot(aes(education, income)) +
   geom_line()
 ```
@@ -226,8 +226,8 @@ tidy(hes)
 ```

 ```{r}
-heights %>%
-  group_by(sex) %>%
+heights |>
+  group_by(sex) |>
   do(glance(lm(income ~ height, data = .)))
 ```

@@ -252,9 +252,9 @@ One way to introduce non-linearity into our model is to use transformed variants
 mod_e1 <- lm(income ~ education, data = heights_ed)
 mod_e2 <- lm(income ~ education + I(education ^ 2) + I(education ^ 3), data = heights_ed)

-heights_ed %>%
-  expand(education) %>%
-  gather_predictions(mod_e1, mod_e2) %>%
+heights_ed |>
+  expand(education) |>
+  gather_predictions(mod_e1, mod_e2) |>
   ggplot(aes(education, pred, colour = model)) +
   geom_point() +
   geom_line()
@@ -267,9 +267,9 @@ mod_e1 <- lm(income ~ education, data = heights_ed)
 mod_e2 <- lm(income ~ poly(education, 2), data = heights_ed)
 mod_e3 <- lm(income ~ poly(education, 3), data = heights_ed)

-heights_ed %>%
-  expand(education) %>%
-  gather_predictions(mod_e1, mod_e2, mod_e3) %>%
+heights_ed |>
+  expand(education) |>
+  gather_predictions(mod_e1, mod_e2, mod_e3) |>
   ggplot(aes(education, pred, colour = model)) +
   geom_point() +
   geom_line()
@@ -278,8 +278,8 @@ heights_ed %>%
 However: there's one major problem with using `poly()`: outside the range of the data, polynomials are going to rapidly shoot off to positive or negative infinity.

 ```{r}
-tibble(education = seq(5, 25)) %>%
-  gather_predictions(mod_e1, mod_e2, mod_e3) %>%
+tibble(education = seq(5, 25)) |>
+  gather_predictions(mod_e1, mod_e2, mod_e3) |>
   ggplot(aes(education, pred, colour = model)) +
   geom_line()
 ```
@@ -292,8 +292,8 @@ mod_e1 <- lm(income ~ education, data = heights_ed)
 mod_e2 <- lm(income ~ ns(education, 2), data = heights_ed)
 mod_e3 <- lm(income ~ ns(education, 3), data = heights_ed)

-tibble(education = seq(5, 25)) %>%
-  gather_predictions(mod_e1, mod_e2, mod_e3) %>%
+tibble(education = seq(5, 25)) |>
+  gather_predictions(mod_e1, mod_e2, mod_e3) |>
   ggplot(aes(education, pred, colour = model)) +
   geom_line()
 ```

@@ -65,10 +65,10 @@ If you're unlucky, you may need to repeat this procedure.)
 ```{r}
 tibble::tibble(
   i = seq_along(issues),
-  names = issues %>% map(names)
-) %>%
-  tidyr::unnest(names) %>%
-  table() %>%
+  names = issues |> map(names)
+) |>
+  tidyr::unnest(names) |>
+  table() |>
   t()
 ```

@@ -80,9 +80,9 @@ To work with this sort of data, you typically want to turn it into a data frame

 ```{r}

-issues %>% map_int("id")
-issues %>% map_lgl("locked")
-issues %>% map_chr("state")
+issues |> map_int("id")
+issues |> map_lgl("locked")
+issues |> map_chr("state")
 ```

 You can use the same technique to extract more deeply nested structure.
@@ -90,23 +90,23 @@ For example, imagine you want to extract the name and id of the user.
 You could do that in two steps:

 ```{r}
-users <- issues %>% map("user")
-users %>% map_chr("login")
-users %>% map_int("id")
+users <- issues |> map("user")
+users |> map_chr("login")
+users |> map_int("id")
 ```

 But by supplying a character *vector* to `map_*`, you can do it in one:

 ```{r}
-issues %>% map_chr(c("user", "login"))
-issues %>% map_int(c("user", "id"))
+issues |> map_chr(c("user", "login"))
+issues |> map_int(c("user", "id"))
 ```

 What happens if that path is missing in some of the elements?
 For example, let's try and extract the HTML url to the pull request:

 ```{r, error = TRUE}
-issues %>% map_chr(c("pull_request", "html_url"))
+issues |> map_chr(c("pull_request", "html_url"))
 ```

 Unfortunately that doesn't work.
@@ -114,7 +114,7 @@ Whenever you see an error from purrr complaining about the "type" of the result,
 You can diagnose the problem more easily if you use `map()`:

 ```{r}
-issues %>% map(c("pull_request", "html_url"))
+issues |> map(c("pull_request", "html_url"))
 ```

 To get the results into a character vector, we need to tell purrr what it should change `NULL` to.
@@ -122,7 +122,7 @@ You can do that with the `.null` argument.
 The most common value to use is `NA`:

 ```{r}
-issues %>% map_chr(c("pull_request", "html_url"), .null = NA)
+issues |> map_chr(c("pull_request", "html_url"), .null = NA)
 ```

 (You might wonder why that isn't the default value since it's so useful. Well, if it was the default, you'd never get an error message if you had a typo in the names. You'd just get a vector of missing values. That would be annoying to debug because it's a silent failure.)
@@ -130,7 +130,7 @@ issues %>% map_chr(c("pull_request", "html_url"), .null = NA)
 It's possible to mix position and named indexing by using a list.

 ```{r}
-issues %>% map_chr(list("pull_request", 1), .null = NA)
+issues |> map_chr(list("pull_request", 1), .null = NA)
 ```

 ## Removing a level of hierarchy
@@ -170,8 +170,8 @@ x <- list(
   x = list(a = 1, b = 3, c = 5),
   y = list(a = 2, b = 4, c = 6)
 )
-x %>% str()
-x %>% transpose() %>% str()
+x |> str()
+x |> transpose() |> str()
 ```

 Graphically, this looks like:
@@ -193,7 +193,7 @@ Many JSON APIs represent data frames in a row-based format, rather than R's colu

 ```{r}
 df <- tibble::tibble(x = 1:3, y = c("a", "b", "c"))
-df %>% transpose() %>% str()
+df |> transpose() |> str()
 ```

 ## Turning lists into data frames
@@ -211,7 +211,7 @@ df %>% transpose() %>% str()

 ```{r, eval = FALSE}
 files <- dir("data", pattern = "\\.csv$")
-files %>%
-  set_names(., basename(.)) %>%
+files |>
+  set_names(., basename(.)) |>
   map_df(safely(readr::read_csv), .id = "filename")
 ```

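Note that `set_names(., basename(.))` leans on magrittr's `.` placeholder, which the native pipe does not supply, so this hunk is not a pure drop-in swap like the others. One `|>`-compatible spelling (a hedged sketch, not what the commit records) wraps the call in an anonymous function:

```r
library(purrr)

files <- dir("data", pattern = "\\.csv$")
files |>
  (\(x) set_names(x, basename(x)))() |>  # R >= 4.1 anonymous function
  map_df(safely(readr::read_csv), .id = "filename")
```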
@@ -101,7 +101,7 @@ df <- tibble(
   y = true_model(x)
 )

-df %>%
+df |>
   ggplot(aes(x, y)) +
   geom_point()
 ```
@@ -117,12 +117,12 @@ my_model <- function(df) {
 mod <- my_model(df)
 rmse(mod, df)

-grid <- df %>%
+grid <- df |>
   expand(x = seq_range(x, 50))
-preds <- grid %>%
+preds <- grid |>
   add_predictions(mod, var = "y")

-df %>%
+df |>
   ggplot(aes(x, y)) +
   geom_line(data = preds) +
   geom_point()
@@ -148,7 +148,7 @@ models <- tibble(
   rmse = map2_dbl(mod, list(df), rmse)
 )

-models %>%
+models |>
   ggplot(aes(n, rmse)) +
   geom_line(colour = "grey70") +
   geom_point(size = 3)
@@ -161,14 +161,14 @@ There are two approaches to help you get around this problem.
 I'll introduce them briefly here, and then we'll go into more depth in the following sections.

 ```{r}
-boot <- bootstrap(df, 100) %>%
+boot <- bootstrap(df, 100) |>
   mutate(
     mod = map(strap, my_model),
     pred = map2(list(grid), mod, add_predictions)
   )

-boot %>%
-  unnest(pred) %>%
+boot |>
+  unnest(pred) |>
   ggplot(aes(x, pred, group = .id)) +
   geom_line(alpha = 1/3)
 ```
@@ -191,7 +191,7 @@ The following code generates 100 test-training splits, holding out 20% of the da
 We then fit a model to the training set, and evaluate the error on the test set:

 ```{r}
-cv <- crossv_mc(df, 100) %>%
+cv <- crossv_mc(df, 100) |>
   mutate(
     mod = map(train, my_model),
     rmse = map2_dbl(mod, test, rmse)
@@ -203,7 +203,7 @@ Obviously, a plot is going to help us see distribution more easily.
 I've added our original estimate of the model error as a white vertical line (where the same dataset is used for both training and testing), and you can see it's very optimistic.

 ```{r}
-cv %>%
+cv |>
   ggplot(aes(rmse)) +
   geom_ref_line(v = rmse(mod, df)) +
   geom_freqpoly(binwidth = 0.2) +
@@ -215,8 +215,8 @@ These represent samples where we ended up with a few cases on all with low value
 Let's take a look:

 ```{r}
-filter(cv, rmse > 1.5) %>%
-  unnest(map(train, as.data.frame)) %>%
+filter(cv, rmse > 1.5) |>
+  unnest(map(train, as.data.frame)) |>
   ggplot(aes(x, .id)) +
   geom_point() +
   xlim(0, 1)

@@ -88,7 +88,7 @@ One easy place to start is to find the vertical distance between each point and
 (Note that I've shifted the x values slightly so you can see the individual distances.)

 ```{r, echo = FALSE}
-dist1 <- sim1 %>%
+dist1 <- sim1 |>
   mutate(
     dodge = rep(c(-1, 0, 1) / 20, 10),
     x1 = x + dodge,
@@ -137,7 +137,7 @@ sim1_dist <- function(a1, a2) {
   measure_distance(c(a1, a2), sim1)
 }

-models <- models %>%
+models <- models |>
   mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
 models
 ```
@@ -171,10 +171,10 @@ I picked the parameters of the grid roughly by looking at where the best models
 grid <- expand.grid(
   a1 = seq(-5, 20, length = 25),
   a2 = seq(1, 3, length = 25)
-) %>%
+) |>
   mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

-grid %>%
+grid |>
   ggplot(aes(a1, a2)) +
   geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
   geom_point(aes(colour = -dist))
@@ -282,7 +282,7 @@ The easiest way to do that is to use `modelr::data_grid()`.
 Its first argument is a data frame, and for each subsequent argument it finds the unique variables and then generates all combinations:

 ```{r}
-grid <- sim1 %>%
+grid <- sim1 |>
   data_grid(x)
 grid
 ```
@@ -294,7 +294,7 @@ We'll use `modelr::add_predictions()` which takes a data frame and a model.
 It adds the predictions from the model to a new column in the data frame:

 ```{r}
-grid <- grid %>%
+grid <- grid |>
   add_predictions(sim1_mod)
 grid
 ```
@@ -324,7 +324,7 @@ Note, however, that we use the original dataset, not a manufactured grid.
 This is because to compute residuals we need actual y values.

 ```{r}
-sim1 <- sim1 %>%
+sim1 <- sim1 |>
   add_residuals(sim1_mod)
 sim1
 ```
@@ -444,8 +444,8 @@ We can fit a model to it, and generate predictions:
 ```{r}
 mod2 <- lm(y ~ x, data = sim2)

-grid <- sim2 %>%
-  data_grid(x) %>%
+grid <- sim2 |>
+  data_grid(x) |>
   add_predictions(mod2)
 grid
 ```
@@ -463,7 +463,7 @@ You can't make predictions about levels that you didn't observe.
 Sometimes you'll do this by accident so it's good to recognise this error message:

 ```{r, error = TRUE}
-tibble(x = "e") %>%
+tibble(x = "e") |>
   add_predictions(mod2)
 ```

@@ -501,8 +501,8 @@ To visualise these models we need two new tricks:
 Together this gives us:

 ```{r}
-grid <- sim3 %>%
-  data_grid(x1, x2) %>%
+grid <- sim3 |>
+  data_grid(x1, x2) |>
   gather_predictions(mod1, mod2)
 grid
 ```
@@ -524,7 +524,7 @@ We can take look at the residuals.
 Here I've facetted by both model and `x2` because it makes it easier to see the pattern within each group.

 ```{r}
-sim3 <- sim3 %>%
+sim3 <- sim3 |>
   gather_residuals(mod1, mod2)

 ggplot(sim3, aes(x1, resid, colour = x2)) +
@@ -547,11 +547,11 @@ Initially things proceed almost identically to the previous example:
 mod1 <- lm(y ~ x1 + x2, data = sim4)
 mod2 <- lm(y ~ x1 * x2, data = sim4)

-grid <- sim4 %>%
+grid <- sim4 |>
   data_grid(
     x1 = seq_range(x1, 5),
     x2 = seq_range(x2, 5)
-  ) %>%
+  ) |>
   gather_predictions(mod1, mod2)
 grid
 ```
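`seq_range()` (from modelr), used in the hunk above, picks `n` evenly spaced values spanning the range of its input; for example (editorial illustration, not part of the commit):

```r
library(modelr)

seq_range(c(1, 10), n = 5)
#> 1.00  3.25  5.50  7.75 10.00
```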
@@ -687,8 +687,8 @@ mod3 <- lm(y ~ ns(x, 3), data = sim5)
 mod4 <- lm(y ~ ns(x, 4), data = sim5)
 mod5 <- lm(y ~ ns(x, 5), data = sim5)

-grid <- sim5 %>%
-  data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>%
+grid <- sim5 |>
+  data_grid(x = seq_range(x, n = 50, expand = 0.1)) |>
   gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")

 ggplot(sim5, aes(x, y)) +

@@ -71,8 +71,8 @@ But first, lets make a couple of tweaks to the diamonds dataset to make it easie
 2.  Log-transform the carat and price variables.

 ```{r}
-diamonds2 <- diamonds %>%
-  filter(carat <= 2.5) %>%
+diamonds2 <- diamonds |>
+  filter(carat <= 2.5) |>
   mutate(lprice = log2(price), lcarat = log2(carat))
 ```

@@ -95,10 +95,10 @@ Then we look at what the model tells us about the data.
 Note that I back transform the predictions, undoing the log transformation, so I can overlay the predictions on the raw data:

 ```{r}
-grid <- diamonds2 %>%
-  data_grid(carat = seq_range(carat, 20)) %>%
-  mutate(lcarat = log2(carat)) %>%
-  add_predictions(mod_diamond, "lprice") %>%
+grid <- diamonds2 |>
+  data_grid(carat = seq_range(carat, 20)) |>
+  mutate(lcarat = log2(carat)) |>
+  add_predictions(mod_diamond, "lprice") |>
   mutate(price = 2 ^ lprice)

 ggplot(diamonds2, aes(carat, price)) +
@@ -113,7 +113,7 @@ This is probably because no diamond in this dataset costs more than \$19,000.
 Now we can look at the residuals, which verifies that we've successfully removed the strong linear pattern:

 ```{r}
-diamonds2 <- diamonds2 %>%
+diamonds2 <- diamonds2 |>
   add_residuals(mod_diamond, "lresid")

 ggplot(diamonds2, aes(lcarat, lresid)) +
@@ -147,8 +147,8 @@ Fortunately, they're currently all independent which means that we can plot them
 To make the process a little easier, we're going to use the `.model` argument to `data_grid`:

 ```{r}
-grid <- diamonds2 %>%
-  data_grid(cut, .model = mod_diamond2) %>%
+grid <- diamonds2 |>
+  data_grid(cut, .model = mod_diamond2) |>
   add_predictions(mod_diamond2)
 grid

@@ -160,7 +160,7 @@ If the model needs variables that you haven't explicitly supplied, `data_grid()`
 For continuous variables, it uses the median, and for categorical variables it uses the most common value (or values, if there's a tie).

 ```{r}
-diamonds2 <- diamonds2 %>%
+diamonds2 <- diamonds2 |>
   add_residuals(mod_diamond2, "lresid2")

 ggplot(diamonds2, aes(lcarat, lresid2)) +
@@ -171,11 +171,11 @@ This plot indicates that there are some diamonds with quite large residuals - re
 It's often useful to look at unusual values individually:

 ```{r}
-diamonds2 %>%
-  filter(abs(lresid2) > 1) %>%
-  add_predictions(mod_diamond2) %>%
-  mutate(pred = round(2 ^ pred)) %>%
-  select(price, pred, carat:table, x:z) %>%
+diamonds2 |>
+  filter(abs(lresid2) > 1) |>
+  add_predictions(mod_diamond2) |>
+  mutate(pred = round(2 ^ pred)) |>
+  select(price, pred, carat:table, x:z) |>
   arrange(price)
 ```
@@ -203,9 +203,9 @@ This is a really small dataset --- only 365 rows and 2 columns --- and we're not
 Let's get started by counting the number of flights per day and visualising it with ggplot2.

 ```{r}
-daily <- flights %>%
-  mutate(date = make_date(year, month, day)) %>%
-  group_by(date) %>%
+daily <- flights |>
+  mutate(date = make_date(year, month, day)) |>
+  group_by(date) |>
   summarise(n = n())
 daily

@@ -219,7 +219,7 @@ Understanding the long-term trend is challenging because there's a very strong d
 Let's start by looking at the distribution of flight numbers by day-of-week:

 ```{r}
-daily <- daily %>%
+daily <- daily |>
   mutate(wday = wday(date, label = TRUE))
 ggplot(daily, aes(wday, n)) +
   geom_boxplot()
@@ -234,8 +234,8 @@ First, we fit the model, and display its predictions overlaid on the original da
 ```{r}
 mod <- lm(n ~ wday, data = daily)

-grid <- daily %>%
-  data_grid(wday) %>%
+grid <- daily |>
+  data_grid(wday) |>
   add_predictions(mod, "n")

 ggplot(daily, aes(wday, n)) +
@@ -246,9 +246,9 @@ ggplot(daily, aes(wday, n)) +
 Next we compute and visualise the residuals:

 ```{r}
-daily <- daily %>%
+daily <- daily |>
   add_residuals(mod)
-daily %>%
+daily |>
   ggplot(aes(date, resid)) +
   geom_ref_line(h = 0) +
   geom_line()
@@ -272,7 +272,7 @@ This plot is useful because now that we've removed much of the large day-of-week
 2.  There are some days with far fewer flights than expected:

     ```{r}
-    daily %>%
+    daily |>
       filter(resid < -100)
     ```

@@ -284,7 +284,7 @@ This plot is useful because now that we've removed much of the large day-of-week
     We can highlight that trend with `geom_smooth()`:

     ```{r}
-    daily %>%
+    daily |>
       ggplot(aes(date, resid)) +
       geom_ref_line(h = 0) +
       geom_line(colour = "grey50") +
@@ -301,8 +301,8 @@ Let's first tackle our failure to accurately predict the number of flights on Sa
 A good place to start is to go back to the raw numbers, focussing on Saturdays:

 ```{r}
-daily %>%
-  filter(wday == "Sat") %>%
+daily |>
+  filter(wday == "Sat") |>
   ggplot(aes(date, n)) +
   geom_point() +
   geom_line() +
@ -329,11 +329,11 @@ term <- function(date) {
|
|||
)
|
||||
}
|
||||
|
||||
daily <- daily %>%
|
||||
daily <- daily |>
|
||||
mutate(term = term(date))
|
||||
|
||||
daily %>%
|
||||
filter(wday == "Sat") %>%
|
||||
daily |>
|
||||
filter(wday == "Sat") |>
|
||||
ggplot(aes(date, n, colour = term)) +
|
||||
geom_point(alpha = 1/3) +
|
||||
geom_line() +
|
||||
|
@ -345,7 +345,7 @@ daily %>%
|
|||
It's useful to see how this new variable affects the other days of the week:
|
||||
|
||||
```{r}
|
||||
daily %>%
|
||||
daily |>
|
||||
ggplot(aes(wday, n, colour = term)) +
|
||||
geom_boxplot()
|
||||
```
|
||||
|
@ -357,8 +357,8 @@ This improves our model, but not as much as we might hope:
|
|||
mod1 <- lm(n ~ wday, data = daily)
|
||||
mod2 <- lm(n ~ wday * term, data = daily)
|
||||
|
||||
daily %>%
|
||||
gather_residuals(without_term = mod1, with_term = mod2) %>%
|
||||
daily |>
|
||||
gather_residuals(without_term = mod1, with_term = mod2) |>
|
||||
ggplot(aes(date, resid, colour = model)) +
|
||||
geom_line(alpha = 0.75)
|
||||
```
|
||||
|
@ -366,8 +366,8 @@ daily %>%
|
|||
We can see the problem by overlaying the predictions from the model on to the raw data:
|
||||
|
||||
```{r}
|
||||
grid <- daily %>%
|
||||
data_grid(wday, term) %>%
|
||||
grid <- daily |>
|
||||
data_grid(wday, term) |>
|
||||
add_predictions(mod2, "n")
|
||||
|
||||
ggplot(daily, aes(wday, n)) +
|
||||
|
@ -383,8 +383,8 @@ This greatly reduces the impact of the outliers on our estimates, and gives a mo
|
|||
```{r, warn = FALSE}
|
||||
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
|
||||
|
||||
daily %>%
|
||||
add_residuals(mod3, "resid") %>%
|
||||
daily |>
|
||||
add_residuals(mod3, "resid") |>
|
||||
ggplot(aes(date, resid)) +
|
||||
geom_hline(yintercept = 0, size = 2, colour = "white") +
|
||||
geom_line()
|
||||
|
@ -399,7 +399,7 @@ For example, we could write:
|
|||
|
||||
```{r}
|
||||
compute_vars <- function(data) {
|
||||
data %>%
|
||||
data |>
|
||||
mutate(
|
||||
term = term(date),
|
||||
wday = wday(date, label = TRUE)
|
||||
|
@ -430,9 +430,9 @@ A simple linear trend isn't adequate, so we could try using a natural spline to
|
|||
library(splines)
|
||||
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
|
||||
|
||||
daily %>%
|
||||
data_grid(wday, date = seq_range(date, n = 13)) %>%
|
||||
add_predictions(mod) %>%
|
||||
daily |>
|
||||
data_grid(wday, date = seq_range(date, n = 13)) |>
|
||||
add_predictions(mod) |>
|
||||
ggplot(aes(date, pred, colour = wday)) +
|
||||
geom_line() +
|
||||
geom_point()
|
||||
|
@ -451,7 +451,7 @@ It's a good sign when you get the same signal from different approaches.
|
|||
How would these days generalise to another year?
|
||||
|
||||
```{r}
|
||||
daily %>%
|
||||
daily |>
|
||||
slice_max(n = 3, resid)
|
||||
```
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ In this case study, we're going to focus on just three variables to answer the q
|
|||
A good place to start is with a plot:
|
||||
|
||||
```{r}
|
||||
gapminder %>%
|
||||
gapminder |>
|
||||
ggplot(aes(year, lifeExp, group = country)) +
|
||||
geom_line(alpha = 1/3)
|
||||
```
|
||||
|
@ -79,20 +79,20 @@ You already know how to do that if we had a single country:
|
|||
|
||||
```{r, out.width = "33%", fig.asp = 1, fig.width = 3, fig.align='default'}
|
||||
nz <- filter(gapminder, country == "New Zealand")
|
||||
nz %>%
|
||||
nz |>
|
||||
ggplot(aes(year, lifeExp)) +
|
||||
geom_line() +
|
||||
ggtitle("Full data = ")
|
||||
|
||||
nz_mod <- lm(lifeExp ~ year, data = nz)
|
||||
nz %>%
|
||||
add_predictions(nz_mod) %>%
|
||||
nz |>
|
||||
add_predictions(nz_mod) |>
|
||||
ggplot(aes(year, pred)) +
|
||||
geom_line() +
|
||||
ggtitle("Linear trend + ")
|
||||
|
||||
nz %>%
|
||||
add_residuals(nz_mod) %>%
|
||||
nz |>
|
||||
add_residuals(nz_mod) |>
|
||||
ggplot(aes(year, resid)) +
|
||||
geom_hline(yintercept = 0, colour = "white", size = 3) +
|
||||
geom_line() +
|
||||
|
@ -111,8 +111,8 @@ To do that, we need a new data structure: the **nested data frame**.
|
|||
To create a nested data frame we start with a grouped data frame, and "nest" it:
|
||||
|
||||
```{r}
|
||||
by_country <- gapminder %>%
|
||||
group_by(country, continent) %>%
|
||||
by_country <- gapminder |>
|
||||
group_by(country, continent) |>
|
||||
nest()
|
||||
|
||||
by_country
|
||||
|
@ -163,7 +163,7 @@ In other words, instead of creating a new object in the global environment, we'r
|
|||
That's a job for `dplyr::mutate()`:
|
||||
|
||||
```{r}
|
||||
by_country <- by_country %>%
|
||||
by_country <- by_country |>
|
||||
mutate(model = map(data, country_model))
|
||||
by_country
|
||||
```
|
||||
|
@ -172,9 +172,9 @@ This has a big advantage: because all the related objects are stored together, y
|
|||
The semantics of the data frame takes care of that for you:
|
||||
|
||||
```{r}
|
||||
by_country %>%
|
||||
by_country |>
|
||||
filter(continent == "Europe")
|
||||
by_country %>%
|
||||
by_country |>
|
||||
arrange(continent, country)
|
||||
```
|
||||
|
||||
|
@ -188,7 +188,7 @@ Now we have 142 data frames and 142 models.
|
|||
To compute the residuals, we need to call `add_residuals()` with each model-data pair:
|
||||
|
||||
```{r}
|
||||
by_country <- by_country %>%
|
||||
by_country <- by_country |>
|
||||
mutate(
|
||||
resids = map2(data, model, add_residuals)
|
||||
)
|
||||
|
@ -209,7 +209,7 @@ Note that each regular column is repeated once for each row of the nested tibble
|
|||
Now we have regular data frame, we can plot the residuals:
|
||||
|
||||
```{r}
|
||||
resids %>%
|
||||
resids |>
|
||||
ggplot(aes(year, resid)) +
|
||||
geom_line(aes(group = country), alpha = 1 / 3) +
|
||||
geom_smooth(se = FALSE)
|
||||
|
@ -219,7 +219,7 @@ resids %>%
|
|||
Facetting by continent is particularly revealing:
|
||||
|
||||
```{r}
|
||||
resids %>%
|
||||
resids |>
|
||||
ggplot(aes(year, resid, group = country)) +
|
||||
geom_line(alpha = 1 / 3) +
|
||||
facet_wrap(~continent)
|
||||
|
@ -245,9 +245,9 @@ broom::glance(nz_mod)
|
|||
We can use `mutate()` and `unnest()` to create a data frame with a row for each country:
|
||||
|
||||
```{r}
|
||||
glance <- by_country %>%
|
||||
mutate(glance = map(model, broom::glance)) %>%
|
||||
select(country, continent, glance) %>%
|
||||
glance <- by_country |>
|
||||
mutate(glance = map(model, broom::glance)) |>
|
||||
select(country, continent, glance) |>
|
||||
unnest(glance)
|
||||
glance
|
||||
```
|
||||
|
@ -257,7 +257,7 @@ glance
|
|||
With this data frame in hand, we can start to look for models that don't fit well:
|
||||
|
||||
```{r}
|
||||
glance %>%
|
||||
glance |>
|
||||
arrange(r.squared)
|
||||
```
|
||||
|
||||
|
@ -266,7 +266,7 @@ Let's double check that with a plot.
|
|||
Here we have a relatively small number of observations and a discrete variable, so `geom_jitter()` is effective:
|
||||
|
||||
```{r}
|
||||
glance %>%
|
||||
glance |>
|
||||
ggplot(aes(continent, r.squared)) +
|
||||
geom_jitter(width = 0.5)
|
||||
```
|
||||
|
@ -276,8 +276,8 @@ We could pull out the countries with particularly bad $R^2$ and plot the data:
|
|||
```{r}
|
||||
bad_fit <- filter(glance, r.squared < 0.25)
|
||||
|
||||
gapminder %>%
|
||||
semi_join(bad_fit, by = "country") %>%
|
||||
gapminder |>
|
||||
semi_join(bad_fit, by = "country") |>
|
||||
ggplot(aes(year, lifeExp, colour = country)) +
|
||||
geom_line()
|
||||
```
|
||||
|
@ -377,15 +377,15 @@ So far you've seen how to use it with a grouped data frame.
|
|||
When applied to a grouped data frame, `nest()` keeps the grouping columns as is, and bundles everything else into the list-column:
|
||||
|
||||
```{r}
|
||||
gapminder %>%
|
||||
group_by(country, continent) %>%
|
||||
gapminder |>
|
||||
group_by(country, continent) |>
|
||||
nest()
|
||||
```
|
||||
|
||||
You can also use it on an ungrouped data frame, specifying which columns you want to nest:
|
||||
|
||||
```{r}
|
||||
gapminder %>%
|
||||
gapminder |>
|
||||
nest(data = c(year:gdpPercap))
|
||||
```
|
||||
|
||||
|
@ -402,15 +402,15 @@ df <- tribble(
|
|||
"d,e,f,g"
|
||||
)
|
||||
|
||||
df %>%
|
||||
df |>
|
||||
mutate(x2 = stringr::str_split(x1, ","))
|
||||
```
|
||||
|
||||
`unnest()` knows how to handle these lists of vectors:
|
||||
|
||||
```{r}
|
||||
df %>%
|
||||
mutate(x2 = stringr::str_split(x1, ",")) %>%
|
||||
df |>
|
||||
mutate(x2 = stringr::str_split(x1, ",")) |>
|
||||
unnest(x2)
|
||||
```
|
||||
|
||||
|
@ -427,7 +427,7 @@ sim <- tribble(
|
|||
"rpois", list(lambda = 10)
|
||||
)
|
||||
|
||||
sim %>%
|
||||
sim |>
|
||||
mutate(sims = invoke_map(f, params, n = 10))
|
||||
```
|
||||
|
||||
|
@ -440,8 +440,8 @@ One restriction of `summarise()` is that it only works with summary functions th
|
|||
That means that you can't use it with functions like `quantile()` that return a vector of arbitrary length:
|
||||
|
||||
```{r, error = TRUE}
|
||||
mtcars %>%
|
||||
group_by(cyl) %>%
|
||||
mtcars |>
|
||||
group_by(cyl) |>
|
||||
summarise(q = quantile(mpg))
|
||||
```
|
||||
|
||||
|
@ -449,8 +449,8 @@ You can however, wrap the result in a list!
|
|||
This obeys the contract of `summarise()`, because each summary is now a list (a vector) of length 1.
|
||||
|
||||
```{r}
|
||||
mtcars %>%
|
||||
group_by(cyl) %>%
|
||||
mtcars |>
|
||||
group_by(cyl) |>
|
||||
summarise(q = list(quantile(mpg)))
|
||||
```
|
||||
|
||||
|
@ -458,9 +458,9 @@ To make useful results with unnest, you'll also need to capture the probabilitie
|
|||
|
||||
```{r}
|
||||
probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
|
||||
mtcars %>%
|
||||
group_by(cyl) %>%
|
||||
summarise(p = list(probs), q = list(quantile(mpg, probs))) %>%
|
||||
mtcars |>
|
||||
group_by(cyl) |>
|
||||
summarise(p = list(probs), q = list(quantile(mpg, probs))) |>
|
||||
unnest(c(p, q))
|
||||
```
|
||||
|
||||
|
@ -486,7 +486,7 @@ The advantage of this structure is that it generalises in a straightforward way
|
|||
Now if you want to iterate over names and values in parallel, you can use `map2()`:
|
||||
|
||||
```{r}
|
||||
df %>%
|
||||
df |>
|
||||
mutate(
|
||||
smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
|
||||
)
|
||||
|
@ -503,9 +503,9 @@ df %>%
|
|||
Why isn't that helpful here?
|
||||
|
||||
```{r}
|
||||
mtcars %>%
|
||||
group_by(cyl) %>%
|
||||
summarise(q = list(quantile(mpg))) %>%
|
||||
mtcars |>
|
||||
group_by(cyl) |>
|
||||
summarise(q = list(quantile(mpg))) |>
|
||||
unnest(q)
|
||||
```
|
||||
|
||||
|
@ -513,8 +513,8 @@ df %>%
|
|||
Why might it be useful?
|
||||
|
||||
```{r, eval = FALSE}
|
||||
mtcars %>%
|
||||
group_by(cyl) %>%
|
||||
mtcars |>
|
||||
group_by(cyl) |>
|
||||
summarise_all(list(list))
|
||||
```
|
||||
|
||||
|
@ -542,7 +542,7 @@ df <- tribble(
|
|||
runif(5)
|
||||
)
|
||||
|
||||
df %>% mutate(
|
||||
df |> mutate(
|
||||
type = map_chr(x, typeof),
|
||||
length = map_int(x, length)
|
||||
)
|
||||
|
@ -561,7 +561,7 @@ df <- tribble(
|
|||
list(a = 1, b = 2),
|
||||
list(a = 2, c = 4)
|
||||
)
|
||||
df %>% mutate(
|
||||
df |> mutate(
|
||||
a = map_dbl(x, "a"),
|
||||
b = map_dbl(x, "b", .null = NA_real_)
|
||||
)
|
||||
|
@ -573,7 +573,7 @@ df %>% mutate(
|
|||
For example, in the following very simple example we repeat the first row 4 times (because there the first element of `y` has length four), and the second row once:
|
||||
|
||||
```{r}
|
||||
tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
|
||||
tibble(x = 1:2, y = list(1:4, 1)) |> unnest(y)
|
||||
```
|
||||
|
||||
This means that you can't simultaneously unnest two columns that contain different number of elements:
|
||||
|
@ -587,7 +587,7 @@ df1 <- tribble(
|
|||
2, "c", 3
|
||||
)
|
||||
df1
|
||||
df1 %>% unnest(c(y, z))
|
||||
df1 |> unnest(c(y, z))
|
||||
|
||||
# Doesn't work because y and z have different number of elements
|
||||
df2 <- tribble(
|
||||
|
@ -596,7 +596,7 @@ df2 <- tribble(
|
|||
2, c("b", "c"), 3
|
||||
)
|
||||
df2
|
||||
df2 %>% unnest(c(y, z))
|
||||
df2 |> unnest(c(y, z))
|
||||
```
|
||||
|
||||
The same principle applies when unnesting list-columns of data frames.
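
For example, a list-column of tibbles unnests row-wise in just the same way (a minimal sketch, assuming R >= 4.1 for `|>`):

```r
library(tidyr)
library(tibble)

df <- tibble(x = 1:2, y = list(tibble(a = 1), tibble(a = 2:3)))
df |> unnest(y)
# x = 1 contributes one row; x = 2 contributes two:
#   x = 1, a = 1
#   x = 2, a = 2
#   x = 2, a = 3
```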

@ -109,9 +109,9 @@ df <- data.frame(

)

df[1:4] %>% sapply(class) %>% str()
df[1:2] %>% sapply(class) %>% str()
df[3:4] %>% sapply(class) %>% str()
df[1:4] |> sapply(class) |> str()
df[1:2] |> sapply(class) |> str()
df[3:4] |> sapply(class) |> str()
```

In the next chapter, you'll learn about the purrr package which provides a variety of alternatives. In this case, you could use `map_chr()` which always returns a character vector: if it can't, it will throw an error. Another option is the base `vapply()` function which takes a third argument indicating what the output should look like.
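
For instance, both of the following return exactly one string per column or fail loudly, instead of silently changing shape the way `sapply()` can (a sketch with a hypothetical stand-in data frame):

```r
library(purrr)

df <- data.frame(a = 1L, b = 2.5, c = "x")  # stand-in for the df above
map_chr(df, class)               # character vector, or an error if any result isn't length-1
vapply(df, class, character(1))  # base equivalent; also errors on multi-class columns (e.g. POSIXct)
```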

54 factors.Rmd

@ -91,7 +91,7 @@ You can do that when creating the factor by setting levels to `unique(x)`, or af

f1 <- factor(x1, levels = unique(x1))
f1

f2 <- x1 %>% factor() %>% fct_inorder()
f2 <- x1 |> factor() |> fct_inorder()
f2
```

@ -117,7 +117,7 @@ When factors are stored in a tibble, you can't see their levels so easily.

One way to see them is with `count()`:

```{r}
gss_cat %>%
gss_cat |>
  count(race)
```

@ -141,7 +141,7 @@ These levels represent valid values that simply did not occur in this dataset.

In dplyr::count() set the `.drop` option to `FALSE`, to show these.

```{r}
gss_cat %>%
gss_cat |>
  count(race,
        .drop = FALSE)
```

@ -168,8 +168,8 @@ It's often useful to change the order of the factor levels in a visualisation.

For example, imagine you want to explore the average number of hours spent watching TV per day across religions:

```{r}
relig_summary <- gss_cat %>%
  group_by(relig) %>%
relig_summary <- gss_cat |>
  group_by(relig) |>
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),

@ -198,8 +198,8 @@ As you start making more complicated transformations, I'd recommend moving them

For example, you could rewrite the plot above as:

```{r, eval = FALSE}
relig_summary %>%
  mutate(relig = fct_reorder(relig, tvhours)) %>%
relig_summary |>
  mutate(relig = fct_reorder(relig, tvhours)) |>
  ggplot(aes(tvhours, relig)) +
  geom_point()
```

@ -207,8 +207,8 @@ relig_summary %>%

What if we create a similar plot looking at how average age varies across reported income level?

```{r}
rincome_summary <- gss_cat %>%
  group_by(rincome) %>%
rincome_summary <- gss_cat |>
  group_by(rincome) |>
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),

@ -238,10 +238,10 @@ Another type of reordering is useful when you are colouring the lines on a plot.

This makes the plot easier to read because the line colours line up with the legend.

```{r, fig.align = "default", out.width = "50%", fig.width = 4}
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
by_age <- gss_cat |>
  filter(!is.na(age)) |>
  count(age, marital) |>
  group_by(age) |>
  mutate(prop = n / sum(n))

ggplot(by_age, aes(age, prop, colour = marital)) +

@ -256,8 +256,8 @@ Finally, for bar plots, you can use `fct_infreq()` to order levels in increasing

You may want to combine with `fct_rev()`.

```{r}
gss_cat %>%
  mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
gss_cat |>
  mutate(marital = marital |> fct_infreq() |> fct_rev()) |>
  ggplot(aes(marital)) +
  geom_bar()
```

@ -280,14 +280,14 @@ It allows you to recode, or change, the value of each level.

For example, take the `gss_cat$partyid`:

```{r}
gss_cat %>% count(partyid)
gss_cat |> count(partyid)
```

The levels are terse and inconsistent.
Let's tweak them to be longer and use a parallel construction.

```{r}
gss_cat %>%
gss_cat |>
  mutate(partyid = fct_recode(partyid,
    "Republican, strong" = "Strong republican",
    "Republican, weak" = "Not str republican",

@ -295,7 +295,7 @@ gss_cat %>%

    "Independent, near dem" = "Ind,near dem",
    "Democrat, weak" = "Not str democrat",
    "Democrat, strong" = "Strong democrat"
  )) %>%
  )) |>
  count(partyid)
```

@ -304,7 +304,7 @@ gss_cat %>%

To combine groups, you can assign multiple old levels to the same new level:

```{r}
gss_cat %>%
gss_cat |>
  mutate(partyid = fct_recode(partyid,
    "Republican, strong" = "Strong republican",
    "Republican, weak" = "Not str republican",

@ -315,7 +315,7 @@ gss_cat %>%

    "Other" = "No answer",
    "Other" = "Don't know",
    "Other" = "Other party"
  )) %>%
  )) |>
  count(partyid)
```

@ -325,13 +325,13 @@ If you want to collapse a lot of levels, `fct_collapse()` is a useful variant of

For each new variable, you can provide a vector of old levels:

```{r}
gss_cat %>%
gss_cat |>
  mutate(partyid = fct_collapse(partyid,
    other = c("No answer", "Don't know", "Other party"),
    rep = c("Strong republican", "Not str republican"),
    ind = c("Ind,near rep", "Independent", "Ind,near dem"),
    dem = c("Not str democrat", "Strong democrat")
  )) %>%
  )) |>
  count(partyid)
```

@ -340,8 +340,8 @@ That's the job of the `fct_lump_*()` family of functions.

`fct_lump_lowfreq()` is a simple starting point that progressively lumps the smallest groups categories into "Other", always keeping "Other" as the smallest category.

```{r}
gss_cat %>%
  mutate(relig = fct_lump_lowfreq(relig)) %>%
gss_cat |>
  mutate(relig = fct_lump_lowfreq(relig)) |>
  count(relig)
```

@ -349,9 +349,9 @@ In this case it's not very helpful: it is true that the majority of Americans in

Instead, we can use the `fct_lump_n()` to specify that we want exactly 10 groups:

```{r}
gss_cat %>%
  mutate(relig = fct_lump_n(relig, n = 10)) %>%
  count(relig, sort = TRUE) %>%
gss_cat |>
  mutate(relig = fct_lump_n(relig, n = 10)) |>
  count(relig, sort = TRUE) |>
  print(n = Inf)
```

@ -844,9 +844,9 @@ library(dplyr)

```

```{r}
mtcars %>%
  show_missings() %>%
  mutate(mpg = ifelse(mpg < 20, NA, mpg)) %>%
mtcars |>
  show_missings() |>
  mutate(mpg = ifelse(mpg < 20, NA, mpg)) |>
  show_missings()
```

@ -123,7 +123,7 @@ However there are a few things we might want to address in this dataset:

  col_types = c("numeric", "text", "text", "text", "text")
)

students <- students %>%
students <- students |>
  mutate(
    age = if_else(age == "five", "5", age),
    age = parse_number(age)

@ -370,17 +370,17 @@ addWorksheet(penguins_species, sheetName = "Chinstrap")

writeDataTable(
  penguins_species,
  sheet = "Adelie",
  x = penguins %>% filter(species == "Adelie")
  x = penguins |> filter(species == "Adelie")
)
writeDataTable(
  penguins_species,
  sheet = "Gentoo",
  x = penguins %>% filter(species == "Gentoo")
  x = penguins |> filter(species == "Gentoo")
)
writeDataTable(
  penguins_species,
  sheet = "Chinstrap",
  x = penguins %>% filter(species == "Chinstrap")
  x = penguins |> filter(species == "Chinstrap")
)
```

@ -32,7 +32,7 @@ By contributing to this book, you agree to abide by its terms.

```{r, results = "asis", echo = FALSE, message = FALSE}
library(dplyr)
contributors <- readr::read_csv("contributors.csv", col_types = list())
contributors <- contributors %>%
contributors <- contributors |>
  mutate(
    link = glue::glue("[\\@{login}](https://github.com/{login})"),
    desc = ifelse(is.na(name), link, glue::glue("{name} ({link})"))

@ -242,10 +242,10 @@ library(dplyr)

# git --no-pager shortlog -ns > contribs.txt
contribs <- readr::read_tsv("contribs.txt", col_names = c("n", "name"))

contribs <- contribs %>%
contribs <- contribs |>
  filter(!name %in% c("hadley", "Garrett", "Hadley Wickham",
                      "Garrett Grolemund")) %>%
  arrange(name) %>%
                      "Garrett Grolemund")) |>
  arrange(name) |>
  mutate(uname = ifelse(!grepl(" ", name), paste0("@", name), name))

cat("Thanks go to all contributers in alphabetical order: ")

100 iteration.Rmd

@ -520,9 +520,9 @@ Compared to using a for loop, focus is on the operation being performed (i.e. `m

This is even more apparent if we use the pipe:

```{r}
df %>% map_dbl(mean)
df %>% map_dbl(median)
df %>% map_dbl(sd)
df |> map_dbl(mean)
df |> map_dbl(median)
df |> map_dbl(sd)
```

There are a few differences between `map_*()` and `col_summary()`:

@ -533,7 +533,7 @@ There are a few differences between `map_*()` and `col_summary()`:

- The second argument, `.f`, the function to apply, can be a formula, a character vector, or an integer vector.
  You'll learn about those handy shortcuts in the next section.

- `map_*()` uses ... ([dot dot dot]) to pass along additional arguments to `.f` each time it's called:
- `map_*()` uses ... (\[dot dot dot\]) to pass along additional arguments to `.f` each time it's called:

    ```{r}
    map_dbl(df, mean, trim = 0.5)

@ -553,16 +553,16 @@ Imagine you want to fit a linear model to each group in a dataset.

The following toy example splits up the `mtcars` dataset into three pieces (one for each value of cylinder) and fits the same linear model to each piece:

```{r}
models <- mtcars %>%
  split(.$cyl) %>%
models <- mtcars |>
  split(mtcars$cyl) |>
  map(function(df) lm(mpg ~ wt, data = df))
```

The syntax for creating an anonymous function in R is quite verbose so purrr provides a convenient shortcut: a one-sided formula.

```{r}
models <- mtcars %>%
  split(.$cyl) %>%
models <- mtcars |>
  split(mtcars$cyl) |>
  map(~lm(mpg ~ wt, data = .x))
```
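
The change from `split(.$cyl)` to `split(mtcars$cyl)` is the one genuinely non-mechanical part of this conversion: magrittr's `%>%` exposes the left-hand side as `.`, but base R's `|>` (as of R 4.1) has no placeholder. A sketch of the two base-pipe options, using the `\(x)` lambda shorthand (also R >= 4.1):

```r
library(purrr)

# Option 1: name the data directly, as the diff does
models <- mtcars |>
  split(mtcars$cyl) |>
  map(\(df) lm(mpg ~ wt, data = df))

# Option 2: pipe into a parenthesised anonymous function
models <- mtcars |>
  (\(d) split(d, d$cyl))() |>
  map(\(df) lm(mpg ~ wt, data = df))
```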

@ -574,16 +574,16 @@ To do that we need to first run `summary()` and then extract the component calle

We could do that using the shorthand for anonymous functions:

```{r}
models %>%
  map(summary) %>%
  map_dbl(~.x$r.squared)
models |>
  map(summary) |>
  map_dbl(~ .x$r.squared)
```

But extracting named components is a common operation, so purrr provides an even shorter shortcut: you can use a string.

```{r}
models %>%
  map(summary) %>%
models |>
  map(summary) |>
  map_dbl("r.squared")
```

@ -591,7 +591,7 @@ You can also use an integer to select elements by position:

```{r}
x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
x %>% map_dbl(2)
x |> map_dbl(2)
```

### Base R

@ -616,8 +616,8 @@ If you're familiar with the apply family of functions in base R, you might have

    )

    threshold <- function(x, cutoff = 0.8) x[x > cutoff]
    x1 %>% sapply(threshold) %>% str()
    x2 %>% sapply(threshold) %>% str()
    x1 |> sapply(threshold) |> str()
    x2 |> sapply(threshold) |> str()
    ```

- `vapply()` is a safe alternative to `sapply()` because you supply an additional argument that defines the type.

@ -684,7 +684,7 @@ When the function fails, the `result` element is `NULL` and the `error` element

```{r}
x <- list(1, 10, "a")
y <- x %>% map(safely(log))
y <- x |> map(safely(log))
str(y)
```

@ -692,16 +692,16 @@ This would be easier to work with if we had two lists: one of all the errors and

That's easy to get with `purrr::transpose()`:

```{r}
y <- y %>% transpose()
y <- y |> transpose()
str(y)
```

It's up to you how to deal with the errors, but typically you'll either look at the values of `x` where `y` is an error, or work with the values of `y` that are ok:

```{r}
is_ok <- y$error %>% map_lgl(is_null)
is_ok <- y$error |> map_lgl(is_null)
x[!is_ok]
y$result[is_ok] %>% flatten_dbl()
y$result[is_ok] |> flatten_dbl()
```

Purrr provides two other useful adverbs:

@ -711,14 +711,14 @@ Purrr provides two other useful adverbs:

    ```{r}
    x <- list(1, 10, "a")
    x %>% map_dbl(possibly(log, NA_real_))
    x |> map_dbl(possibly(log, NA_real_))
    ```

- `quietly()` performs a similar role to `safely()`, but instead of capturing errors, it captures printed output, messages, and warnings:

    ```{r}
    x <- list(1, -1)
    x %>% map(quietly(log)) %>% str()
    x |> map(quietly(log)) |> str()
    ```

## Mapping over multiple arguments

@ -731,8 +731,8 @@ You know how to do that with `map()`:

```{r}
mu <- list(5, 10, -3)
mu %>%
  map(rnorm, n = 5) %>%
mu |>
  map(rnorm, n = 5) |>
  str()
```

@ -741,8 +741,8 @@ One way to do that would be to iterate over the indices and index into vectors o

```{r}
sigma <- list(1, 5, 10)
seq_along(mu) %>%
  map(~rnorm(5, mu[[.x]], sigma[[.x]])) %>%
seq_along(mu) |>
  map(~rnorm(5, mu[[.x]], sigma[[.x]])) |>
  str()
```

@ -750,7 +750,7 @@ But that obfuscates the intent of the code.

Instead we could use `map2()` which iterates over two vectors in parallel:

```{r}
map2(mu, sigma, rnorm, n = 5) %>% str()
map2(mu, sigma, rnorm, n = 5) |> str()
```

`map2()` generates this series of function calls:
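
(The rendered book follows this line with a diagram.) Spelled out, the calls are roughly the following; note that because `n` is matched by name, the two vectors fill `mean` and `sd` positionally:

```r
# with mu = list(5, 10, -3) and sigma = list(1, 5, 10), as above,
# map2(mu, sigma, rnorm, n = 5) generates:
list(
  rnorm(5, 1, n = 5),   # rnorm(n = 5, mean = 5,  sd = 1)
  rnorm(10, 5, n = 5),  # rnorm(n = 5, mean = 10, sd = 5)
  rnorm(-3, 10, n = 5)  # rnorm(n = 5, mean = -3, sd = 10)
)
```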

@ -780,8 +780,8 @@ You might use that if you wanted to vary the mean, standard deviation, and numbe

```{r}
n <- list(1, 3, 5)
args1 <- list(n, mu, sigma)
args1 %>%
  pmap(rnorm) %>%
args1 |>
  pmap(rnorm) |>
  str()
```

@ -796,8 +796,8 @@ That's a little fragile, and makes the code harder to read, so it's better to na

```{r, eval = FALSE}
args2 <- list(mean = mu, sd = sigma, n = n)
args2 %>%
  pmap(rnorm) %>%
args2 |>
  pmap(rnorm) |>
  str()
```

@ -816,7 +816,7 @@ params <- tribble(

  10, 5, 3,
  -3, 10, 5
)
params %>%
params |>
  pmap(rnorm)
```

@ -838,7 +838,7 @@ param <- list(

To handle this case, you can use `invoke_map()`:

```{r}
invoke_map(f, param, n = 5) %>% str()
invoke_map(f, param, n = 5) |> str()
```

```{r, echo = FALSE, out.width = NULL}

@ -858,7 +858,7 @@ sim <- tribble(

  "rnorm", list(sd = 5),
  "rpois", list(lambda = 10)
)
sim %>%
sim |>
  mutate(sim = invoke_map(f, params, n = 10))
```

@ -871,7 +871,7 @@ Here's a very simple example:

```{r}
x <- list(1, "a", 3)

x %>%
x |>
  walk(print)
```

@ -880,8 +880,8 @@ For example, if you had a list of plots and a vector of file names, you could us

```{r, eval = FALSE}
library(ggplot2)
plots <- mtcars %>%
  split(.$cyl) %>%
plots <- mtcars |>
  split(.$cyl) |>
  map(~ggplot(.x, aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
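
Note that this hunk leaves `split(.$cyl)` in place on the `|>` side, unlike the earlier `models` example; the base pipe has no `.` pronoun, so the converted pipeline would fail if evaluated (the chunk is `eval = FALSE`, which would hide that). A base-pipe-compatible sketch:

```r
library(ggplot2)
library(purrr)

plots <- mtcars |>
  split(mtcars$cyl) |>   # name the data; `.` only exists under %>%
  map(~ ggplot(.x, aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
```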

@ -905,12 +905,12 @@ A number of functions work with **predicate** functions that return either a sin

`keep()` and `discard()` keep elements of the input where the predicate is `TRUE` or `FALSE` respectively:

```{r}
gss_cat %>%
  keep(is.factor) %>%
gss_cat |>
  keep(is.factor) |>
  str()

gss_cat %>%
  discard(is.factor) %>%
gss_cat |>
  discard(is.factor) |>
  str()
```

@ -919,10 +919,10 @@ gss_cat %>%

```{r}
x <- list(1:5, letters, list(10))

x %>%
x |>
  some(is_character)

x %>%
x |>
  every(is_vector)
```

@ -932,20 +932,20 @@ x %>%

x <- sample(10)
x

x %>%
x |>
  detect(~ .x > 5)

x %>%
x |>
  detect_index(~ .x > 5)
```

`head_while()` and `tail_while()` take elements from the start or end of a vector while a predicate is true:

```{r}
x %>%
x |>
  head_while(~ .x > 5)

x %>%
x |>
  tail_while(~ .x > 5)
```

@ -962,7 +962,7 @@ dfs <- list(

  trt = tibble(name = "Mary", treatment = "A")
)

dfs %>% reduce(full_join)
dfs |> reduce(full_join)
```

Or maybe you have a list of vectors, and want to find the intersection:

@ -974,7 +974,7 @@ vs <- list(

  c(1, 2, 3, 4, 8, 9, 10)
)

vs %>% reduce(intersect)
vs |> reduce(intersect)
```

`reduce()` takes a "binary" function (i.e. a function with two primary inputs), and applies it repeatedly to a list until there is only a single element left.
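
In other words, `reduce()` folds the list pairwise from the left, so the joins above expand to nested calls. A tiny sketch of the folding with `+`:

```r
library(purrr)

# dfs |> reduce(full_join) is roughly
#   full_join(full_join(dfs[[1]], dfs[[2]]), dfs[[3]])
reduce(list(1, 2, 3, 4), `+`)  # ((1 + 2) + 3) + 4, i.e. 10
```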

@ -985,7 +985,7 @@ You could use it to implement a cumulative sum:

```{r}
x <- sample(10)
x
x %>% accumulate(`+`)
x |> accumulate(`+`)
```

### Exercises

@ -23,7 +23,7 @@ The elements in a logical vector can have one of three possible values: `TRUE`,

### Boolean operations

If you use multiple conditions in `filter()`, only rows where every condition is `TRUE` are returned.
R uses `&` to denote logical "and", so that means `df %>% filter(cond1, cond2)` is equivalent to `df %>% filter(cond1 & cond2)`.
R uses `&` to denote logical "and", so that means `df |> filter(cond1, cond2)` is equivalent to `df |> filter(cond1 & cond2)`.
For other types of combinations, you'll need to use Boolean operators yourself: `|` is "or" and `!` is "not".
Figure \@ref(fig:bool-ops) shows the complete set of Boolean operations.

@ -46,7 +46,7 @@ knitr::include_graphics("diagrams/transform-logical.png")

The following code finds all flights that departed in November or December:

```{r, eval = FALSE}
flights %>% filter(month == 11 | month == 12)
flights |> filter(month == 11 | month == 12)
```

Note that the order of operations doesn't work like English.

@ -61,15 +61,15 @@ An easy way to solve this problem is to use `%in%`.

So we could use it to rewrite the code above:

```{r, eval = FALSE}
nov_dec <- flights %>% filter(month %in% c(11, 12))
nov_dec <- flights |> filter(month %in% c(11, 12))
```

Sometimes you can simplify complicated subsetting by remembering De Morgan's law: `!(x & y)` is the same as `!x | !y`, and `!(x | y)` is the same as `!x & !y`.
For example, if you wanted to find flights that weren't delayed (on arrival or departure) by more than two hours, you could use either of the following two filters:

```{r, eval = FALSE}
flights %>% filter(!(arr_delay > 120 | dep_delay > 120))
flights %>% filter(arr_delay <= 120, dep_delay <= 120)
flights |> filter(!(arr_delay > 120 | dep_delay > 120))
flights |> filter(arr_delay <= 120, dep_delay <= 120)
```
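
A quick way to convince yourself that the two filters really agree, NAs included, is to compare the results directly (a sketch, assuming nycflights13 as elsewhere in the book):

```r
library(dplyr)
library(nycflights13)

a <- flights |> filter(!(arr_delay > 120 | dep_delay > 120))
b <- flights |> filter(arr_delay <= 120, dep_delay <= 120)
identical(a, b)  # TRUE: both also drop rows where either delay is NA
```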

As well as `&` and `|`, R also has `&&` and `||`.

@ -82,8 +82,8 @@ These are called short-circuiting operators and you'll learn when you should use

If you want to find rows containing missing values, you'll need to convert missingness into a logical vector using `is.na()`.

```{r}
flights %>% filter(is.na(dep_delay) | is.na(arr_delay))
flights %>% filter(is.na(dep_delay) != is.na(arr_delay))
flights |> filter(is.na(dep_delay) | is.na(arr_delay))
flights |> filter(is.na(dep_delay) != is.na(arr_delay))
```

### In mutate()

@ -93,8 +93,8 @@ That makes it much easier to check your work.When checking your work, a particul

This makes it easy to see the variables involved side-by-side.

```{r}
flights %>%
  mutate(is_cancelled = is.na(dep_delay) | is.na(arr_delay), .keep = "used") %>%
flights |>
  mutate(is_cancelled = is.na(dep_delay) | is.na(arr_delay), .keep = "used") |>
  filter(is_cancelled)
```

@ -110,7 +110,7 @@ df <- data.frame(

  date = as.Date("2020-01-01") + 0:6,
  balance = c(100, 50, 25, -25, -50, 30, 120)
)
df %>% mutate(status = if_else(balance < 0, "overdraft", "ok"))
df |> mutate(status = if_else(balance < 0, "overdraft", "ok"))
```

If you start to nest multiple sets of `if_else`s, I'd suggest switching to `case_when()` instead.

@ -118,7 +118,7 @@ If you start to nest multiple sets of `if_else`s, I'd suggest switching to `case

`condition` must evaluate to a logical vector; when it's `TRUE`, output will be used.

```{r}
df %>%
df |>
  mutate(
    status = case_when(
      balance == 0 ~ "no money",

@ -168,17 +168,17 @@ Like all summary functions, they'll return `NA` if there are any missing values

This means that `sum(x)` gives the number of `TRUE`s in `x` and `mean(x)` gives the proportion of `TRUE`s:

```{r}
not_cancelled <- flights %>% filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled <- flights |> filter(!is.na(dep_delay), !is.na(arr_delay))

# How many flights left before 5am? (these usually indicate delayed
# flights from the previous day)
not_cancelled %>%
  group_by(year, month, day) %>%
not_cancelled |>
  group_by(year, month, day) |>
  summarise(n_early = sum(dep_time < 500))

# What proportion of flights are delayed by more than an hour?
not_cancelled %>%
  group_by(year, month, day) %>%
not_cancelled |>
  group_by(year, month, day) |>
  summarise(hour_prop = mean(arr_delay > 60))
```

@ -208,7 +208,7 @@ There's no way to list every possible function that you might use, but here's a

For example, in the flights dataset, you can compute `hour` and `minute` from `dep_time` with:

```{r}
flights %>% mutate(
flights |> mutate(
  hour = dep_time %/% 100,
  minute = dep_time %% 100,
  .keep = "used"

@ -227,10 +227,10 @@ There's no way to list every possible function that you might use, but here's a

    ```{r}
    flights %>%
      group_by(hour = sched_dep_time %/% 100) %>%
      summarise(prop_cancelled = mean(is.na(dep_time)), n = n()) %>%
      filter(hour > 1) %>%
    flights |>
      group_by(hour = sched_dep_time %/% 100) |>
      summarise(prop_cancelled = mean(is.na(dep_time)), n = n()) |>
      filter(hour > 1) |>
      ggplot(aes(hour, prop_cancelled)) +
      geom_point()
    ```

@ -243,8 +243,8 @@ Just using means, counts, and sum can get you a long way, but R provides many ot

    The mean is the sum divided by the length; the median is a value where 50% of `x` is above it, and 50% is below it.

    ```{r}
    not_cancelled %>%
      group_by(month) %>%
    not_cancelled |>
      group_by(month) |>
      summarise(
        med_arr_delay = median(arr_delay),
        med_dep_delay = median(dep_delay)

@ -255,8 +255,8 @@ Just using means, counts, and sum can get you a long way, but R provides many ot

    We haven't talked about this sort of subsetting yet, but you'll learn more about it in Section \@ref(vector-subsetting).

    ```{r}
    not_cancelled %>%
      group_by(year, month, day) %>%
    not_cancelled |>
      group_by(year, month, day) |>
      summarise(
        avg_delay1 = mean(arr_delay),
        avg_delay2 = mean(arr_delay[arr_delay > 0]) # the average positive delay

@ -269,15 +269,15 @@ Just using means, counts, and sum can get you a long way, but R provides many ot

    ```{r}
    # Why is distance to some destinations more variable than to others?
    not_cancelled %>%
      group_by(origin, dest) %>%
      summarise(distance_sd = sd(distance), n = n()) %>%
    not_cancelled |>
      group_by(origin, dest) |>
      summarise(distance_sd = sd(distance), n = n()) |>
      filter(distance_sd > 0)

    # Did it move?
    not_cancelled %>%
      filter(dest == "EGE") %>%
      select(time_hour, dest, distance, origin) %>%
    not_cancelled |>
      filter(dest == "EGE") |>
      select(time_hour, dest, distance, origin) |>
      ggplot(aes(time_hour, distance, colour = origin)) +
      geom_point()
    ```

@ -288,8 +288,8 @@ Just using means, counts, and sum can get you a long way, but R provides many ot

    ```{r}
    # When do the first and last flights leave each day?
    not_cancelled %>%
      group_by(year, month, day) %>%
    not_cancelled |>
      group_by(year, month, day) |>
      summarise(
        first = min(dep_time),
        last = max(dep_time)

@ -67,7 +67,7 @@ is.na(x)

2.  How could you use `arrange()` to sort all missing values to the start?
    (Hint: use `!is.na()`).

3.  Come up with another approach that will give you the same output as `not_cancelled %>% count(dest)` and `not_cancelled %>% count(tailnum, wt = distance)` (without using `count()`).
3.  Come up with another approach that will give you the same output as `not_cancelled |> count(dest)` and `not_cancelled |> count(tailnum, wt = distance)` (without using `count()`).

4.  Look at the number of cancelled flights per day.
    Is there a pattern?

@ -103,15 +103,15 @@ The way that a dataset is represented can make implicit values explicit.

For example, we can make the implicit missing value explicit by putting years in the columns:

```{r}
stocks %>%
stocks |>
  pivot_wider(names_from = year, values_from = return)
```

Because these explicit missing values may not be important in other representations of the data, you can set `values_drop_na = TRUE` in `pivot_longer()` to turn explicit missing values implicit:

```{r}
stocks %>%
  pivot_wider(names_from = year, values_from = return) %>%
stocks |>
  pivot_wider(names_from = year, values_from = return) |>
  pivot_longer(
    cols = c(`2015`, `2016`),
    names_to = "year",

@ -123,7 +123,7 @@ stocks %>%

Another important tool for making missing values explicit in tidy data is `complete()`:

```{r}
stocks %>%
stocks |>
  complete(year, qtr)
```

@ -147,7 +147,7 @@ You can fill in these missing values with `fill()`.

It takes a set of columns where you want missing values to be replaced by the most recent non-missing value (sometimes called last observation carried forward).

```{r}
treatment %>%
treatment |>
  fill(person)
```

@ -14,4 +14,3 @@ Welcome to the second edition of "R for Data Science".

## Acknowledgements {.unnumbered}

*TO DO: Add acknowledgements.*

@ -164,10 +164,10 @@ Defining a "word" in a regular expression is a little tricky, so here I use a si

```{r}
noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
has_noun <- sentences |>
  str_subset(noun) |>
  head(10)
has_noun %>%
has_noun |>
  str_extract(noun)
```

@ -175,7 +175,7 @@ has_noun %>%

Instead of a character vector, it returns a matrix, with one column for the complete match followed by one column for each group:

```{r}
has_noun %>%
has_noun |>
  str_match(noun)
```

@ -187,8 +187,8 @@ Use `str_split()` to split a string up into pieces.

For example, we could split sentences into words:

```{r}
sentences %>%
  head(5) %>%
sentences |>
  head(5) |>
  str_split(" ")
```

@ -196,16 +196,14 @@ Because each component might contain a different number of pieces, this returns

If you're working with a length-1 vector, the easiest thing is to just extract the first element of the list:

```{r}
"a|b|c|d" %>%
  str_split("\\|") %>%
  .[[1]]
str_split("a|b|c|d", "\\|")[[1]]
```
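
The old version used magrittr's `.[[1]]` pronoun step, which has no `|>` equivalent, hence the rewrite to a plain call. To keep a pipeline shape instead, an anonymous function works (a sketch, R >= 4.1):

```r
library(stringr)

"a|b|c|d" |> str_split("\\|") |> (\(x) x[[1]])()
```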
|
||||
|
||||
Otherwise, like the other stringr functions that return a list, you can use `simplify = TRUE` to return a matrix:
|
||||
|
||||
```{r}
|
||||
sentences %>%
|
||||
head(5) %>%
|
||||
sentences |>
|
||||
head(5) |>
|
||||
str_split(" ", simplify = TRUE)
|
||||
```
|
||||
|
||||
|
@ -213,7 +211,7 @@ You can also request a maximum number of pieces:
|
|||
|
||||
```{r}
|
||||
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
|
||||
fields %>% str_split(": ", n = 2, simplify = TRUE)
|
||||
fields |> str_split(": ", n = 2, simplify = TRUE)
|
||||
```
|
||||
|
||||
Instead of splitting up strings by patterns, you can also split up by character, line, sentence and word `boundary()`s:
|
||||
|
@ -263,10 +261,10 @@ The main difference is the prefix: `str_` vs. `stri_`.
|
|||
Experiment with the various options for the following two toy datasets.
|
||||
|
||||
```{r, eval = FALSE}
|
||||
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
|
||||
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) |>
|
||||
separate(x, c("one", "two", "three"))
|
||||
|
||||
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
|
||||
tibble(x = c("a,b,c", "d,e", "f,g,i")) |>
|
||||
separate(x, c("one", "two", "three"))
|
||||
```
|
||||
|
||||
|
@ -288,7 +286,7 @@ The main difference is the prefix: `str_` vs. `stri_`.
|
|||
1 , 22
|
||||
)
|
||||
|
||||
events %>%
|
||||
events |>
|
||||
unite("date", month:day, sep = "-", remove = FALSE)
|
||||
```
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ But this doesn't mean you should rewrite every function: you need to balance wha
|
|||
|
||||
In the following four chapters, you'll learn skills that will allow you to both tackle new programs and to solve existing problems with greater clarity and ease:
|
||||
|
||||
1. In Chapter \@ref(pipes), you will dive deep into the **pipe**, `%>%`, and learn more about how it works, what the alternatives are, and when not to use it.
|
||||
1. In Chapter \@ref(pipes), you will dive deep into the **pipe**, `|>`, and learn more about how it works, what the alternatives are, and when not to use it.
|
||||
|
||||
2. Copy-and-paste is a powerful tool, but you should avoid doing it more than twice.
|
||||
Repeating yourself in code is dangerous because it can easily lead to errors and inconsistencies.
|
||||
|
|
14
regexps.Rmd
14
regexps.Rmd
|
@ -469,8 +469,8 @@ You can also use back references when replacing with `str_replace()` and `str_re
|
|||
The following code will switch the order of the second and third words:
|
||||
|
||||
```{r}
|
||||
sentences %>%
|
||||
str_replace("(\\w+) (\\w+) (\\w+)", "\\1 \\3 \\2") %>%
|
||||
sentences |>
|
||||
str_replace("(\\w+) (\\w+) (\\w+)", "\\1 \\3 \\2") |>
|
||||
head(5)
|
||||
```
|
||||
|
||||
|
@ -478,9 +478,9 @@ You'll sometimes see people using `str_replace()` to extract a single match:
|
|||
|
||||
```{r}
|
||||
pattern <- "^.*the ([^ .,]+).*$"
|
||||
sentences %>%
|
||||
str_subset(pattern) %>%
|
||||
str_replace(pattern, "\\1") %>%
|
||||
sentences |>
|
||||
str_subset(pattern) |>
|
||||
str_replace(pattern, "\\1") |>
|
||||
head(10)
|
||||
```
|
||||
|
||||
|
@ -492,8 +492,8 @@ stringr provides a lower-level function for extract matches called `str_match()`
|
|||
But it returns a matrix, so isn't as easy to work with:
|
||||
|
||||
```{r}
|
||||
sentences %>%
|
||||
str_match("the (\\w+) (\\w+)") %>%
|
||||
sentences |>
|
||||
str_match("the (\\w+) (\\w+)") |>
|
||||
head()
|
||||
```
|
||||
|
||||
|
|
|
@ -121,12 +121,12 @@ Once you've identified the primary keys in your data frames, it's good practice
|
|||
One way to do that is to `count()` the primary keys and look for entries where `n` is greater than one:
|
||||
|
||||
```{r}
|
||||
planes %>%
|
||||
count(tailnum) %>%
|
||||
planes |>
|
||||
count(tailnum) |>
|
||||
filter(n > 1)
|
||||
|
||||
weather %>%
|
||||
count(year, month, day, hour, origin) %>%
|
||||
weather |>
|
||||
count(year, month, day, hour, origin) |>
|
||||
filter(n > 1)
|
||||
```
|
||||
|
||||
|
@ -135,12 +135,12 @@ For example, what's the primary key in the `flights` data frame?
|
|||
You might think it would be the date plus the flight or tail number, but neither of those are unique:
|
||||
|
||||
```{r}
|
||||
flights %>%
|
||||
count(year, month, day, flight) %>%
|
||||
flights |>
|
||||
count(year, month, day, flight) |>
|
||||
filter(n > 1)
|
||||
|
||||
flights %>%
|
||||
count(year, month, day, tailnum) %>%
|
||||
flights |>
|
||||
count(year, month, day, tailnum) |>
|
||||
filter(n > 1)
|
||||
```
|
||||
|
||||
|
@ -192,7 +192,7 @@ Like `mutate()`, the join functions add variables to the right, so if you have a
|
|||
For these examples, we'll make it easier to see what's going on in the examples by creating a narrower dataset:
|
||||
|
||||
```{r}
|
||||
flights2 <- flights %>%
|
||||
flights2 <- flights |>
|
||||
select(year:day, hour, origin, dest, tailnum, carrier)
|
||||
flights2
|
||||
```
|
||||
|
@ -203,8 +203,8 @@ Imagine you want to add the full airline name to the `flights2` data.
|
|||
You can combine the `airlines` and `flights2` data frames with `left_join()`:
|
||||
|
||||
```{r}
|
||||
flights2 %>%
|
||||
select(-origin, -dest) %>%
|
||||
flights2 |>
|
||||
select(-origin, -dest) |>
|
||||
left_join(airlines, by = "carrier")
|
||||
```
|
||||
|
||||
|
@ -213,8 +213,8 @@ This is why I call this type of join a mutating join.
|
|||
In this case, you could have got to the same place using `mutate()` and R's base subsetting:
|
||||
|
||||
```{r}
|
||||
flights2 %>%
|
||||
select(-origin, -dest) %>%
|
||||
flights2 |>
|
||||
select(-origin, -dest) |>
|
||||
mutate(name = airlines$name[match(carrier, airlines$carrier)])
|
||||
```
|
||||
|
||||
|
@ -284,7 +284,7 @@ The output of an inner join is a new data frame that contains the key, the x val
|
|||
We use `by` to tell dplyr which variable is the key:
|
||||
|
||||
```{r}
|
||||
x %>%
|
||||
x |>
|
||||
inner_join(y, by = "key")
|
||||
```
|
||||
|
||||
|
@ -391,7 +391,7 @@ You can use other values for `by` to connect the data frames in other ways:
|
|||
For example, the flights and weather data frames match on their common variables: `year`, `month`, `day`, `hour` and `origin`.
|
||||
|
||||
```{r}
|
||||
flights2 %>%
|
||||
flights2 |>
|
||||
left_join(weather)
|
||||
```
|
||||
|
||||
|
@ -400,7 +400,7 @@ You can use other values for `by` to connect the data frames in other ways:
|
|||
For example, `flights` and `planes` have `year` variables, but they mean different things so we only want to join by `tailnum`.
|
||||
|
||||
```{r}
|
||||
flights2 %>%
|
||||
flights2 |>
|
||||
left_join(planes, by = "tailnum")
|
||||
```
|
||||
|
||||
|
@ -414,10 +414,10 @@ You can use other values for `by` to connect the data frames in other ways:
|
|||
Each flight has an origin and destination `airport`, so we need to specify which one we want to join to:
|
||||
|
||||
```{r}
|
||||
flights2 %>%
|
||||
flights2 |>
|
||||
left_join(airports, c("dest" = "faa"))
|
||||
|
||||
flights2 %>%
|
||||
flights2 |>
|
||||
left_join(airports, c("origin" = "faa"))
|
||||
```
|
||||
|
||||
|
@ -427,8 +427,8 @@ You can use other values for `by` to connect the data frames in other ways:
|
|||
Here's an easy way to draw a map of the United States:
|
||||
|
||||
```{r, eval = FALSE}
|
||||
airports %>%
|
||||
semi_join(flights, c("faa" = "dest")) %>%
|
||||
airports |>
|
||||
semi_join(flights, c("faa" = "dest")) |>
|
||||
ggplot(aes(lon, lat)) +
|
||||
borders("state") +
|
||||
geom_point() +
|
||||
|
@ -450,11 +450,11 @@ You can use other values for `by` to connect the data frames in other ways:
|
|||
|
||||
```{r, eval = FALSE, include = FALSE}
|
||||
worst <- filter(flights, !is.na(dep_time), month == 6, day == 13)
|
||||
worst %>%
|
||||
group_by(dest) %>%
|
||||
summarise(delay = mean(arr_delay), n = n()) %>%
|
||||
filter(n > 5) %>%
|
||||
inner_join(airports, by = c("dest" = "faa")) %>%
|
||||
worst |>
|
||||
group_by(dest) |>
|
||||
summarise(delay = mean(arr_delay), n = n()) |>
|
||||
filter(n > 5) |>
|
||||
inner_join(airports, by = c("dest" = "faa")) |>
|
||||
ggplot(aes(lon, lat)) +
|
||||
borders("state") +
|
||||
geom_point(aes(size = n, colour = delay)) +
|
||||
|
@ -501,8 +501,8 @@ Semi-joins are useful for matching filtered summary data frames back to the orig
|
|||
For example, imagine you've found the top ten most popular destinations:
|
||||
|
||||
```{r}
|
||||
top_dest <- flights %>%
|
||||
count(dest, sort = TRUE) %>%
|
||||
top_dest <- flights |>
|
||||
count(dest, sort = TRUE) |>
|
||||
head(10)
|
||||
top_dest
|
||||
```
|
||||
|
@ -511,7 +511,7 @@ Now you want to find each flight that went to one of those destinations.
|
|||
You could construct a filter yourself:
|
||||
|
||||
```{r}
|
||||
flights %>%
|
||||
flights |>
|
||||
filter(dest %in% top_dest$dest)
|
||||
```
|
||||
|
||||
|
@ -522,7 +522,7 @@ How would you construct the filter statement that used `year`, `month`, and `day
|
|||
Instead you can use a semi-join, which connects the two data frames like a mutating join, but instead of adding new columns, only keeps the rows in `x` that have a match in `y`:
|
||||
|
||||
```{r}
|
||||
flights %>%
|
||||
flights |>
|
||||
semi_join(top_dest)
|
||||
```
|
||||
|
||||
|
@ -550,8 +550,8 @@ Anti-joins are useful for diagnosing join mismatches.
|
|||
For example, when connecting `flights` and `planes`, you might be interested to know that there are many `flights` that don't have a match in `planes`:
|
||||
|
||||
```{r}
|
||||
flights %>%
|
||||
anti_join(planes, by = "tailnum") %>%
|
||||
flights |>
|
||||
anti_join(planes, by = "tailnum") |>
|
||||
count(tailnum, sort = TRUE)
|
||||
```
|
||||
|
||||
|
@ -587,7 +587,7 @@ Your own data is unlikely to be so nice, so there are a few things that you shou
|
|||
For example, the altitude and longitude uniquely identify each airport, but they are not good identifiers!
|
||||
|
||||
```{r}
|
||||
airports %>% count(alt, lon) %>% filter(n > 1)
|
||||
airports |> count(alt, lon) |> filter(n > 1)
|
||||
```
|
||||
|
||||
2. Check that none of the variables in the primary key are missing.
|
||||
|
|
|
@ -184,9 +184,9 @@ You obviously can't do that in a book, so rmarkdown automatically inserts a stat
|
|||
|
||||
```{r}
|
||||
library(leaflet)
|
||||
leaflet() %>%
|
||||
setView(174.764, -36.877, zoom = 16) %>%
|
||||
addTiles() %>%
|
||||
leaflet() |>
|
||||
setView(174.764, -36.877, zoom = 16) |>
|
||||
addTiles() |>
|
||||
addMarkers(174.764, -36.877, popup = "Maungawhau")
|
||||
```
|
||||
|
||||
|
|
|
@ -255,8 +255,8 @@ For example, here the `processed_data` chunk depends on the `raw_data` chunk:
|
|||
`r chunk`
|
||||
|
||||
`r chunk`{r processed_data, cache = TRUE}
|
||||
processed_data <- rawdata %>%
|
||||
filter(!is.na(import_var)) %>%
|
||||
processed_data <- rawdata |>
|
||||
filter(!is.na(import_var)) |>
|
||||
mutate(new_variable = complicated_transformation(x, y, z))
|
||||
`r chunk`
|
||||
|
||||
|
@ -264,8 +264,8 @@ Caching the `processed_data` chunk means that it will get re-run if the dplyr pi
|
|||
You can avoid that problem with the `dependson` chunk option:
|
||||
|
||||
`r chunk`{r processed_data, cache = TRUE, dependson = "raw_data"}
|
||||
processed_data <- rawdata %>%
|
||||
filter(!is.na(import_var)) %>%
|
||||
processed_data <- rawdata |>
|
||||
filter(!is.na(import_var)) |>
|
||||
mutate(new_variable = complicated_transformation(x, y, z))
|
||||
`r chunk`
|
||||
|
||||
|
@ -426,8 +426,8 @@ reports
|
|||
Then we match the column names to the argument names of `render()`, and use purrr's **parallel** walk to call `render()` once for each row:
|
||||
|
||||
```{r, eval = FALSE}
|
||||
reports %>%
|
||||
select(output_file = filename, params) %>%
|
||||
reports |>
|
||||
select(output_file = filename, params) |>
|
||||
purrr::pwalk(rmarkdown::render, input = "fuel-economy.Rmd")
|
||||
```
|
||||
|
||||
|
|
|
@ -34,9 +34,9 @@ ggplot(diamonds, aes(color)) + geom_bar()
|
|||
### The largest diamonds
|
||||
|
||||
```{r}
|
||||
diamonds %>%
|
||||
arrange(desc(carat)) %>%
|
||||
head(100) %>%
|
||||
select(carat, cut, color, price) %>%
|
||||
diamonds |>
|
||||
arrange(desc(carat)) |>
|
||||
head(100) |>
|
||||
select(carat, cut, color, price) |>
|
||||
DT::datatable()
|
||||
```
|
||||
|
|
|
@ -8,7 +8,7 @@ output: html_document
|
|||
library(ggplot2)
|
||||
library(dplyr)
|
||||
|
||||
smaller <- diamonds %>%
|
||||
smaller <- diamonds |>
|
||||
filter(carat <= 2.5)
|
||||
```
|
||||
|
||||
|
@ -18,7 +18,7 @@ We have data about `r nrow(diamonds)` diamonds. Only
|
|||
below:
|
||||
|
||||
```{r, echo = FALSE}
|
||||
smaller %>%
|
||||
smaller |>
|
||||
ggplot(aes(carat)) +
|
||||
geom_freqpoly(binwidth = 0.01)
|
||||
```
|
||||
|
|
|
@ -8,7 +8,7 @@ params:
|
|||
library(ggplot2)
|
||||
library(dplyr)
|
||||
|
||||
class <- mpg %>% filter(class == params$my_class)
|
||||
class <- mpg |> filter(class == params$my_class)
|
||||
```
|
||||
|
||||
# Fuel economy for `r params$my_class`s
|
||||
|
|
32
strings.Rmd
32
strings.Rmd
|
@ -158,13 +158,13 @@ str_c("Hello ", c("John", "Susan"))
|
|||
|
||||
```{r}
|
||||
df <- tibble(name = c("Timothy", "Dewey", "Mable", NA))
|
||||
df %>% mutate(greeting = str_c("Hi ", name, "!"))
|
||||
df |> mutate(greeting = str_c("Hi ", name, "!"))
|
||||
```
|
||||
|
||||
If you want missing values to display in some other way, use `coalesce()` either inside or outside of `str_c()`:
|
||||
|
||||
```{r}
|
||||
df %>% mutate(
|
||||
df |> mutate(
|
||||
greeting1 = str_c("Hi ", coalesce(name, "you"), "!"),
|
||||
greeting2 = coalesce(str_c("Hi ", name, "!"), "Hi!")
|
||||
)
|
||||
|
@ -179,7 +179,7 @@ You give it a single string containing `{}` and anything inside `{}` will be eva
|
|||
[^strings-4]: If you're not using stringr, you can also access it directly with `glue::glue().`
|
||||
|
||||
```{r}
|
||||
df %>% mutate(greeting = str_glue("Hi {name}!"))
|
||||
df |> mutate(greeting = str_glue("Hi {name}!"))
|
||||
```
|
||||
|
||||
You can use any valid R code inside of `{}`, but it's a good idea to pull complex calculations out into their own variables so you can more easily check your work.
|
||||
|
@ -192,7 +192,7 @@ You might expect that you'll need to escape it, and you'd be right.
|
|||
But glue uses a slightly different escaping technique; instead of prefixing with special character like `\`, you just double up the `{` and `}`:
|
||||
|
||||
```{r}
|
||||
df %>% mutate(greeting = str_glue("{{Hi {name}!}}"))
|
||||
df |> mutate(greeting = str_glue("{{Hi {name}!}}"))
|
||||
```
|
||||
|
||||
### `str_flatten()`
|
||||
|
@ -221,8 +221,8 @@ df <- tribble(
  "Terence", "papaya",
  "Terence", "mandarin"
)
df %>%
  group_by(name) %>%
df |>
  group_by(name) |>
  summarise(fruits = str_flatten(fruit, ", "))
```

@ -271,7 +271,7 @@ str_detect(x, "ear") # does the word contain "ear"?

For example, this code finds all names that contain a lower-case "x":

```{r}
babynames %>% filter(str_detect(name, "x"))
babynames |> filter(str_detect(name, "x"))
```

We can also use `str_detect()` with `summarize()` by remembering that when you use a logical vector in a numeric context, `FALSE` becomes 0 and `TRUE` becomes 1.
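
A tiny worked example of that coercion (vector invented):

```{r, eval = FALSE}
x <- c("apple", "banana", "pear")
sum(str_detect(x, "e"))   # 2: two of the three elements contain "e"
mean(str_detect(x, "e"))  # 0.667: the proportion that match
```
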
@ -279,9 +279,9 @@ That means `sum(str_detect(x, pattern))` will tell you the number of observation

For example, the following snippet computes and visualizes the proportion of baby names that contain "x", broken down by year:

```{r, fig.alt = "A timeseries showing the proportion of baby names that contain the letter x. The proportion declines gradually from 8 per 1000 in 1880 to 4 per 1000 in 1980, then increases rapidly to 16 per 1000 in 2019."}
babynames %>%
  group_by(year) %>%
  summarise(prop_x = mean(str_detect(name, "x"))) %>%
babynames |>
  group_by(year) |>
  summarise(prop_x = mean(str_detect(name, "x"))) |>
  ggplot(aes(year, prop_x)) +
  geom_line()
```

@ -377,8 +377,8 @@ It's natural to use `str_count()` with `mutate()`.

The following example uses `str_count()` with character classes to count the number of vowels and consonants in each name.

```{r}
babynames %>%
  count(name) %>%
babynames |>
  count(name) |>
  mutate(
    vowels = str_count(name, "[aeiou]"),
    consonants = str_count(name, "[^aeiou]")

@ -551,11 +551,11 @@ You could use this with `count()` to find the distribution of lengths of US baby

[^strings-10]: Looking at these entries, I'd say the babynames data removes spaces or hyphens from names and truncates after 15 letters.

```{r}
babynames %>%
babynames |>
  count(length = str_length(name), wt = n)

babynames %>%
  filter(str_length(name) == 15) %>%
babynames |>
  filter(str_length(name) == 15) |>
  count(name, wt = n, sort = TRUE)
```

@ -584,7 +584,7 @@ str_sub("a", 1, 5)

We could use `str_sub()` with `mutate()` to find the first and last letter of each name:

```{r}
babynames %>%
babynames |>
  mutate(
    first = str_sub(name, 1, 1),
    last = str_sub(name, -1, -1)

15 tibble.Rmd

@ -107,7 +107,7 @@ First, you can explicitly `print()` the data frame and control the number of row

`width = Inf` will display all columns:

```{r, eval = FALSE}
nycflights13::flights %>%
nycflights13::flights |>
  print(n = 10, width = Inf)
```

@ -124,7 +124,7 @@ A final option is to use RStudio's built-in data viewer to get a scrollable view

This is also often useful at the end of a long chain of manipulations.

```{r, eval = FALSE}
nycflights13::flights %>%
nycflights13::flights |>
  View()
```

@ -141,8 +141,8 @@ tb <- tibble(
  y1 = 6:10
)

tb %>% pull(x1)
tb %>% pull(x1, name = id)
tb |> pull(x1)
tb |> pull(x1, name = id)
```

You can also use tools like `$` and `[[` to extract a variable.
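
The `$` form, for example, extracts a single column by name (a one-line sketch using the `tb` defined above):

```{r, eval = FALSE}
tb$x1  # returns the x1 column as a bare vector
```
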
@ -157,13 +157,6 @@ tb[["x1"]]
tb[[1]]
```

To use these in a pipe, you'll need to use the special placeholder `.`:

```{r}
tb %>% .$x1
tb %>% .[["x1"]]
```

Compared to a `data.frame`, tibbles are more strict: they never do partial matching, and they will generate a warning if the column you are trying to access does not exist.
In the following chunk `df` is a `data.frame` and `tb` is a `tibble`.
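
A sketch of the comparison that chunk likely makes (object definitions assumed):

```{r, eval = FALSE}
df <- data.frame(abc = 1)
tb <- tibble(abc = 1)

df$a  # data.frame partial-matches and silently returns df$abc
tb$a  # tibble returns NULL with a warning about an unknown column
```
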
@ -14,7 +14,7 @@ status("drafting")

library(tidyverse)
library(nycflights13)

not_cancelled <- flights %>%
not_cancelled <- flights |>
  filter(!is.na(dep_delay), !is.na(arr_delay))
```

@ -26,23 +26,23 @@ not_cancelled <- flights %>%

```{r}
# Which destinations have the most carriers?
not_cancelled %>%
  group_by(dest) %>%
  summarise(carriers = n_distinct(carrier)) %>%
not_cancelled |>
  group_by(dest) |>
  summarise(carriers = n_distinct(carrier)) |>
  arrange(desc(carriers))
```

Counts are so useful that dplyr provides a simple helper if all you want is a count:

```{r}
not_cancelled %>%
not_cancelled |>
  count(dest)
```

Just like with `group_by()`, you can also provide multiple variables to `count()`.

```{r}
not_cancelled %>%
not_cancelled |>
  count(carrier, dest)
```

@ -50,7 +50,7 @@ not_cancelled <- flights %>%

For example, you could use this to "count" (sum) the total number of miles a plane flew:

```{r}
not_cancelled %>%
not_cancelled |>
  count(tailnum, wt = distance)
```

@ -91,8 +91,8 @@ not_cancelled <- flights %>%

For example, we can find the first and last departure for each day:

```{r}
not_cancelled %>%
  group_by(year, month, day) %>%
not_cancelled |>
  group_by(year, month, day) |>
  summarise(
    first_dep = first(dep_time),
    last_dep = last(dep_time)

@ -103,9 +103,9 @@ not_cancelled <- flights %>%

Filtering gives you all variables, with each observation in a separate row:

```{r}
not_cancelled %>%
  group_by(year, month, day) %>%
  mutate(r = min_rank(desc(dep_time))) %>%
not_cancelled |>
  group_by(year, month, day) |>
  mutate(r = min_rank(desc(dep_time))) |>
  filter(r %in% range(r))
```

@ -136,9 +136,9 @@ df <- data.frame(
  balance = c(100, 50, 25, -25, -50, 30, 120)
)
# all rows after first overdraft
df %>% filter(cumany(balance < 0))
df |> filter(cumany(balance < 0))
# all rows until first overdraft
df %>% filter(cumall(!(balance < 0)))
df |> filter(cumall(!(balance < 0)))
```

###

@ -157,16 +157,16 @@ flights_sml <- select(flights,

- Find the worst members of each group:

```{r}
flights_sml %>%
  group_by(year, month, day) %>%
flights_sml |>
  group_by(year, month, day) |>
  filter(rank(desc(arr_delay)) < 10)
```

- Find all groups bigger than a threshold:

```{r}
popular_dests <- flights %>%
  group_by(dest) %>%
popular_dests <- flights |>
  group_by(dest) |>
  filter(n() > 365)
popular_dests
```

@ -174,9 +174,9 @@ flights_sml <- select(flights,

- Standardise to compute per group metrics:

```{r}
popular_dests %>%
  filter(arr_delay > 0) %>%
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>%
popular_dests |>
  filter(arr_delay > 0) |>
  mutate(prop_delay = arr_delay / sum(arr_delay)) |>
  select(year:day, dest, arr_delay, prop_delay)
```

@ -28,18 +28,18 @@ The key to using the script editor effectively is to memorise one of the most im

This executes the current R expression in the console.
For example, take the code below.
If your cursor is at █, pressing Cmd/Ctrl + Enter will run the complete command that generates `not_cancelled`.
It will also move the cursor to the next statement (beginning with `not_cancelled %>%`).
It will also move the cursor to the next statement (beginning with `not_cancelled |>`).
That makes it easy to run your complete script by repeatedly pressing Cmd/Ctrl + Enter.

```{r, eval = FALSE}
library(dplyr)
library(nycflights13)

not_cancelled <- flights %>%
not_cancelled <- flights |>
  filter(!is.na(dep_delay)█, !is.na(arr_delay))

not_cancelled %>%
  group_by(year, month, day) %>%
not_cancelled |>
  group_by(year, month, day) |>
  summarise(mean = mean(dep_delay))
```
