Convert from %>% to |>

This commit is contained in:
Hadley Wickham
2022-02-23 13:15:52 -06:00
parent 1b0c50894a
commit da0fbd50d5
36 changed files with 542 additions and 552 deletions

View File

@@ -88,7 +88,7 @@ One easy place to start is to find the vertical distance between each point and
(Note that I've shifted the x values slightly so you can see the individual distances.)
```{r, echo = FALSE}
dist1 <- sim1 %>%
dist1 <- sim1 |>
mutate(
dodge = rep(c(-1, 0, 1) / 20, 10),
x1 = x + dodge,
@@ -137,7 +137,7 @@ sim1_dist <- function(a1, a2) {
measure_distance(c(a1, a2), sim1)
}
models <- models %>%
models <- models |>
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
```
@@ -171,10 +171,10 @@ I picked the parameters of the grid roughly by looking at where the best models
grid <- expand.grid(
a1 = seq(-5, 20, length = 25),
a2 = seq(1, 3, length = 25)
) %>%
) |>
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
grid %>%
grid |>
ggplot(aes(a1, a2)) +
geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
@@ -282,7 +282,7 @@ The easiest way to do that is to use `modelr::data_grid()`.
Its first argument is a data frame, and for each subsequent argument it finds the unique values and then generates all combinations:
```{r}
grid <- sim1 %>%
grid <- sim1 |>
data_grid(x)
grid
```
@@ -294,7 +294,7 @@ We'll use `modelr::add_predictions()` which takes a data frame and a model.
It adds the predictions from the model to a new column in the data frame:
```{r}
grid <- grid %>%
grid <- grid |>
add_predictions(sim1_mod)
grid
```
@@ -324,7 +324,7 @@ Note, however, that we use the original dataset, not a manufactured grid.
This is because to compute residuals we need actual y values.
```{r}
sim1 <- sim1 %>%
sim1 <- sim1 |>
add_residuals(sim1_mod)
sim1
```
@@ -444,8 +444,8 @@ We can fit a model to it, and generate predictions:
```{r}
mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
data_grid(x) %>%
grid <- sim2 |>
data_grid(x) |>
add_predictions(mod2)
grid
```
@@ -463,7 +463,7 @@ You can't make predictions about levels that you didn't observe.
Sometimes you'll do this by accident so it's good to recognise this error message:
```{r, error = TRUE}
tibble(x = "e") %>%
tibble(x = "e") |>
add_predictions(mod2)
```
@@ -501,8 +501,8 @@ To visualise these models we need two new tricks:
Together this gives us:
```{r}
grid <- sim3 %>%
data_grid(x1, x2) %>%
grid <- sim3 |>
data_grid(x1, x2) |>
gather_predictions(mod1, mod2)
grid
```
@@ -524,7 +524,7 @@ We can take a look at the residuals.
Here I've facetted by both model and `x2` because it makes it easier to see the pattern within each group.
```{r}
sim3 <- sim3 %>%
sim3 <- sim3 |>
gather_residuals(mod1, mod2)
ggplot(sim3, aes(x1, resid, colour = x2)) +
@@ -547,11 +547,11 @@ Initially things proceed almost identically to the previous example:
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
grid <- sim4 %>%
grid <- sim4 |>
data_grid(
x1 = seq_range(x1, 5),
x2 = seq_range(x2, 5)
) %>%
) |>
gather_predictions(mod1, mod2)
grid
```
@@ -687,8 +687,8 @@ mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)
grid <- sim5 %>%
data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>%
grid <- sim5 |>
data_grid(x = seq_range(x, n = 50, expand = 0.1)) |>
gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")
ggplot(sim5, aes(x, y)) +

View File

@@ -71,8 +71,8 @@ But first, let's make a couple of tweaks to the diamonds dataset to make it easie
2. Log-transform the carat and price variables.
```{r}
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
diamonds2 <- diamonds |>
filter(carat <= 2.5) |>
mutate(lprice = log2(price), lcarat = log2(carat))
```
@@ -95,10 +95,10 @@ Then we look at what the model tells us about the data.
Note that I back transform the predictions, undoing the log transformation, so I can overlay the predictions on the raw data:
```{r}
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
grid <- diamonds2 |>
data_grid(carat = seq_range(carat, 20)) |>
mutate(lcarat = log2(carat)) |>
add_predictions(mod_diamond, "lprice") |>
mutate(price = 2 ^ lprice)
ggplot(diamonds2, aes(carat, price)) +
@@ -113,7 +113,7 @@ This is probably because no diamond in this dataset costs more than \$19,000.
Now we can look at the residuals, which verifies that we've successfully removed the strong linear pattern:
```{r}
diamonds2 <- diamonds2 %>%
diamonds2 <- diamonds2 |>
add_residuals(mod_diamond, "lresid")
ggplot(diamonds2, aes(lcarat, lresid)) +
@@ -147,8 +147,8 @@ Fortunately, they're currently all independent which means that we can plot them
To make the process a little easier, we're going to use the `.model` argument to `data_grid`:
```{r}
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
grid <- diamonds2 |>
data_grid(cut, .model = mod_diamond2) |>
add_predictions(mod_diamond2)
grid
@@ -160,7 +160,7 @@ If the model needs variables that you haven't explicitly supplied, `data_grid()`
For continuous variables, it uses the median, and for categorical variables it uses the most common value (or values, if there's a tie).
```{r}
diamonds2 <- diamonds2 %>%
diamonds2 <- diamonds2 |>
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
@@ -171,11 +171,11 @@ This plot indicates that there are some diamonds with quite large residuals - re
It's often useful to look at unusual values individually:
```{r}
diamonds2 %>%
filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2) %>%
mutate(pred = round(2 ^ pred)) %>%
select(price, pred, carat:table, x:z) %>%
diamonds2 |>
filter(abs(lresid2) > 1) |>
add_predictions(mod_diamond2) |>
mutate(pred = round(2 ^ pred)) |>
select(price, pred, carat:table, x:z) |>
arrange(price)
```
@@ -203,9 +203,9 @@ This is a really small dataset --- only 365 rows and 2 columns --- and we're not
Let's get started by counting the number of flights per day and visualising it with ggplot2.
```{r}
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
daily <- flights |>
mutate(date = make_date(year, month, day)) |>
group_by(date) |>
summarise(n = n())
daily
@@ -219,7 +219,7 @@ Understanding the long-term trend is challenging because there's a very strong d
Let's start by looking at the distribution of flight numbers by day-of-week:
```{r}
daily <- daily %>%
daily <- daily |>
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday, n)) +
geom_boxplot()
@@ -234,8 +234,8 @@ First, we fit the model, and display its predictions overlaid on the original da
```{r}
mod <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
grid <- daily |>
data_grid(wday) |>
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
@@ -246,9 +246,9 @@ ggplot(daily, aes(wday, n)) +
Next we compute and visualise the residuals:
```{r}
daily <- daily %>%
daily <- daily |>
add_residuals(mod)
daily %>%
daily |>
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
@@ -272,7 +272,7 @@ This plot is useful because now that we've removed much of the large day-of-week
2. There are some days with far fewer flights than expected:
```{r}
daily %>%
daily |>
filter(resid < -100)
```
@@ -284,7 +284,7 @@ This plot is useful because now that we've removed much of the large day-of-week
We can highlight that trend with `geom_smooth()`:
```{r}
daily %>%
daily |>
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line(colour = "grey50") +
@@ -301,8 +301,8 @@ Let's first tackle our failure to accurately predict the number of flights on Sa
A good place to start is to go back to the raw numbers, focussing on Saturdays:
```{r}
daily %>%
filter(wday == "Sat") %>%
daily |>
filter(wday == "Sat") |>
ggplot(aes(date, n)) +
geom_point() +
geom_line() +
@@ -329,11 +329,11 @@ term <- function(date) {
)
}
daily <- daily %>%
daily <- daily |>
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
daily |>
filter(wday == "Sat") |>
ggplot(aes(date, n, colour = term)) +
geom_point(alpha = 1/3) +
geom_line() +
@@ -345,7 +345,7 @@ daily %>%
It's useful to see how this new variable affects the other days of the week:
```{r}
daily %>%
daily |>
ggplot(aes(wday, n, colour = term)) +
geom_boxplot()
```
@@ -357,8 +357,8 @@ This improves our model, but not as much as we might hope:
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
daily |>
gather_residuals(without_term = mod1, with_term = mod2) |>
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75)
```
@@ -366,8 +366,8 @@ daily %>%
We can see the problem by overlaying the predictions from the model on to the raw data:
```{r}
grid <- daily %>%
data_grid(wday, term) %>%
grid <- daily |>
data_grid(wday, term) |>
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
@@ -383,8 +383,8 @@ This greatly reduces the impact of the outliers on our estimates, and gives a mo
```{r, warn = FALSE}
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
daily |>
add_residuals(mod3, "resid") |>
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, colour = "white") +
geom_line()
@@ -399,7 +399,7 @@ For example, we could write:
```{r}
compute_vars <- function(data) {
data %>%
data |>
mutate(
term = term(date),
wday = wday(date, label = TRUE)
@@ -430,9 +430,9 @@ A simple linear trend isn't adequate, so we could try using a natural spline to
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
daily |>
data_grid(wday, date = seq_range(date, n = 13)) |>
add_predictions(mod) |>
ggplot(aes(date, pred, colour = wday)) +
geom_line() +
geom_point()
@@ -451,7 +451,7 @@ It's a good sign when you get the same signal from different approaches.
How would these days generalise to another year?
```{r}
daily %>%
daily |>
slice_max(n = 3, resid)
```

View File

@@ -60,7 +60,7 @@ In this case study, we're going to focus on just three variables to answer the q
A good place to start is with a plot:
```{r}
gapminder %>%
gapminder |>
ggplot(aes(year, lifeExp, group = country)) +
geom_line(alpha = 1/3)
```
@@ -79,20 +79,20 @@ You already know how to do that if we had a single country:
```{r, out.width = "33%", fig.asp = 1, fig.width = 3, fig.align='default'}
nz <- filter(gapminder, country == "New Zealand")
nz %>%
nz |>
ggplot(aes(year, lifeExp)) +
geom_line() +
ggtitle("Full data = ")
nz_mod <- lm(lifeExp ~ year, data = nz)
nz %>%
add_predictions(nz_mod) %>%
nz |>
add_predictions(nz_mod) |>
ggplot(aes(year, pred)) +
geom_line() +
ggtitle("Linear trend + ")
nz %>%
add_residuals(nz_mod) %>%
nz |>
add_residuals(nz_mod) |>
ggplot(aes(year, resid)) +
geom_hline(yintercept = 0, colour = "white", size = 3) +
geom_line() +
@@ -111,8 +111,8 @@ To do that, we need a new data structure: the **nested data frame**.
To create a nested data frame we start with a grouped data frame, and "nest" it:
```{r}
by_country <- gapminder %>%
group_by(country, continent) %>%
by_country <- gapminder |>
group_by(country, continent) |>
nest()
by_country
@@ -163,7 +163,7 @@ In other words, instead of creating a new object in the global environment, we'r
That's a job for `dplyr::mutate()`:
```{r}
by_country <- by_country %>%
by_country <- by_country |>
mutate(model = map(data, country_model))
by_country
```
@@ -172,9 +172,9 @@ This has a big advantage: because all the related objects are stored together, y
The semantics of the data frame takes care of that for you:
```{r}
by_country %>%
by_country |>
filter(continent == "Europe")
by_country %>%
by_country |>
arrange(continent, country)
```
@@ -188,7 +188,7 @@ Now we have 142 data frames and 142 models.
To compute the residuals, we need to call `add_residuals()` with each model-data pair:
```{r}
by_country <- by_country %>%
by_country <- by_country |>
mutate(
resids = map2(data, model, add_residuals)
)
@@ -209,7 +209,7 @@ Note that each regular column is repeated once for each row of the nested tibble
Now that we have a regular data frame, we can plot the residuals:
```{r}
resids %>%
resids |>
ggplot(aes(year, resid)) +
geom_line(aes(group = country), alpha = 1 / 3) +
geom_smooth(se = FALSE)
@@ -219,7 +219,7 @@ resids %>%
Facetting by continent is particularly revealing:
```{r}
resids %>%
resids |>
ggplot(aes(year, resid, group = country)) +
geom_line(alpha = 1 / 3) +
facet_wrap(~continent)
@@ -245,9 +245,9 @@ broom::glance(nz_mod)
We can use `mutate()` and `unnest()` to create a data frame with a row for each country:
```{r}
glance <- by_country %>%
mutate(glance = map(model, broom::glance)) %>%
select(country, continent, glance) %>%
glance <- by_country |>
mutate(glance = map(model, broom::glance)) |>
select(country, continent, glance) |>
unnest(glance)
glance
```
@@ -257,7 +257,7 @@ glance
With this data frame in hand, we can start to look for models that don't fit well:
```{r}
glance %>%
glance |>
arrange(r.squared)
```
@@ -266,7 +266,7 @@ Let's double check that with a plot.
Here we have a relatively small number of observations and a discrete variable, so `geom_jitter()` is effective:
```{r}
glance %>%
glance |>
ggplot(aes(continent, r.squared)) +
geom_jitter(width = 0.5)
```
@@ -276,8 +276,8 @@ We could pull out the countries with particularly bad $R^2$ and plot the data:
```{r}
bad_fit <- filter(glance, r.squared < 0.25)
gapminder %>%
semi_join(bad_fit, by = "country") %>%
gapminder |>
semi_join(bad_fit, by = "country") |>
ggplot(aes(year, lifeExp, colour = country)) +
geom_line()
```
@@ -377,15 +377,15 @@ So far you've seen how to use it with a grouped data frame.
When applied to a grouped data frame, `nest()` keeps the grouping columns as is, and bundles everything else into the list-column:
```{r}
gapminder %>%
group_by(country, continent) %>%
gapminder |>
group_by(country, continent) |>
nest()
```
You can also use it on an ungrouped data frame, specifying which columns you want to nest:
```{r}
gapminder %>%
gapminder |>
nest(data = c(year:gdpPercap))
```
@@ -402,15 +402,15 @@ df <- tribble(
"d,e,f,g"
)
df %>%
df |>
mutate(x2 = stringr::str_split(x1, ","))
```
`unnest()` knows how to handle these lists of vectors:
```{r}
df %>%
mutate(x2 = stringr::str_split(x1, ",")) %>%
df |>
mutate(x2 = stringr::str_split(x1, ",")) |>
unnest(x2)
```
@@ -427,7 +427,7 @@ sim <- tribble(
"rpois", list(lambda = 10)
)
sim %>%
sim |>
mutate(sims = invoke_map(f, params, n = 10))
```
@@ -440,8 +440,8 @@ One restriction of `summarise()` is that it only works with summary functions th
That means that you can't use it with functions like `quantile()` that return a vector of arbitrary length:
```{r, error = TRUE}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise(q = quantile(mpg))
```
@@ -449,8 +449,8 @@ You can, however, wrap the result in a list!
This obeys the contract of `summarise()`, because each summary is now a list (a vector) of length 1.
```{r}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise(q = list(quantile(mpg)))
```
@@ -458,9 +458,9 @@ To make useful results with unnest, you'll also need to capture the probabilitie
```{r}
probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
mtcars %>%
group_by(cyl) %>%
summarise(p = list(probs), q = list(quantile(mpg, probs))) %>%
mtcars |>
group_by(cyl) |>
summarise(p = list(probs), q = list(quantile(mpg, probs))) |>
unnest(c(p, q))
```
@@ -486,7 +486,7 @@ The advantage of this structure is that it generalises in a straightforward way
Now if you want to iterate over names and values in parallel, you can use `map2()`:
```{r}
df %>%
df |>
mutate(
smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
)
@@ -503,9 +503,9 @@ df %>%
Why isn't that helpful here?
```{r}
mtcars %>%
group_by(cyl) %>%
summarise(q = list(quantile(mpg))) %>%
mtcars |>
group_by(cyl) |>
summarise(q = list(quantile(mpg))) |>
unnest(q)
```
@@ -513,8 +513,8 @@ df %>%
Why might it be useful?
```{r, eval = FALSE}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise_all(list(list))
```
@@ -542,7 +542,7 @@ df <- tribble(
runif(5)
)
df %>% mutate(
df |> mutate(
type = map_chr(x, typeof),
length = map_int(x, length)
)
@@ -561,7 +561,7 @@ df <- tribble(
list(a = 1, b = 2),
list(a = 2, c = 4)
)
df %>% mutate(
df |> mutate(
a = map_dbl(x, "a"),
b = map_dbl(x, "b", .null = NA_real_)
)
@@ -573,7 +573,7 @@ df %>% mutate(
For example, in the following very simple example we repeat the first row 4 times (because the first element of `y` has length four), and the second row once:
```{r}
tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
tibble(x = 1:2, y = list(1:4, 1)) |> unnest(y)
```
This means that you can't simultaneously unnest two columns that contain different numbers of elements:
@@ -587,7 +587,7 @@ df1 <- tribble(
2, "c", 3
)
df1
df1 %>% unnest(c(y, z))
df1 |> unnest(c(y, z))
# Doesn't work because y and z have different number of elements
df2 <- tribble(
@@ -596,7 +596,7 @@ df2 <- tribble(
2, c("b", "c"), 3
)
df2
df2 %>% unnest(c(y, z))
df2 |> unnest(c(y, z))
```
The same principle applies when unnesting list-columns of data frames.