Convert from %>% to |>
This commit is contained in:
		@@ -88,7 +88,7 @@ One easy place to start is to find the vertical distance between each point and
 | 
			
		||||
(Note that I've shifted the x values slightly so you can see the individual distances.)
 | 
			
		||||
 | 
			
		||||
```{r, echo = FALSE}
 | 
			
		||||
dist1 <- sim1 %>% 
 | 
			
		||||
dist1 <- sim1 |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    dodge = rep(c(-1, 0, 1) / 20, 10),
 | 
			
		||||
    x1 = x + dodge,
 | 
			
		||||
@@ -137,7 +137,7 @@ sim1_dist <- function(a1, a2) {
 | 
			
		||||
  measure_distance(c(a1, a2), sim1)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
models <- models %>% 
 | 
			
		||||
models <- models |> 
 | 
			
		||||
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
 | 
			
		||||
models
 | 
			
		||||
```
 | 
			
		||||
@@ -171,10 +171,10 @@ I picked the parameters of the grid roughly by looking at where the best models
 | 
			
		||||
grid <- expand.grid(
 | 
			
		||||
  a1 = seq(-5, 20, length = 25),
 | 
			
		||||
  a2 = seq(1, 3, length = 25)
 | 
			
		||||
  ) %>% 
 | 
			
		||||
  ) |> 
 | 
			
		||||
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
 | 
			
		||||
 | 
			
		||||
grid %>% 
 | 
			
		||||
grid |> 
 | 
			
		||||
  ggplot(aes(a1, a2)) +
 | 
			
		||||
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
 | 
			
		||||
  geom_point(aes(colour = -dist)) 
 | 
			
		||||
@@ -282,7 +282,7 @@ The easiest way to do that is to use `modelr::data_grid()`.
 | 
			
		||||
Its first argument is a data frame, and for each subsequent argument it finds the unique values of that variable and then generates all combinations:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- sim1 %>% 
 | 
			
		||||
grid <- sim1 |> 
 | 
			
		||||
  data_grid(x) 
 | 
			
		||||
grid
 | 
			
		||||
```
 | 
			
		||||
@@ -294,7 +294,7 @@ We'll use `modelr::add_predictions()` which takes a data frame and a model.
 | 
			
		||||
It adds the predictions from the model to a new column in the data frame:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- grid %>% 
 | 
			
		||||
grid <- grid |> 
 | 
			
		||||
  add_predictions(sim1_mod) 
 | 
			
		||||
grid
 | 
			
		||||
```
 | 
			
		||||
@@ -324,7 +324,7 @@ Note, however, that we use the original dataset, not a manufactured grid.
 | 
			
		||||
This is because to compute residuals we need actual y values.
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
sim1 <- sim1 %>% 
 | 
			
		||||
sim1 <- sim1 |> 
 | 
			
		||||
  add_residuals(sim1_mod)
 | 
			
		||||
sim1
 | 
			
		||||
```
 | 
			
		||||
@@ -444,8 +444,8 @@ We can fit a model to it, and generate predictions:
 | 
			
		||||
```{r}
 | 
			
		||||
mod2 <- lm(y ~ x, data = sim2)
 | 
			
		||||
 | 
			
		||||
grid <- sim2 %>% 
 | 
			
		||||
  data_grid(x) %>% 
 | 
			
		||||
grid <- sim2 |> 
 | 
			
		||||
  data_grid(x) |> 
 | 
			
		||||
  add_predictions(mod2)
 | 
			
		||||
grid
 | 
			
		||||
```
 | 
			
		||||
@@ -463,7 +463,7 @@ You can't make predictions about levels that you didn't observe.
 | 
			
		||||
Sometimes you'll do this by accident so it's good to recognise this error message:
 | 
			
		||||
 | 
			
		||||
```{r, error = TRUE}
 | 
			
		||||
tibble(x = "e") %>% 
 | 
			
		||||
tibble(x = "e") |> 
 | 
			
		||||
  add_predictions(mod2)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -501,8 +501,8 @@ To visualise these models we need two new tricks:
 | 
			
		||||
Together this gives us:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- sim3 %>% 
 | 
			
		||||
  data_grid(x1, x2) %>% 
 | 
			
		||||
grid <- sim3 |> 
 | 
			
		||||
  data_grid(x1, x2) |> 
 | 
			
		||||
  gather_predictions(mod1, mod2)
 | 
			
		||||
grid
 | 
			
		||||
```
 | 
			
		||||
@@ -524,7 +524,7 @@ We can take look at the residuals.
 | 
			
		||||
Here I've facetted by both model and `x2` because it makes it easier to see the pattern within each group.
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
sim3 <- sim3 %>% 
 | 
			
		||||
sim3 <- sim3 |> 
 | 
			
		||||
  gather_residuals(mod1, mod2)
 | 
			
		||||
 | 
			
		||||
ggplot(sim3, aes(x1, resid, colour = x2)) + 
 | 
			
		||||
@@ -547,11 +547,11 @@ Initially things proceed almost identically to the previous example:
 | 
			
		||||
mod1 <- lm(y ~ x1 + x2, data = sim4)
 | 
			
		||||
mod2 <- lm(y ~ x1 * x2, data = sim4)
 | 
			
		||||
 | 
			
		||||
grid <- sim4 %>% 
 | 
			
		||||
grid <- sim4 |> 
 | 
			
		||||
  data_grid(
 | 
			
		||||
    x1 = seq_range(x1, 5), 
 | 
			
		||||
    x2 = seq_range(x2, 5) 
 | 
			
		||||
  ) %>% 
 | 
			
		||||
  ) |> 
 | 
			
		||||
  gather_predictions(mod1, mod2)
 | 
			
		||||
grid
 | 
			
		||||
```
 | 
			
		||||
@@ -687,8 +687,8 @@ mod3 <- lm(y ~ ns(x, 3), data = sim5)
 | 
			
		||||
mod4 <- lm(y ~ ns(x, 4), data = sim5)
 | 
			
		||||
mod5 <- lm(y ~ ns(x, 5), data = sim5)
 | 
			
		||||
 | 
			
		||||
grid <- sim5 %>% 
 | 
			
		||||
  data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>% 
 | 
			
		||||
grid <- sim5 |> 
 | 
			
		||||
  data_grid(x = seq_range(x, n = 50, expand = 0.1)) |> 
 | 
			
		||||
  gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")
 | 
			
		||||
 | 
			
		||||
ggplot(sim5, aes(x, y)) + 
 | 
			
		||||
 
 | 
			
		||||
@@ -71,8 +71,8 @@ But first, lets make a couple of tweaks to the diamonds dataset to make it easie
 | 
			
		||||
2.  Log-transform the carat and price variables.
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
diamonds2 <- diamonds %>% 
 | 
			
		||||
  filter(carat <= 2.5) %>% 
 | 
			
		||||
diamonds2 <- diamonds |> 
 | 
			
		||||
  filter(carat <= 2.5) |> 
 | 
			
		||||
  mutate(lprice = log2(price), lcarat = log2(carat))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -95,10 +95,10 @@ Then we look at what the model tells us about the data.
 | 
			
		||||
Note that I back transform the predictions, undoing the log transformation, so I can overlay the predictions on the raw data:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- diamonds2 %>% 
 | 
			
		||||
  data_grid(carat = seq_range(carat, 20)) %>% 
 | 
			
		||||
  mutate(lcarat = log2(carat)) %>% 
 | 
			
		||||
  add_predictions(mod_diamond, "lprice") %>% 
 | 
			
		||||
grid <- diamonds2 |> 
 | 
			
		||||
  data_grid(carat = seq_range(carat, 20)) |> 
 | 
			
		||||
  mutate(lcarat = log2(carat)) |> 
 | 
			
		||||
  add_predictions(mod_diamond, "lprice") |> 
 | 
			
		||||
  mutate(price = 2 ^ lprice)
 | 
			
		||||
 | 
			
		||||
ggplot(diamonds2, aes(carat, price)) + 
 | 
			
		||||
@@ -113,7 +113,7 @@ This is probably because no diamond in this dataset costs more than \$19,000.
 | 
			
		||||
Now we can look at the residuals, which verifies that we've successfully removed the strong linear pattern:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
diamonds2 <- diamonds2 %>% 
 | 
			
		||||
diamonds2 <- diamonds2 |> 
 | 
			
		||||
  add_residuals(mod_diamond, "lresid")
 | 
			
		||||
 | 
			
		||||
ggplot(diamonds2, aes(lcarat, lresid)) + 
 | 
			
		||||
@@ -147,8 +147,8 @@ Fortunately, they're currently all independent which means that we can plot them
 | 
			
		||||
To make the process a little easier, we're going to use the `.model` argument to `data_grid`:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- diamonds2 %>% 
 | 
			
		||||
  data_grid(cut, .model = mod_diamond2) %>% 
 | 
			
		||||
grid <- diamonds2 |> 
 | 
			
		||||
  data_grid(cut, .model = mod_diamond2) |> 
 | 
			
		||||
  add_predictions(mod_diamond2)
 | 
			
		||||
grid
 | 
			
		||||
 | 
			
		||||
@@ -160,7 +160,7 @@ If the model needs variables that you haven't explicitly supplied, `data_grid()`
 | 
			
		||||
For continuous variables, it uses the median, and for categorical variables it uses the most common value (or values, if there's a tie).
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
diamonds2 <- diamonds2 %>% 
 | 
			
		||||
diamonds2 <- diamonds2 |> 
 | 
			
		||||
  add_residuals(mod_diamond2, "lresid2")
 | 
			
		||||
 | 
			
		||||
ggplot(diamonds2, aes(lcarat, lresid2)) + 
 | 
			
		||||
@@ -171,11 +171,11 @@ This plot indicates that there are some diamonds with quite large residuals - re
 | 
			
		||||
It's often useful to look at unusual values individually:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
diamonds2 %>% 
 | 
			
		||||
  filter(abs(lresid2) > 1) %>% 
 | 
			
		||||
  add_predictions(mod_diamond2) %>% 
 | 
			
		||||
  mutate(pred = round(2 ^ pred)) %>% 
 | 
			
		||||
  select(price, pred, carat:table, x:z) %>% 
 | 
			
		||||
diamonds2 |> 
 | 
			
		||||
  filter(abs(lresid2) > 1) |> 
 | 
			
		||||
  add_predictions(mod_diamond2) |> 
 | 
			
		||||
  mutate(pred = round(2 ^ pred)) |> 
 | 
			
		||||
  select(price, pred, carat:table, x:z) |> 
 | 
			
		||||
  arrange(price)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -203,9 +203,9 @@ This is a really small dataset --- only 365 rows and 2 columns --- and we're not
 | 
			
		||||
Let's get started by counting the number of flights per day and visualising it with ggplot2.
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily <- flights %>% 
 | 
			
		||||
  mutate(date = make_date(year, month, day)) %>% 
 | 
			
		||||
  group_by(date) %>% 
 | 
			
		||||
daily <- flights |> 
 | 
			
		||||
  mutate(date = make_date(year, month, day)) |> 
 | 
			
		||||
  group_by(date) |> 
 | 
			
		||||
  summarise(n = n())
 | 
			
		||||
daily
 | 
			
		||||
 | 
			
		||||
@@ -219,7 +219,7 @@ Understanding the long-term trend is challenging because there's a very strong d
 | 
			
		||||
Let's start by looking at the distribution of flight numbers by day-of-week:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily <- daily %>% 
 | 
			
		||||
daily <- daily |> 
 | 
			
		||||
  mutate(wday = wday(date, label = TRUE))
 | 
			
		||||
ggplot(daily, aes(wday, n)) + 
 | 
			
		||||
  geom_boxplot()
 | 
			
		||||
@@ -234,8 +234,8 @@ First, we fit the model, and display its predictions overlaid on the original da
 | 
			
		||||
```{r}
 | 
			
		||||
mod <- lm(n ~ wday, data = daily)
 | 
			
		||||
 | 
			
		||||
grid <- daily %>% 
 | 
			
		||||
  data_grid(wday) %>% 
 | 
			
		||||
grid <- daily |> 
 | 
			
		||||
  data_grid(wday) |> 
 | 
			
		||||
  add_predictions(mod, "n")
 | 
			
		||||
 | 
			
		||||
ggplot(daily, aes(wday, n)) + 
 | 
			
		||||
@@ -246,9 +246,9 @@ ggplot(daily, aes(wday, n)) +
 | 
			
		||||
Next we compute and visualise the residuals:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily <- daily %>% 
 | 
			
		||||
daily <- daily |> 
 | 
			
		||||
  add_residuals(mod)
 | 
			
		||||
daily %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  ggplot(aes(date, resid)) + 
 | 
			
		||||
  geom_ref_line(h = 0) + 
 | 
			
		||||
  geom_line()
 | 
			
		||||
@@ -272,7 +272,7 @@ This plot is useful because now that we've removed much of the large day-of-week
 | 
			
		||||
2.  There are some days with far fewer flights than expected:
 | 
			
		||||
 | 
			
		||||
    ```{r}
 | 
			
		||||
    daily %>% 
 | 
			
		||||
    daily |> 
 | 
			
		||||
      filter(resid < -100)
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
@@ -284,7 +284,7 @@ This plot is useful because now that we've removed much of the large day-of-week
 | 
			
		||||
    We can highlight that trend with `geom_smooth()`:
 | 
			
		||||
 | 
			
		||||
    ```{r}
 | 
			
		||||
    daily %>% 
 | 
			
		||||
    daily |> 
 | 
			
		||||
      ggplot(aes(date, resid)) + 
 | 
			
		||||
      geom_ref_line(h = 0) + 
 | 
			
		||||
      geom_line(colour = "grey50") + 
 | 
			
		||||
@@ -301,8 +301,8 @@ Let's first tackle our failure to accurately predict the number of flights on Sa
 | 
			
		||||
A good place to start is to go back to the raw numbers, focussing on Saturdays:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily %>% 
 | 
			
		||||
  filter(wday == "Sat") %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  filter(wday == "Sat") |> 
 | 
			
		||||
  ggplot(aes(date, n)) + 
 | 
			
		||||
    geom_point() + 
 | 
			
		||||
    geom_line() +
 | 
			
		||||
@@ -329,11 +329,11 @@ term <- function(date) {
 | 
			
		||||
  )
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
daily <- daily %>% 
 | 
			
		||||
daily <- daily |> 
 | 
			
		||||
  mutate(term = term(date)) 
 | 
			
		||||
 | 
			
		||||
daily %>% 
 | 
			
		||||
  filter(wday == "Sat") %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  filter(wday == "Sat") |> 
 | 
			
		||||
  ggplot(aes(date, n, colour = term)) +
 | 
			
		||||
  geom_point(alpha = 1/3) + 
 | 
			
		||||
  geom_line() +
 | 
			
		||||
@@ -345,7 +345,7 @@ daily %>%
 | 
			
		||||
It's useful to see how this new variable affects the other days of the week:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  ggplot(aes(wday, n, colour = term)) +
 | 
			
		||||
    geom_boxplot()
 | 
			
		||||
```
 | 
			
		||||
@@ -357,8 +357,8 @@ This improves our model, but not as much as we might hope:
 | 
			
		||||
mod1 <- lm(n ~ wday, data = daily)
 | 
			
		||||
mod2 <- lm(n ~ wday * term, data = daily)
 | 
			
		||||
 | 
			
		||||
daily %>% 
 | 
			
		||||
  gather_residuals(without_term = mod1, with_term = mod2) %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  gather_residuals(without_term = mod1, with_term = mod2) |> 
 | 
			
		||||
  ggplot(aes(date, resid, colour = model)) +
 | 
			
		||||
    geom_line(alpha = 0.75)
 | 
			
		||||
```
 | 
			
		||||
@@ -366,8 +366,8 @@ daily %>%
 | 
			
		||||
We can see the problem by overlaying the predictions from the model on to the raw data:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
grid <- daily %>% 
 | 
			
		||||
  data_grid(wday, term) %>% 
 | 
			
		||||
grid <- daily |> 
 | 
			
		||||
  data_grid(wday, term) |> 
 | 
			
		||||
  add_predictions(mod2, "n")
 | 
			
		||||
 | 
			
		||||
ggplot(daily, aes(wday, n)) +
 | 
			
		||||
@@ -383,8 +383,8 @@ This greatly reduces the impact of the outliers on our estimates, and gives a mo
 | 
			
		||||
```{r, warn = FALSE}
 | 
			
		||||
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
 | 
			
		||||
 | 
			
		||||
daily %>% 
 | 
			
		||||
  add_residuals(mod3, "resid") %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  add_residuals(mod3, "resid") |> 
 | 
			
		||||
  ggplot(aes(date, resid)) + 
 | 
			
		||||
  geom_hline(yintercept = 0, size = 2, colour = "white") + 
 | 
			
		||||
  geom_line()
 | 
			
		||||
@@ -399,7 +399,7 @@ For example, we could write:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
compute_vars <- function(data) {
 | 
			
		||||
  data %>% 
 | 
			
		||||
  data |> 
 | 
			
		||||
    mutate(
 | 
			
		||||
      term = term(date), 
 | 
			
		||||
      wday = wday(date, label = TRUE)
 | 
			
		||||
@@ -430,9 +430,9 @@ A simple linear trend isn't adequate, so we could try using a natural spline to
 | 
			
		||||
library(splines)
 | 
			
		||||
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
 | 
			
		||||
 | 
			
		||||
daily %>% 
 | 
			
		||||
  data_grid(wday, date = seq_range(date, n = 13)) %>% 
 | 
			
		||||
  add_predictions(mod) %>% 
 | 
			
		||||
daily |> 
 | 
			
		||||
  data_grid(wday, date = seq_range(date, n = 13)) |> 
 | 
			
		||||
  add_predictions(mod) |> 
 | 
			
		||||
  ggplot(aes(date, pred, colour = wday)) + 
 | 
			
		||||
    geom_line() +
 | 
			
		||||
    geom_point()
 | 
			
		||||
@@ -451,7 +451,7 @@ It's a good sign when you get the same signal from different approaches.
 | 
			
		||||
    How would these days generalise to another year?
 | 
			
		||||
 | 
			
		||||
    ```{r}
 | 
			
		||||
    daily %>% 
 | 
			
		||||
    daily |> 
 | 
			
		||||
      slice_max(n = 3, resid)
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -60,7 +60,7 @@ In this case study, we're going to focus on just three variables to answer the q
 | 
			
		||||
A good place to start is with a plot:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
gapminder %>% 
 | 
			
		||||
gapminder |> 
 | 
			
		||||
  ggplot(aes(year, lifeExp, group = country)) +
 | 
			
		||||
    geom_line(alpha = 1/3)
 | 
			
		||||
```
 | 
			
		||||
@@ -79,20 +79,20 @@ You already know how to do that if we had a single country:
 | 
			
		||||
 | 
			
		||||
```{r, out.width = "33%", fig.asp = 1, fig.width = 3, fig.align='default'}
 | 
			
		||||
nz <- filter(gapminder, country == "New Zealand")
 | 
			
		||||
nz %>% 
 | 
			
		||||
nz |> 
 | 
			
		||||
  ggplot(aes(year, lifeExp)) + 
 | 
			
		||||
  geom_line() + 
 | 
			
		||||
  ggtitle("Full data = ")
 | 
			
		||||
 | 
			
		||||
nz_mod <- lm(lifeExp ~ year, data = nz)
 | 
			
		||||
nz %>% 
 | 
			
		||||
  add_predictions(nz_mod) %>%
 | 
			
		||||
nz |> 
 | 
			
		||||
  add_predictions(nz_mod) |>
 | 
			
		||||
  ggplot(aes(year, pred)) + 
 | 
			
		||||
  geom_line() + 
 | 
			
		||||
  ggtitle("Linear trend + ")
 | 
			
		||||
 | 
			
		||||
nz %>% 
 | 
			
		||||
  add_residuals(nz_mod) %>% 
 | 
			
		||||
nz |> 
 | 
			
		||||
  add_residuals(nz_mod) |> 
 | 
			
		||||
  ggplot(aes(year, resid)) + 
 | 
			
		||||
  geom_hline(yintercept = 0, colour = "white", size = 3) + 
 | 
			
		||||
  geom_line() + 
 | 
			
		||||
@@ -111,8 +111,8 @@ To do that, we need a new data structure: the **nested data frame**.
 | 
			
		||||
To create a nested data frame we start with a grouped data frame, and "nest" it:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
by_country <- gapminder %>% 
 | 
			
		||||
  group_by(country, continent) %>% 
 | 
			
		||||
by_country <- gapminder |> 
 | 
			
		||||
  group_by(country, continent) |> 
 | 
			
		||||
  nest()
 | 
			
		||||
 | 
			
		||||
by_country
 | 
			
		||||
@@ -163,7 +163,7 @@ In other words, instead of creating a new object in the global environment, we'r
 | 
			
		||||
That's a job for `dplyr::mutate()`:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
by_country <- by_country %>% 
 | 
			
		||||
by_country <- by_country |> 
 | 
			
		||||
  mutate(model = map(data, country_model))
 | 
			
		||||
by_country
 | 
			
		||||
```
 | 
			
		||||
@@ -172,9 +172,9 @@ This has a big advantage: because all the related objects are stored together, y
 | 
			
		||||
The semantics of the data frame takes care of that for you:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
by_country %>% 
 | 
			
		||||
by_country |> 
 | 
			
		||||
  filter(continent == "Europe")
 | 
			
		||||
by_country %>% 
 | 
			
		||||
by_country |> 
 | 
			
		||||
  arrange(continent, country)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -188,7 +188,7 @@ Now we have 142 data frames and 142 models.
 | 
			
		||||
To compute the residuals, we need to call `add_residuals()` with each model-data pair:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
by_country <- by_country %>% 
 | 
			
		||||
by_country <- by_country |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    resids = map2(data, model, add_residuals)
 | 
			
		||||
  )
 | 
			
		||||
@@ -209,7 +209,7 @@ Note that each regular column is repeated once for each row of the nested tibble
 | 
			
		||||
Now that we have a regular data frame, we can plot the residuals:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
resids %>% 
 | 
			
		||||
resids |> 
 | 
			
		||||
  ggplot(aes(year, resid)) +
 | 
			
		||||
    geom_line(aes(group = country), alpha = 1 / 3) + 
 | 
			
		||||
    geom_smooth(se = FALSE)
 | 
			
		||||
@@ -219,7 +219,7 @@ resids %>%
 | 
			
		||||
Facetting by continent is particularly revealing:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
resids %>% 
 | 
			
		||||
resids |> 
 | 
			
		||||
  ggplot(aes(year, resid, group = country)) +
 | 
			
		||||
    geom_line(alpha = 1 / 3) + 
 | 
			
		||||
    facet_wrap(~continent)
 | 
			
		||||
@@ -245,9 +245,9 @@ broom::glance(nz_mod)
 | 
			
		||||
We can use `mutate()` and `unnest()` to create a data frame with a row for each country:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
glance <- by_country %>% 
 | 
			
		||||
  mutate(glance = map(model, broom::glance)) %>% 
 | 
			
		||||
  select(country, continent, glance) %>% 
 | 
			
		||||
glance <- by_country |> 
 | 
			
		||||
  mutate(glance = map(model, broom::glance)) |> 
 | 
			
		||||
  select(country, continent, glance) |> 
 | 
			
		||||
  unnest(glance)
 | 
			
		||||
glance
 | 
			
		||||
```
 | 
			
		||||
@@ -257,7 +257,7 @@ glance
 | 
			
		||||
With this data frame in hand, we can start to look for models that don't fit well:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
glance %>% 
 | 
			
		||||
glance |> 
 | 
			
		||||
  arrange(r.squared)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -266,7 +266,7 @@ Let's double check that with a plot.
 | 
			
		||||
Here we have a relatively small number of observations and a discrete variable, so `geom_jitter()` is effective:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
glance %>% 
 | 
			
		||||
glance |> 
 | 
			
		||||
  ggplot(aes(continent, r.squared)) + 
 | 
			
		||||
    geom_jitter(width = 0.5)
 | 
			
		||||
```
 | 
			
		||||
@@ -276,8 +276,8 @@ We could pull out the countries with particularly bad $R^2$ and plot the data:
 | 
			
		||||
```{r}
 | 
			
		||||
bad_fit <- filter(glance, r.squared < 0.25)
 | 
			
		||||
 | 
			
		||||
gapminder %>% 
 | 
			
		||||
  semi_join(bad_fit, by = "country") %>% 
 | 
			
		||||
gapminder |> 
 | 
			
		||||
  semi_join(bad_fit, by = "country") |> 
 | 
			
		||||
  ggplot(aes(year, lifeExp, colour = country)) +
 | 
			
		||||
    geom_line()
 | 
			
		||||
```
 | 
			
		||||
@@ -377,15 +377,15 @@ So far you've seen how to use it with a grouped data frame.
 | 
			
		||||
When applied to a grouped data frame, `nest()` keeps the grouping columns as is, and bundles everything else into the list-column:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
gapminder %>% 
 | 
			
		||||
  group_by(country, continent) %>% 
 | 
			
		||||
gapminder |> 
 | 
			
		||||
  group_by(country, continent) |> 
 | 
			
		||||
  nest()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
You can also use it on an ungrouped data frame, specifying which columns you want to nest:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
gapminder %>% 
 | 
			
		||||
gapminder |> 
 | 
			
		||||
  nest(data = c(year:gdpPercap))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -402,15 +402,15 @@ df <- tribble(
 | 
			
		||||
  "d,e,f,g"
 | 
			
		||||
) 
 | 
			
		||||
 | 
			
		||||
df %>% 
 | 
			
		||||
df |> 
 | 
			
		||||
  mutate(x2 = stringr::str_split(x1, ","))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
`unnest()` knows how to handle these lists of vectors:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% 
 | 
			
		||||
  mutate(x2 = stringr::str_split(x1, ",")) %>% 
 | 
			
		||||
df |> 
 | 
			
		||||
  mutate(x2 = stringr::str_split(x1, ",")) |> 
 | 
			
		||||
  unnest(x2)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -427,7 +427,7 @@ sim <- tribble(
 | 
			
		||||
  "rpois", list(lambda = 10)
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
sim %>%
 | 
			
		||||
sim |>
 | 
			
		||||
  mutate(sims = invoke_map(f, params, n = 10))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -440,8 +440,8 @@ One restriction of `summarise()` is that it only works with summary functions th
 | 
			
		||||
That means that you can't use it with functions like `quantile()` that return a vector of arbitrary length:
 | 
			
		||||
 | 
			
		||||
```{r, error = TRUE}
 | 
			
		||||
mtcars %>% 
 | 
			
		||||
  group_by(cyl) %>% 
 | 
			
		||||
mtcars |> 
 | 
			
		||||
  group_by(cyl) |> 
 | 
			
		||||
  summarise(q = quantile(mpg))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -449,8 +449,8 @@ You can however, wrap the result in a list!
 | 
			
		||||
This obeys the contract of `summarise()`, because each summary is now a list (a vector) of length 1.
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
mtcars %>% 
 | 
			
		||||
  group_by(cyl) %>% 
 | 
			
		||||
mtcars |> 
 | 
			
		||||
  group_by(cyl) |> 
 | 
			
		||||
  summarise(q = list(quantile(mpg)))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -458,9 +458,9 @@ To make useful results with unnest, you'll also need to capture the probabilitie
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
 | 
			
		||||
mtcars %>% 
 | 
			
		||||
  group_by(cyl) %>% 
 | 
			
		||||
  summarise(p = list(probs), q = list(quantile(mpg, probs))) %>% 
 | 
			
		||||
mtcars |> 
 | 
			
		||||
  group_by(cyl) |> 
 | 
			
		||||
  summarise(p = list(probs), q = list(quantile(mpg, probs))) |> 
 | 
			
		||||
  unnest(c(p, q))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@@ -486,7 +486,7 @@ The advantage of this structure is that it generalises in a straightforward way
 | 
			
		||||
Now if you want to iterate over names and values in parallel, you can use `map2()`:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% 
 | 
			
		||||
df |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
 | 
			
		||||
  )
 | 
			
		||||
@@ -503,9 +503,9 @@ df %>%
 | 
			
		||||
    Why isn't that helpful here?
 | 
			
		||||
 | 
			
		||||
    ```{r}
 | 
			
		||||
    mtcars %>% 
 | 
			
		||||
      group_by(cyl) %>% 
 | 
			
		||||
      summarise(q = list(quantile(mpg))) %>% 
 | 
			
		||||
    mtcars |> 
 | 
			
		||||
      group_by(cyl) |> 
 | 
			
		||||
      summarise(q = list(quantile(mpg))) |> 
 | 
			
		||||
      unnest(q)
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
@@ -513,8 +513,8 @@ df %>%
 | 
			
		||||
    Why might it be useful?
 | 
			
		||||
 | 
			
		||||
    ```{r, eval = FALSE}
 | 
			
		||||
    mtcars %>% 
 | 
			
		||||
      group_by(cyl) %>% 
 | 
			
		||||
    mtcars |> 
 | 
			
		||||
      group_by(cyl) |> 
 | 
			
		||||
      summarise_all(list(list))
 | 
			
		||||
    ```
 | 
			
		||||
 | 
			
		||||
@@ -542,7 +542,7 @@ df <- tribble(
 | 
			
		||||
  runif(5)
 | 
			
		||||
)
 | 
			
		||||
  
 | 
			
		||||
df %>% mutate(
 | 
			
		||||
df |> mutate(
 | 
			
		||||
  type = map_chr(x, typeof),
 | 
			
		||||
  length = map_int(x, length)
 | 
			
		||||
)
 | 
			
		||||
@@ -561,7 +561,7 @@ df <- tribble(
 | 
			
		||||
  list(a = 1, b = 2),
 | 
			
		||||
  list(a = 2, c = 4)
 | 
			
		||||
)
 | 
			
		||||
df %>% mutate(
 | 
			
		||||
df |> mutate(
 | 
			
		||||
  a = map_dbl(x, "a"),
 | 
			
		||||
  b = map_dbl(x, "b", .null = NA_real_)
 | 
			
		||||
)
 | 
			
		||||
@@ -573,7 +573,7 @@ df %>% mutate(
 | 
			
		||||
For example, in the following very simple example we repeat the first row 4 times (because the first element of `y` has length four), and the second row once:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
 | 
			
		||||
tibble(x = 1:2, y = list(1:4, 1)) |> unnest(y)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This means that you can't simultaneously unnest two columns that contain a different number of elements:
 | 
			
		||||
@@ -587,7 +587,7 @@ df1 <- tribble(
 | 
			
		||||
   2, "c",           3
 | 
			
		||||
)
 | 
			
		||||
df1
 | 
			
		||||
df1 %>% unnest(c(y, z))
 | 
			
		||||
df1 |> unnest(c(y, z))
 | 
			
		||||
 | 
			
		||||
# Doesn't work because y and z have different numbers of elements
 | 
			
		||||
df2 <- tribble(
 | 
			
		||||
@@ -596,7 +596,7 @@ df2 <- tribble(
 | 
			
		||||
   2, c("b", "c"),   3
 | 
			
		||||
)
 | 
			
		||||
df2
 | 
			
		||||
df2 %>% unnest(c(y, z))
 | 
			
		||||
df2 |> unnest(c(y, z))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
The same principle applies when unnesting list-columns of data frames.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user