Convert from %>% to |>

2022-02-23 13:15:52 -06:00
parent 1b0c50894a
commit da0fbd50d5
36 changed files with 542 additions and 552 deletions
--- a/extra/model/model-basics.Rmd
+++ b/extra/model/model-basics.Rmd
@@ -88,7 +88,7 @@ One easy place to start is to find the vertical distance between each point and
 (Note that I've shifted the x values slightly so you can see the individual distances.)

 ```{r, echo = FALSE}
-dist1 <- sim1 %>% 
+dist1 <- sim1 |> 
  mutate(
    dodge = rep(c(-1, 0, 1) / 20, 10),
    x1 = x + dodge,
@@ -137,7 +137,7 @@ sim1_dist <- function(a1, a2) {
  measure_distance(c(a1, a2), sim1)
 }

-models <- models %>% 
+models <- models |> 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
 models
 ```
@@ -171,10 +171,10 @@ I picked the parameters of the grid roughly by looking at where the best models
 grid <- expand.grid(
  a1 = seq(-5, 20, length = 25),
  a2 = seq(1, 3, length = 25)
-  ) %>% 
+  ) |> 
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))

-grid %>% 
+grid |> 
  ggplot(aes(a1, a2)) +
  geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
  geom_point(aes(colour = -dist)) 
@@ -282,7 +282,7 @@ The easiest way to do that is to use `modelr::data_grid()`.
 Its first argument is a data frame, and for each subsequent argument it finds the unique values and then generates all combinations:

 ```{r}
-grid <- sim1 %>% 
+grid <- sim1 |> 
  data_grid(x) 
 grid
 ```
@@ -294,7 +294,7 @@ We'll use `modelr::add_predictions()` which takes a data frame and a model.
 It adds the predictions from the model to a new column in the data frame:

 ```{r}
-grid <- grid %>% 
+grid <- grid |> 
  add_predictions(sim1_mod) 
 grid
 ```
@@ -324,7 +324,7 @@ Note, however, that we use the original dataset, not a manufactured grid.
 This is because to compute residuals we need actual y values.

 ```{r}
-sim1 <- sim1 %>% 
+sim1 <- sim1 |> 
  add_residuals(sim1_mod)
 sim1
 ```
@@ -444,8 +444,8 @@ We can fit a model to it, and generate predictions:
 ```{r}
 mod2 <- lm(y ~ x, data = sim2)

-grid <- sim2 %>% 
-  data_grid(x) %>% 
+grid <- sim2 |> 
+  data_grid(x) |> 
  add_predictions(mod2)
 grid
 ```
@@ -463,7 +463,7 @@ You can't make predictions about levels that you didn't observe.
 Sometimes you'll do this by accident so it's good to recognise this error message:

 ```{r, error = TRUE}
-tibble(x = "e") %>% 
+tibble(x = "e") |> 
  add_predictions(mod2)
 ```

@@ -501,8 +501,8 @@ To visualise these models we need two new tricks:
 Together this gives us:

 ```{r}
-grid <- sim3 %>% 
-  data_grid(x1, x2) %>% 
+grid <- sim3 |> 
+  data_grid(x1, x2) |> 
  gather_predictions(mod1, mod2)
 grid
 ```
@@ -524,7 +524,7 @@ We can take look at the residuals.
 Here I've facetted by both model and `x2` because it makes it easier to see the pattern within each group.

 ```{r}
-sim3 <- sim3 %>% 
+sim3 <- sim3 |> 
  gather_residuals(mod1, mod2)

 ggplot(sim3, aes(x1, resid, colour = x2)) + 
@@ -547,11 +547,11 @@ Initially things proceed almost identically to the previous example:
 mod1 <- lm(y ~ x1 + x2, data = sim4)
 mod2 <- lm(y ~ x1 * x2, data = sim4)

-grid <- sim4 %>% 
+grid <- sim4 |> 
  data_grid(
    x1 = seq_range(x1, 5), 
    x2 = seq_range(x2, 5) 
-  ) %>% 
+  ) |> 
  gather_predictions(mod1, mod2)
 grid
 ```
@@ -687,8 +687,8 @@ mod3 <- lm(y ~ ns(x, 3), data = sim5)
 mod4 <- lm(y ~ ns(x, 4), data = sim5)
 mod5 <- lm(y ~ ns(x, 5), data = sim5)

-grid <- sim5 %>% 
-  data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>% 
+grid <- sim5 |> 
+  data_grid(x = seq_range(x, n = 50, expand = 0.1)) |> 
  gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")

 ggplot(sim5, aes(x, y)) + 
--- a/extra/model/model-building.Rmd
+++ b/extra/model/model-building.Rmd
@@ -71,8 +71,8 @@ But first, lets make a couple of tweaks to the diamonds dataset to make it easie
 2.  Log-transform the carat and price variables.

 ```{r}
-diamonds2 <- diamonds %>% 
-  filter(carat <= 2.5) %>% 
+diamonds2 <- diamonds |> 
+  filter(carat <= 2.5) |> 
  mutate(lprice = log2(price), lcarat = log2(carat))
 ```

@@ -95,10 +95,10 @@ Then we look at what the model tells us about the data.
 Note that I back transform the predictions, undoing the log transformation, so I can overlay the predictions on the raw data:

 ```{r}
-grid <- diamonds2 %>% 
-  data_grid(carat = seq_range(carat, 20)) %>% 
-  mutate(lcarat = log2(carat)) %>% 
-  add_predictions(mod_diamond, "lprice") %>% 
+grid <- diamonds2 |> 
+  data_grid(carat = seq_range(carat, 20)) |> 
+  mutate(lcarat = log2(carat)) |> 
+  add_predictions(mod_diamond, "lprice") |> 
  mutate(price = 2 ^ lprice)

 ggplot(diamonds2, aes(carat, price)) + 
@@ -113,7 +113,7 @@ This is probably because no diamond in this dataset costs more than \$19,000.
 Now we can look at the residuals, which verifies that we've successfully removed the strong linear pattern:

 ```{r}
-diamonds2 <- diamonds2 %>% 
+diamonds2 <- diamonds2 |> 
  add_residuals(mod_diamond, "lresid")

 ggplot(diamonds2, aes(lcarat, lresid)) + 
@@ -147,8 +147,8 @@ Fortunately, they're currently all independent which means that we can plot them
 To make the process a little easier, we're going to use the `.model` argument to `data_grid`:

 ```{r}
-grid <- diamonds2 %>% 
-  data_grid(cut, .model = mod_diamond2) %>% 
+grid <- diamonds2 |> 
+  data_grid(cut, .model = mod_diamond2) |> 
  add_predictions(mod_diamond2)
 grid

@@ -160,7 +160,7 @@ If the model needs variables that you haven't explicitly supplied, `data_grid()`
 For continuous variables, it uses the median, and for categorical variables it uses the most common value (or values, if there's a tie).

 ```{r}
-diamonds2 <- diamonds2 %>% 
+diamonds2 <- diamonds2 |> 
  add_residuals(mod_diamond2, "lresid2")

 ggplot(diamonds2, aes(lcarat, lresid2)) + 
@@ -171,11 +171,11 @@ This plot indicates that there are some diamonds with quite large residuals - re
 It's often useful to look at unusual values individually:

 ```{r}
-diamonds2 %>% 
-  filter(abs(lresid2) > 1) %>% 
-  add_predictions(mod_diamond2) %>% 
-  mutate(pred = round(2 ^ pred)) %>% 
-  select(price, pred, carat:table, x:z) %>% 
+diamonds2 |> 
+  filter(abs(lresid2) > 1) |> 
+  add_predictions(mod_diamond2) |> 
+  mutate(pred = round(2 ^ pred)) |> 
+  select(price, pred, carat:table, x:z) |> 
  arrange(price)
 ```

@@ -203,9 +203,9 @@ This is a really small dataset --- only 365 rows and 2 columns --- and we're not
 Let's get started by counting the number of flights per day and visualising it with ggplot2.

 ```{r}
-daily <- flights %>% 
-  mutate(date = make_date(year, month, day)) %>% 
-  group_by(date) %>% 
+daily <- flights |> 
+  mutate(date = make_date(year, month, day)) |> 
+  group_by(date) |> 
  summarise(n = n())
 daily

@@ -219,7 +219,7 @@ Understanding the long-term trend is challenging because there's a very strong d
 Let's start by looking at the distribution of flight numbers by day-of-week:

 ```{r}
-daily <- daily %>% 
+daily <- daily |> 
  mutate(wday = wday(date, label = TRUE))
 ggplot(daily, aes(wday, n)) + 
  geom_boxplot()
@@ -234,8 +234,8 @@ First, we fit the model, and display its predictions overlaid on the original da
 ```{r}
 mod <- lm(n ~ wday, data = daily)

-grid <- daily %>% 
-  data_grid(wday) %>% 
+grid <- daily |> 
+  data_grid(wday) |> 
  add_predictions(mod, "n")

 ggplot(daily, aes(wday, n)) + 
@@ -246,9 +246,9 @@ ggplot(daily, aes(wday, n)) +
 Next we compute and visualise the residuals:

 ```{r}
-daily <- daily %>% 
+daily <- daily |> 
  add_residuals(mod)
-daily %>% 
+daily |> 
  ggplot(aes(date, resid)) + 
  geom_ref_line(h = 0) + 
  geom_line()
@@ -272,7 +272,7 @@ This plot is useful because now that we've removed much of the large day-of-week
 2.  There are some days with far fewer flights than expected:

    ```{r}
-    daily %>% 
+    daily |> 
      filter(resid < -100)
    ```

@@ -284,7 +284,7 @@ This plot is useful because now that we've removed much of the large day-of-week
    We can highlight that trend with `geom_smooth()`:

    ```{r}
-    daily %>% 
+    daily |> 
      ggplot(aes(date, resid)) + 
      geom_ref_line(h = 0) + 
      geom_line(colour = "grey50") + 
@@ -301,8 +301,8 @@ Let's first tackle our failure to accurately predict the number of flights on Sa
 A good place to start is to go back to the raw numbers, focussing on Saturdays:

 ```{r}
-daily %>% 
-  filter(wday == "Sat") %>% 
+daily |> 
+  filter(wday == "Sat") |> 
  ggplot(aes(date, n)) + 
    geom_point() + 
    geom_line() +
@@ -329,11 +329,11 @@ term <- function(date) {
  )
 }

-daily <- daily %>% 
+daily <- daily |> 
  mutate(term = term(date)) 

-daily %>% 
-  filter(wday == "Sat") %>% 
+daily |> 
+  filter(wday == "Sat") |> 
  ggplot(aes(date, n, colour = term)) +
  geom_point(alpha = 1/3) + 
  geom_line() +
@@ -345,7 +345,7 @@ daily %>%
 It's useful to see how this new variable affects the other days of the week:

 ```{r}
-daily %>% 
+daily |> 
  ggplot(aes(wday, n, colour = term)) +
    geom_boxplot()
 ```
@@ -357,8 +357,8 @@ This improves our model, but not as much as we might hope:
 mod1 <- lm(n ~ wday, data = daily)
 mod2 <- lm(n ~ wday * term, data = daily)

-daily %>% 
-  gather_residuals(without_term = mod1, with_term = mod2) %>% 
+daily |> 
+  gather_residuals(without_term = mod1, with_term = mod2) |> 
  ggplot(aes(date, resid, colour = model)) +
    geom_line(alpha = 0.75)
 ```
@@ -366,8 +366,8 @@ daily %>%
 We can see the problem by overlaying the predictions from the model on to the raw data:

 ```{r}
-grid <- daily %>% 
-  data_grid(wday, term) %>% 
+grid <- daily |> 
+  data_grid(wday, term) |> 
  add_predictions(mod2, "n")

 ggplot(daily, aes(wday, n)) +
@@ -383,8 +383,8 @@ This greatly reduces the impact of the outliers on our estimates, and gives a mo
 ```{r, warn = FALSE}
 mod3 <- MASS::rlm(n ~ wday * term, data = daily)

-daily %>% 
-  add_residuals(mod3, "resid") %>% 
+daily |> 
+  add_residuals(mod3, "resid") |> 
  ggplot(aes(date, resid)) + 
  geom_hline(yintercept = 0, size = 2, colour = "white") + 
  geom_line()
@@ -399,7 +399,7 @@ For example, we could write:

 ```{r}
 compute_vars <- function(data) {
-  data %>% 
+  data |> 
    mutate(
      term = term(date), 
      wday = wday(date, label = TRUE)
@@ -430,9 +430,9 @@ A simple linear trend isn't adequate, so we could try using a natural spline to
 library(splines)
 mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)

-daily %>% 
-  data_grid(wday, date = seq_range(date, n = 13)) %>% 
-  add_predictions(mod) %>% 
+daily |> 
+  data_grid(wday, date = seq_range(date, n = 13)) |> 
+  add_predictions(mod) |> 
  ggplot(aes(date, pred, colour = wday)) + 
    geom_line() +
    geom_point()
@@ -451,7 +451,7 @@ It's a good sign when you get the same signal from different approaches.
    How would these days generalise to another year?

    ```{r}
-    daily %>% 
+    daily |> 
      slice_max(n = 3, resid)
    ```

--- a/extra/model/model-many.Rmd
+++ b/extra/model/model-many.Rmd
@@ -60,7 +60,7 @@ In this case study, we're going to focus on just three variables to answer the q
 A good place to start is with a plot:

 ```{r}
-gapminder %>% 
+gapminder |> 
  ggplot(aes(year, lifeExp, group = country)) +
    geom_line(alpha = 1/3)
 ```
@@ -79,20 +79,20 @@ You already know how to do that if we had a single country:

 ```{r, out.width = "33%", fig.asp = 1, fig.width = 3, fig.align='default'}
 nz <- filter(gapminder, country == "New Zealand")
-nz %>% 
+nz |> 
  ggplot(aes(year, lifeExp)) + 
  geom_line() + 
  ggtitle("Full data = ")

 nz_mod <- lm(lifeExp ~ year, data = nz)
-nz %>% 
-  add_predictions(nz_mod) %>%
+nz |> 
+  add_predictions(nz_mod) |>
  ggplot(aes(year, pred)) + 
  geom_line() + 
  ggtitle("Linear trend + ")

-nz %>% 
-  add_residuals(nz_mod) %>% 
+nz |> 
+  add_residuals(nz_mod) |> 
  ggplot(aes(year, resid)) + 
  geom_hline(yintercept = 0, colour = "white", size = 3) + 
  geom_line() + 
@@ -111,8 +111,8 @@ To do that, we need a new data structure: the **nested data frame**.
 To create a nested data frame we start with a grouped data frame, and "nest" it:

 ```{r}
-by_country <- gapminder %>% 
-  group_by(country, continent) %>% 
+by_country <- gapminder |> 
+  group_by(country, continent) |> 
  nest()

 by_country
@@ -163,7 +163,7 @@ In other words, instead of creating a new object in the global environment, we'r
 That's a job for `dplyr::mutate()`:

 ```{r}
-by_country <- by_country %>% 
+by_country <- by_country |> 
  mutate(model = map(data, country_model))
 by_country
 ```
@@ -172,9 +172,9 @@ This has a big advantage: because all the related objects are stored together, y
 The semantics of the data frame takes care of that for you:

 ```{r}
-by_country %>% 
+by_country |> 
  filter(continent == "Europe")
-by_country %>% 
+by_country |> 
  arrange(continent, country)
 ```

@@ -188,7 +188,7 @@ Now we have 142 data frames and 142 models.
 To compute the residuals, we need to call `add_residuals()` with each model-data pair:

 ```{r}
-by_country <- by_country %>% 
+by_country <- by_country |> 
  mutate(
    resids = map2(data, model, add_residuals)
  )
@@ -209,7 +209,7 @@ Note that each regular column is repeated once for each row of the nested tibble
 Now that we have a regular data frame, we can plot the residuals:

 ```{r}
-resids %>% 
+resids |> 
  ggplot(aes(year, resid)) +
    geom_line(aes(group = country), alpha = 1 / 3) + 
    geom_smooth(se = FALSE)
@@ -219,7 +219,7 @@ resids %>%
 Facetting by continent is particularly revealing:

 ```{r}
-resids %>% 
+resids |> 
  ggplot(aes(year, resid, group = country)) +
    geom_line(alpha = 1 / 3) + 
    facet_wrap(~continent)
@@ -245,9 +245,9 @@ broom::glance(nz_mod)
 We can use `mutate()` and `unnest()` to create a data frame with a row for each country:

 ```{r}
-glance <- by_country %>% 
-  mutate(glance = map(model, broom::glance)) %>% 
-  select(country, continent, glance) %>% 
+glance <- by_country |> 
+  mutate(glance = map(model, broom::glance)) |> 
+  select(country, continent, glance) |> 
  unnest(glance)
 glance
 ```
@@ -257,7 +257,7 @@ glance
 With this data frame in hand, we can start to look for models that don't fit well:

 ```{r}
-glance %>% 
+glance |> 
  arrange(r.squared)
 ```

@@ -266,7 +266,7 @@ Let's double check that with a plot.
 Here we have a relatively small number of observations and a discrete variable, so `geom_jitter()` is effective:

 ```{r}
-glance %>% 
+glance |> 
  ggplot(aes(continent, r.squared)) + 
    geom_jitter(width = 0.5)
 ```
@@ -276,8 +276,8 @@ We could pull out the countries with particularly bad $R^2$ and plot the data:
 ```{r}
 bad_fit <- filter(glance, r.squared < 0.25)

-gapminder %>% 
-  semi_join(bad_fit, by = "country") %>% 
+gapminder |> 
+  semi_join(bad_fit, by = "country") |> 
  ggplot(aes(year, lifeExp, colour = country)) +
    geom_line()
 ```
@@ -377,15 +377,15 @@ So far you've seen how to use it with a grouped data frame.
 When applied to a grouped data frame, `nest()` keeps the grouping columns as is, and bundles everything else into the list-column:

 ```{r}
-gapminder %>% 
-  group_by(country, continent) %>% 
+gapminder |> 
+  group_by(country, continent) |> 
  nest()
 ```

 You can also use it on an ungrouped data frame, specifying which columns you want to nest:

 ```{r}
-gapminder %>% 
+gapminder |> 
  nest(data = c(year:gdpPercap))
 ```

@@ -402,15 +402,15 @@ df <- tribble(
  "d,e,f,g"
 ) 

-df %>% 
+df |> 
  mutate(x2 = stringr::str_split(x1, ","))
 ```

 `unnest()` knows how to handle these lists of vectors:

 ```{r}
-df %>% 
-  mutate(x2 = stringr::str_split(x1, ",")) %>% 
+df |> 
+  mutate(x2 = stringr::str_split(x1, ",")) |> 
  unnest(x2)
 ```

@@ -427,7 +427,7 @@ sim <- tribble(
  "rpois", list(lambda = 10)
 )

-sim %>%
+sim |>
  mutate(sims = invoke_map(f, params, n = 10))
 ```

@@ -440,8 +440,8 @@ One restriction of `summarise()` is that it only works with summary functions th
 That means that you can't use it with functions like `quantile()` that return a vector of arbitrary length:

 ```{r, error = TRUE}
-mtcars %>% 
-  group_by(cyl) %>% 
+mtcars |> 
+  group_by(cyl) |> 
  summarise(q = quantile(mpg))
 ```

@@ -449,8 +449,8 @@ You can however, wrap the result in a list!
 This obeys the contract of `summarise()`, because each summary is now a list (a vector) of length 1.

 ```{r}
-mtcars %>% 
-  group_by(cyl) %>% 
+mtcars |> 
+  group_by(cyl) |> 
  summarise(q = list(quantile(mpg)))
 ```

@@ -458,9 +458,9 @@ To make useful results with unnest, you'll also need to capture the probabilitie

 ```{r}
 probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
-mtcars %>% 
-  group_by(cyl) %>% 
-  summarise(p = list(probs), q = list(quantile(mpg, probs))) %>% 
+mtcars |> 
+  group_by(cyl) |> 
+  summarise(p = list(probs), q = list(quantile(mpg, probs))) |> 
  unnest(c(p, q))
 ```

@@ -486,7 +486,7 @@ The advantage of this structure is that it generalises in a straightforward way
 Now if you want to iterate over names and values in parallel, you can use `map2()`:

 ```{r}
-df %>% 
+df |> 
  mutate(
    smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
  )
@@ -503,9 +503,9 @@ df %>%
    Why isn't that helpful here?

    ```{r}
-    mtcars %>% 
-      group_by(cyl) %>% 
-      summarise(q = list(quantile(mpg))) %>% 
+    mtcars |> 
+      group_by(cyl) |> 
+      summarise(q = list(quantile(mpg))) |> 
      unnest(q)
    ```

@@ -513,8 +513,8 @@ df %>%
    Why might it be useful?

    ```{r, eval = FALSE}
-    mtcars %>% 
-      group_by(cyl) %>% 
+    mtcars |> 
+      group_by(cyl) |> 
      summarise_all(list(list))
    ```

@@ -542,7 +542,7 @@ df <- tribble(
  runif(5)
 )
  
-df %>% mutate(
+df |> mutate(
  type = map_chr(x, typeof),
  length = map_int(x, length)
 )
@@ -561,7 +561,7 @@ df <- tribble(
  list(a = 1, b = 2),
  list(a = 2, c = 4)
 )
-df %>% mutate(
+df |> mutate(
  a = map_dbl(x, "a"),
  b = map_dbl(x, "b", .null = NA_real_)
 )
@@ -573,7 +573,7 @@ df %>% mutate(
 For example, in the following very simple example we repeat the first row 4 times (because the first element of `y` has length four), and the second row once:

 ```{r}
-tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
+tibble(x = 1:2, y = list(1:4, 1)) |> unnest(y)
 ```

 This means that you can't simultaneously unnest two columns that contain a different number of elements:
@@ -587,7 +587,7 @@ df1 <- tribble(
   2, "c",           3
 )
 df1
-df1 %>% unnest(c(y, z))
+df1 |> unnest(c(y, z))

 # Doesn't work because y and z have different number of elements
 df2 <- tribble(
@@ -596,7 +596,7 @@ df2 <- tribble(
   2, c("b", "c"),   3
 )
 df2
-df2 %>% unnest(c(y, z))
+df2 |> unnest(c(y, z))
 ```

 The same principle applies when unnesting list-columns of data frames.