Convert from %>% to |>

Hadley Wickham
2022-02-23 13:15:52 -06:00
parent 1b0c50894a
commit da0fbd50d5
36 changed files with 542 additions and 552 deletions
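Both pipes pass the result of the left-hand side as the first argument of the right-hand call; `|>` has been built into base R since 4.1.0, while `%>%` comes from the magrittr package. A minimal sketch of the equivalence, using the built-in `mtcars` dataset (not part of any file in this commit):

```{r}
library(magrittr)

mtcars %>% head(2) # magrittr pipe: needs magrittr (or the tidyverse) attached
mtcars |> head(2)  # base pipe: no package needed, but requires R >= 4.1.0
```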

View File

@@ -51,9 +51,9 @@ To use hierarchical clustering in R, begin by selecting the numeric columns from
```{r}
small_iris <- sample_n(iris, 50)
iris_hclust <- small_iris %>%
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
dist() %>%
iris_hclust <- small_iris |>
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
dist() |>
hclust(method = "complete")
```
@@ -92,10 +92,10 @@ knitr::include_graphics("images/EDA-linkage.png")
```{r fig.height = 4}
small_iris %>%
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
dist() %>%
hclust(method = "single") %>%
small_iris |>
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
dist() |>
hclust(method = "single") |>
plot(labels = small_iris$Species)
```
@@ -116,8 +116,8 @@ knitr::include_graphics("images/EDA-kmeans.png")
Use `kmeans()` to perform k-means clustering in R. As with hierarchical clustering, you can only apply k-means clustering to numerical data. Pass your numerical data to the `kmeans()` function, then set `centers` to the number of clusters to search for ($k$) and `nstart` to the number of simulations to run. Since the results of k-means clustering depend on the initial assignment of points to groups, which is random, R will run `nstart` simulations and then return the best results (as measured by the minimum sum of squared distances between each point and the centroid of the group it is assigned to). Finally, set the maximum number of iterations (`iter.max`) to let each simulation run in case the simulation cannot quickly find a stable grouping.
```{r}
iris_kmeans <- small_iris %>%
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) %>%
iris_kmeans <- small_iris |>
select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) |>
kmeans(centers = 3, nstart = 20, iter.max = 50)
iris_kmeans$cluster
@@ -129,8 +129,8 @@ Unlike `hclust()`, the k-means algorithm does not provide an intuitive visual in
ggplot(small_iris, aes(x = Sepal.Width, y = Sepal.Length)) +
geom_point(aes(color = factor(iris_kmeans$cluster)))
small_iris %>%
group_by(iris_kmeans$cluster) %>%
small_iris |>
group_by(iris_kmeans$cluster) |>
summarise(n_obs = n(), avg_width = mean(Sepal.Width), avg_length = mean(Sepal.Length))
```

View File

@@ -24,7 +24,7 @@ First, let's address a distraction: the data is censored in an odd way. The y va
```{r}
n <- nrow(heights)
heights <- heights %>% filter(income < 150000)
heights <- heights |> filter(income < 150000)
nrow(heights) / n
```
@@ -33,7 +33,7 @@ I'm going to record the original number of observations in `n`. We'll come back
Also, you can see that heights have been rounded to the nearest inch, so using boxplots will make it easier to see the pattern. We'll also remove the very tall and very short people so we can focus on the most typical heights:
```{r}
heights <- heights %>% filter(between(height, 59, 78))
heights <- heights |> filter(between(height, 59, 78))
nrow(heights) / n
ggplot(heights, aes(height, income, group = height)) +
@@ -107,8 +107,8 @@ What happens if we also include `sex` in the model?
```{r}
h2 <- lm(income ~ height * sex, data = heights)
grid <- heights %>%
expand(height, sex) %>%
grid <- heights |>
expand(height, sex) |>
add_predictions(h2, "income")
ggplot(heights, aes(height, income)) +
@@ -123,8 +123,8 @@ Need to comment about predictions for tall women and short men - there is not a
```{r}
h3 <- lm(income ~ height + sex, data = heights)
grid <- heights %>%
expand(height, sex) %>%
grid <- heights |>
expand(height, sex) |>
gather_predictions(h2, h3)
ggplot(grid, aes(height, pred, colour = sex)) +
@@ -140,7 +140,7 @@ But before we add a variable to our model, we need to do a little EDA + cleaning
```{r}
ggplot(heights, aes(education)) + geom_bar()
heights_ed <- heights %>% filter(education >= 12)
heights_ed <- heights |> filter(education >= 12)
nrow(heights) / n
```
@@ -154,8 +154,8 @@ he2 <- lm(income ~ height * education, data = heights_ed)
How can we visualise the results of this model? One way is to think about it as a surface: we have a 2d grid of height and education, and each point on that grid gets a predicted income.
```{r}
grid <- heights_ed %>%
expand(height, education) %>%
grid <- heights_ed |>
expand(height, education) |>
gather_predictions(he1, he2)
ggplot(grid, aes(height, education, fill = pred)) +
@@ -177,21 +177,21 @@ ggplot(grid, aes(education, pred, group = height)) +
One of the big advantages of `+` over `*` is that, because the terms are independent, we can display them using two simple plots instead of one complex plot:
```{r}
heights_ed %>%
heights_ed |>
expand(
height = seq_range(height, 10),
education = mean(education, na.rm = TRUE)
) %>%
add_predictions(he1, "income") %>%
) |>
add_predictions(he1, "income") |>
ggplot(aes(height, income)) +
geom_line()
heights_ed %>%
heights_ed |>
expand(
height = mean(height, na.rm = TRUE),
education = seq_range(education, 10)
) %>%
add_predictions(he1, "income") %>%
) |>
add_predictions(he1, "income") |>
ggplot(aes(education, income)) +
geom_line()
```
@@ -226,8 +226,8 @@ tidy(hes)
```
```{r}
heights %>%
group_by(sex) %>%
heights |>
group_by(sex) |>
do(glance(lm(income ~ height, data = .)))
```
@@ -252,9 +252,9 @@ One way to introduce non-linearity into our model is to use transformed variants
mod_e1 <- lm(income ~ education, data = heights_ed)
mod_e2 <- lm(income ~ education + I(education ^ 2) + I(education ^ 3), data = heights_ed)
heights_ed %>%
expand(education) %>%
gather_predictions(mod_e1, mod_e2) %>%
heights_ed |>
expand(education) |>
gather_predictions(mod_e1, mod_e2) |>
ggplot(aes(education, pred, colour = model)) +
geom_point() +
geom_line()
@@ -267,9 +267,9 @@ mod_e1 <- lm(income ~ education, data = heights_ed)
mod_e2 <- lm(income ~ poly(education, 2), data = heights_ed)
mod_e3 <- lm(income ~ poly(education, 3), data = heights_ed)
heights_ed %>%
expand(education) %>%
gather_predictions(mod_e1, mod_e2, mod_e3) %>%
heights_ed |>
expand(education) |>
gather_predictions(mod_e1, mod_e2, mod_e3) |>
ggplot(aes(education, pred, colour = model)) +
geom_point() +
geom_line()
@@ -278,8 +278,8 @@ heights_ed %>%
However, there's one major problem with using `poly()`: outside the range of the data, polynomials rapidly shoot off to positive or negative infinity.
```{r}
tibble(education = seq(5, 25)) %>%
gather_predictions(mod_e1, mod_e2, mod_e3) %>%
tibble(education = seq(5, 25)) |>
gather_predictions(mod_e1, mod_e2, mod_e3) |>
ggplot(aes(education, pred, colour = model)) +
geom_line()
```
@@ -292,8 +292,8 @@ mod_e1 <- lm(income ~ education, data = heights_ed)
mod_e2 <- lm(income ~ ns(education, 2), data = heights_ed)
mod_e3 <- lm(income ~ ns(education, 3), data = heights_ed)
tibble(education = seq(5, 25)) %>%
gather_predictions(mod_e1, mod_e2, mod_e3) %>%
tibble(education = seq(5, 25)) |>
gather_predictions(mod_e1, mod_e2, mod_e3) |>
ggplot(aes(education, pred, colour = model)) +
geom_line()
```

View File

@@ -65,10 +65,10 @@ If you're unlucky, you may need to repeat this procedure.)
```{r}
tibble::tibble(
i = seq_along(issues),
names = issues %>% map(names)
) %>%
tidyr::unnest(names) %>%
table() %>%
names = issues |> map(names)
) |>
tidyr::unnest(names) |>
table() |>
t()
```
@@ -80,9 +80,9 @@ To work with this sort of data, you typically want to turn it into a data frame
```{r}
issues %>% map_int("id")
issues %>% map_lgl("locked")
issues %>% map_chr("state")
issues |> map_int("id")
issues |> map_lgl("locked")
issues |> map_chr("state")
```
You can use the same technique to extract more deeply nested structure.
@@ -90,23 +90,23 @@ For example, imagine you want to extract the name and id of the user.
You could do that in two steps:
```{r}
users <- issues %>% map("user")
users %>% map_chr("login")
users %>% map_int("id")
users <- issues |> map("user")
users |> map_chr("login")
users |> map_int("id")
```
But by supplying a character *vector* to `map_*`, you can do it in one:
```{r}
issues %>% map_chr(c("user", "login"))
issues %>% map_int(c("user", "id"))
issues |> map_chr(c("user", "login"))
issues |> map_int(c("user", "id"))
```
What happens if that path is missing in some of the elements?
For example, let's try to extract the HTML URL of the pull request:
```{r, error = TRUE}
issues %>% map_chr(c("pull_request", "html_url"))
issues |> map_chr(c("pull_request", "html_url"))
```
Unfortunately that doesn't work.
@@ -114,7 +114,7 @@ Whenever you see an error from purrr complaining about the "type" of the result,
You can diagnose the problem more easily if you use `map()`:
```{r}
issues %>% map(c("pull_request", "html_url"))
issues |> map(c("pull_request", "html_url"))
```
To get the results into a character vector, we need to tell purrr what it should change `NULL` to.
@@ -122,7 +122,7 @@ You can do that with the `.null` argument.
The most common value to use is `NA`:
```{r}
issues %>% map_chr(c("pull_request", "html_url"), .null = NA)
issues |> map_chr(c("pull_request", "html_url"), .null = NA)
```
(You might wonder why that isn't the default value, since it's so useful. Well, if it were the default, you'd never get an error message if you had a typo in the names. You'd just get a vector of missing values, which would be annoying to debug because it's a silent failure.)
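A minimal sketch of that silent failure, using the `issues` data from above with a deliberately misspelled key (`"htlm_url"` is a hypothetical typo):

```{r}
# With .null = NA the typo goes unnoticed: every element returns NA
# instead of raising an error about the unknown name.
issues |> map_chr(c("pull_request", "htlm_url"), .null = NA)
```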
@@ -130,7 +130,7 @@ issues %>% map_chr(c("pull_request", "html_url"), .null = NA)
It's possible to mix positional and named indexing by using a list:
```{r}
issues %>% map_chr(list("pull_request", 1), .null = NA)
issues |> map_chr(list("pull_request", 1), .null = NA)
```
## Removing a level of hierarchy
@@ -170,8 +170,8 @@ x <- list(
x = list(a = 1, b = 3, c = 5),
y = list(a = 2, b = 4, c = 6)
)
x %>% str()
x %>% transpose() %>% str()
x |> str()
x |> transpose() |> str()
```
Graphically, this looks like:
@@ -193,7 +193,7 @@ Many JSON APIs represent data frames in a row-based format, rather than R's colu
```{r}
df <- tibble::tibble(x = 1:3, y = c("a", "b", "c"))
df %>% transpose() %>% str()
df |> transpose() |> str()
```
## Turning lists into data frames
@@ -211,7 +211,7 @@ df %>% transpose() %>% str()
```{r, eval = FALSE}
files <- dir("data", pattern = "\\.csv$")
files %>%
set_names(., basename(.)) %>%
files |>
set_names(., basename(.)) |>
map_df(safely(readr::read_csv), .id = "filename")
```

View File

@@ -101,7 +101,7 @@ df <- tibble(
y = true_model(x)
)
df %>%
df |>
ggplot(aes(x, y)) +
geom_point()
```
@@ -117,12 +117,12 @@ my_model <- function(df) {
mod <- my_model(df)
rmse(mod, df)
grid <- df %>%
grid <- df |>
expand(x = seq_range(x, 50))
preds <- grid %>%
preds <- grid |>
add_predictions(mod, var = "y")
df %>%
df |>
ggplot(aes(x, y)) +
geom_line(data = preds) +
geom_point()
@@ -148,7 +148,7 @@ models <- tibble(
rmse = map2_dbl(mod, list(df), rmse)
)
models %>%
models |>
ggplot(aes(n, rmse)) +
geom_line(colour = "grey70") +
geom_point(size = 3)
@@ -161,14 +161,14 @@ There are two approaches to help you get around this problem.
I'll introduce them briefly here, and then we'll go into more depth in the following sections.
```{r}
boot <- bootstrap(df, 100) %>%
boot <- bootstrap(df, 100) |>
mutate(
mod = map(strap, my_model),
pred = map2(list(grid), mod, add_predictions)
)
boot %>%
unnest(pred) %>%
boot |>
unnest(pred) |>
ggplot(aes(x, pred, group = .id)) +
geom_line(alpha = 1/3)
```
@@ -191,7 +191,7 @@ The following code generates 100 test-training splits, holding out 20% of the da
We then fit a model to the training set, and evaluate the error on the test set:
```{r}
cv <- crossv_mc(df, 100) %>%
cv <- crossv_mc(df, 100) |>
mutate(
mod = map(train, my_model),
rmse = map2_dbl(mod, test, rmse)
@@ -203,7 +203,7 @@ Obviously, a plot is going to help us see the distribution more easily.
I've added our original estimate of the model error as a white vertical line (where the same dataset is used for both training and testing), and you can see it's very optimistic.
```{r}
cv %>%
cv |>
ggplot(aes(rmse)) +
geom_ref_line(v = rmse(mod, df)) +
geom_freqpoly(binwidth = 0.2) +
@@ -215,8 +215,8 @@ These represent samples where we ended up with a few cases, all with low values
Let's take a look:
```{r}
filter(cv, rmse > 1.5) %>%
unnest(map(train, as.data.frame)) %>%
filter(cv, rmse > 1.5) |>
unnest(map(train, as.data.frame)) |>
ggplot(aes(x, .id)) +
geom_point() +
xlim(0, 1)

View File

@@ -88,7 +88,7 @@ One easy place to start is to find the vertical distance between each point and
(Note that I've shifted the x values slightly so you can see the individual distances.)
```{r, echo = FALSE}
dist1 <- sim1 %>%
dist1 <- sim1 |>
mutate(
dodge = rep(c(-1, 0, 1) / 20, 10),
x1 = x + dodge,
@@ -137,7 +137,7 @@ sim1_dist <- function(a1, a2) {
measure_distance(c(a1, a2), sim1)
}
models <- models %>%
models <- models |>
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
```
@@ -171,10 +171,10 @@ I picked the parameters of the grid roughly by looking at where the best models
grid <- expand.grid(
a1 = seq(-5, 20, length = 25),
a2 = seq(1, 3, length = 25)
) %>%
) |>
mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
grid %>%
grid |>
ggplot(aes(a1, a2)) +
geom_point(data = filter(grid, rank(dist) <= 10), size = 4, colour = "red") +
geom_point(aes(colour = -dist))
@@ -282,7 +282,7 @@ The easiest way to do that is to use `modelr::data_grid()`.
Its first argument is a data frame, and for each subsequent argument it finds the unique values and then generates all combinations:
```{r}
grid <- sim1 %>%
grid <- sim1 |>
data_grid(x)
grid
```
@@ -294,7 +294,7 @@ We'll use `modelr::add_predictions()` which takes a data frame and a model.
It adds the predictions from the model to a new column in the data frame:
```{r}
grid <- grid %>%
grid <- grid |>
add_predictions(sim1_mod)
grid
```
@@ -324,7 +324,7 @@ Note, however, that we use the original dataset, not a manufactured grid.
This is because to compute residuals we need actual y values.
```{r}
sim1 <- sim1 %>%
sim1 <- sim1 |>
add_residuals(sim1_mod)
sim1
```
@@ -444,8 +444,8 @@ We can fit a model to it, and generate predictions:
```{r}
mod2 <- lm(y ~ x, data = sim2)
grid <- sim2 %>%
data_grid(x) %>%
grid <- sim2 |>
data_grid(x) |>
add_predictions(mod2)
grid
```
@@ -463,7 +463,7 @@ You can't make predictions about levels that you didn't observe.
Sometimes you'll do this by accident so it's good to recognise this error message:
```{r, error = TRUE}
tibble(x = "e") %>%
tibble(x = "e") |>
add_predictions(mod2)
```
@@ -501,8 +501,8 @@ To visualise these models we need two new tricks:
Together this gives us:
```{r}
grid <- sim3 %>%
data_grid(x1, x2) %>%
grid <- sim3 |>
data_grid(x1, x2) |>
gather_predictions(mod1, mod2)
grid
```
@@ -524,7 +524,7 @@ We can take a look at the residuals.
Here I've facetted by both model and `x2` because it makes it easier to see the pattern within each group.
```{r}
sim3 <- sim3 %>%
sim3 <- sim3 |>
gather_residuals(mod1, mod2)
ggplot(sim3, aes(x1, resid, colour = x2)) +
@@ -547,11 +547,11 @@ Initially things proceed almost identically to the previous example:
mod1 <- lm(y ~ x1 + x2, data = sim4)
mod2 <- lm(y ~ x1 * x2, data = sim4)
grid <- sim4 %>%
grid <- sim4 |>
data_grid(
x1 = seq_range(x1, 5),
x2 = seq_range(x2, 5)
) %>%
) |>
gather_predictions(mod1, mod2)
grid
```
@@ -687,8 +687,8 @@ mod3 <- lm(y ~ ns(x, 3), data = sim5)
mod4 <- lm(y ~ ns(x, 4), data = sim5)
mod5 <- lm(y ~ ns(x, 5), data = sim5)
grid <- sim5 %>%
data_grid(x = seq_range(x, n = 50, expand = 0.1)) %>%
grid <- sim5 |>
data_grid(x = seq_range(x, n = 50, expand = 0.1)) |>
gather_predictions(mod1, mod2, mod3, mod4, mod5, .pred = "y")
ggplot(sim5, aes(x, y)) +

View File

@@ -71,8 +71,8 @@ But first, let's make a couple of tweaks to the diamonds dataset to make it easie
2. Log-transform the carat and price variables.
```{r}
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
diamonds2 <- diamonds |>
filter(carat <= 2.5) |>
mutate(lprice = log2(price), lcarat = log2(carat))
```
@@ -95,10 +95,10 @@ Then we look at what the model tells us about the data.
Note that I back transform the predictions, undoing the log transformation, so I can overlay the predictions on the raw data:
```{r}
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
grid <- diamonds2 |>
data_grid(carat = seq_range(carat, 20)) |>
mutate(lcarat = log2(carat)) |>
add_predictions(mod_diamond, "lprice") |>
mutate(price = 2 ^ lprice)
ggplot(diamonds2, aes(carat, price)) +
@@ -113,7 +113,7 @@ This is probably because no diamond in this dataset costs more than \$19,000.
Now we can look at the residuals, which verifies that we've successfully removed the strong linear pattern:
```{r}
diamonds2 <- diamonds2 %>%
diamonds2 <- diamonds2 |>
add_residuals(mod_diamond, "lresid")
ggplot(diamonds2, aes(lcarat, lresid)) +
@@ -147,8 +147,8 @@ Fortunately, they're currently all independent which means that we can plot them
To make the process a little easier, we're going to use the `.model` argument to `data_grid()`:
```{r}
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
grid <- diamonds2 |>
data_grid(cut, .model = mod_diamond2) |>
add_predictions(mod_diamond2)
grid
@@ -160,7 +160,7 @@ If the model needs variables that you haven't explicitly supplied, `data_grid()`
For continuous variables, it uses the median, and for categorical variables it uses the most common value (or values, if there's a tie).
```{r}
diamonds2 <- diamonds2 %>%
diamonds2 <- diamonds2 |>
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
@@ -171,11 +171,11 @@ This plot indicates that there are some diamonds with quite large residuals - re
It's often useful to look at unusual values individually:
```{r}
diamonds2 %>%
filter(abs(lresid2) > 1) %>%
add_predictions(mod_diamond2) %>%
mutate(pred = round(2 ^ pred)) %>%
select(price, pred, carat:table, x:z) %>%
diamonds2 |>
filter(abs(lresid2) > 1) |>
add_predictions(mod_diamond2) |>
mutate(pred = round(2 ^ pred)) |>
select(price, pred, carat:table, x:z) |>
arrange(price)
```
@@ -203,9 +203,9 @@ This is a really small dataset --- only 365 rows and 2 columns --- and we're not
Let's get started by counting the number of flights per day and visualising it with ggplot2.
```{r}
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
daily <- flights |>
mutate(date = make_date(year, month, day)) |>
group_by(date) |>
summarise(n = n())
daily
@@ -219,7 +219,7 @@ Understanding the long-term trend is challenging because there's a very strong d
Let's start by looking at the distribution of flight numbers by day-of-week:
```{r}
daily <- daily %>%
daily <- daily |>
mutate(wday = wday(date, label = TRUE))
ggplot(daily, aes(wday, n)) +
geom_boxplot()
@@ -234,8 +234,8 @@ First, we fit the model, and display its predictions overlaid on the original da
```{r}
mod <- lm(n ~ wday, data = daily)
grid <- daily %>%
data_grid(wday) %>%
grid <- daily |>
data_grid(wday) |>
add_predictions(mod, "n")
ggplot(daily, aes(wday, n)) +
@@ -246,9 +246,9 @@ ggplot(daily, aes(wday, n)) +
Next we compute and visualise the residuals:
```{r}
daily <- daily %>%
daily <- daily |>
add_residuals(mod)
daily %>%
daily |>
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line()
@@ -272,7 +272,7 @@ This plot is useful because now that we've removed much of the large day-of-week
2. There are some days with far fewer flights than expected:
```{r}
daily %>%
daily |>
filter(resid < -100)
```
@@ -284,7 +284,7 @@ This plot is useful because now that we've removed much of the large day-of-week
We can highlight that trend with `geom_smooth()`:
```{r}
daily %>%
daily |>
ggplot(aes(date, resid)) +
geom_ref_line(h = 0) +
geom_line(colour = "grey50") +
@@ -301,8 +301,8 @@ Let's first tackle our failure to accurately predict the number of flights on Sa
A good place to start is to go back to the raw numbers, focussing on Saturdays:
```{r}
daily %>%
filter(wday == "Sat") %>%
daily |>
filter(wday == "Sat") |>
ggplot(aes(date, n)) +
geom_point() +
geom_line() +
@@ -329,11 +329,11 @@ term <- function(date) {
)
}
daily <- daily %>%
daily <- daily |>
mutate(term = term(date))
daily %>%
filter(wday == "Sat") %>%
daily |>
filter(wday == "Sat") |>
ggplot(aes(date, n, colour = term)) +
geom_point(alpha = 1/3) +
geom_line() +
@@ -345,7 +345,7 @@ daily %>%
It's useful to see how this new variable affects the other days of the week:
```{r}
daily %>%
daily |>
ggplot(aes(wday, n, colour = term)) +
geom_boxplot()
```
@@ -357,8 +357,8 @@ This improves our model, but not as much as we might hope:
mod1 <- lm(n ~ wday, data = daily)
mod2 <- lm(n ~ wday * term, data = daily)
daily %>%
gather_residuals(without_term = mod1, with_term = mod2) %>%
daily |>
gather_residuals(without_term = mod1, with_term = mod2) |>
ggplot(aes(date, resid, colour = model)) +
geom_line(alpha = 0.75)
```
@@ -366,8 +366,8 @@ daily %>%
We can see the problem by overlaying the predictions from the model onto the raw data:
```{r}
grid <- daily %>%
data_grid(wday, term) %>%
grid <- daily |>
data_grid(wday, term) |>
add_predictions(mod2, "n")
ggplot(daily, aes(wday, n)) +
@@ -383,8 +383,8 @@ This greatly reduces the impact of the outliers on our estimates, and gives a mo
```{r, warn = FALSE}
mod3 <- MASS::rlm(n ~ wday * term, data = daily)
daily %>%
add_residuals(mod3, "resid") %>%
daily |>
add_residuals(mod3, "resid") |>
ggplot(aes(date, resid)) +
geom_hline(yintercept = 0, size = 2, colour = "white") +
geom_line()
@@ -399,7 +399,7 @@ For example, we could write:
```{r}
compute_vars <- function(data) {
data %>%
data |>
mutate(
term = term(date),
wday = wday(date, label = TRUE)
@@ -430,9 +430,9 @@ A simple linear trend isn't adequate, so we could try using a natural spline to
library(splines)
mod <- MASS::rlm(n ~ wday * ns(date, 5), data = daily)
daily %>%
data_grid(wday, date = seq_range(date, n = 13)) %>%
add_predictions(mod) %>%
daily |>
data_grid(wday, date = seq_range(date, n = 13)) |>
add_predictions(mod) |>
ggplot(aes(date, pred, colour = wday)) +
geom_line() +
geom_point()
@@ -451,7 +451,7 @@ It's a good sign when you get the same signal from different approaches.
How would these days generalise to another year?
```{r}
daily %>%
daily |>
slice_max(n = 3, resid)
```

View File

@@ -60,7 +60,7 @@ In this case study, we're going to focus on just three variables to answer the q
A good place to start is with a plot:
```{r}
gapminder %>%
gapminder |>
ggplot(aes(year, lifeExp, group = country)) +
geom_line(alpha = 1/3)
```
You already know how to do that for a single country:
```{r, out.width = "33%", fig.asp = 1, fig.width = 3, fig.align='default'}
nz <- filter(gapminder, country == "New Zealand")
nz %>%
nz |>
ggplot(aes(year, lifeExp)) +
geom_line() +
ggtitle("Full data = ")
nz_mod <- lm(lifeExp ~ year, data = nz)
nz %>%
add_predictions(nz_mod) %>%
nz |>
add_predictions(nz_mod) |>
ggplot(aes(year, pred)) +
geom_line() +
ggtitle("Linear trend + ")
nz %>%
add_residuals(nz_mod) %>%
nz |>
add_residuals(nz_mod) |>
ggplot(aes(year, resid)) +
geom_hline(yintercept = 0, colour = "white", size = 3) +
geom_line() +
@@ -111,8 +111,8 @@ To do that, we need a new data structure: the **nested data frame**.
To create a nested data frame we start with a grouped data frame, and "nest" it:
```{r}
by_country <- gapminder %>%
group_by(country, continent) %>%
by_country <- gapminder |>
group_by(country, continent) |>
nest()
by_country
@@ -163,7 +163,7 @@ In other words, instead of creating a new object in the global environment, we'r
That's a job for `dplyr::mutate()`:
```{r}
by_country <- by_country %>%
by_country <- by_country |>
mutate(model = map(data, country_model))
by_country
```
@@ -172,9 +172,9 @@ This has a big advantage: because all the related objects are stored together, y
The semantics of the data frame takes care of that for you:
```{r}
by_country %>%
by_country |>
filter(continent == "Europe")
by_country %>%
by_country |>
arrange(continent, country)
```
@@ -188,7 +188,7 @@ Now we have 142 data frames and 142 models.
To compute the residuals, we need to call `add_residuals()` with each model-data pair:
```{r}
by_country <- by_country %>%
by_country <- by_country |>
mutate(
resids = map2(data, model, add_residuals)
)
@@ -209,7 +209,7 @@ Note that each regular column is repeated once for each row of the nested tibble
Now that we have a regular data frame, we can plot the residuals:
```{r}
resids %>%
resids |>
ggplot(aes(year, resid)) +
geom_line(aes(group = country), alpha = 1 / 3) +
geom_smooth(se = FALSE)
@@ -219,7 +219,7 @@ resids %>%
Facetting by continent is particularly revealing:
```{r}
resids %>%
resids |>
ggplot(aes(year, resid, group = country)) +
geom_line(alpha = 1 / 3) +
facet_wrap(~continent)
@@ -245,9 +245,9 @@ broom::glance(nz_mod)
We can use `mutate()` and `unnest()` to create a data frame with a row for each country:
```{r}
glance <- by_country %>%
mutate(glance = map(model, broom::glance)) %>%
select(country, continent, glance) %>%
glance <- by_country |>
mutate(glance = map(model, broom::glance)) |>
select(country, continent, glance) |>
unnest(glance)
glance
```
@@ -257,7 +257,7 @@ glance
With this data frame in hand, we can start to look for models that don't fit well:
```{r}
glance %>%
glance |>
arrange(r.squared)
```
@@ -266,7 +266,7 @@ Let's double check that with a plot.
Here we have a relatively small number of observations and a discrete variable, so `geom_jitter()` is effective:
```{r}
glance %>%
glance |>
ggplot(aes(continent, r.squared)) +
geom_jitter(width = 0.5)
```
@@ -276,8 +276,8 @@ We could pull out the countries with particularly bad $R^2$ and plot the data:
```{r}
bad_fit <- filter(glance, r.squared < 0.25)
gapminder %>%
semi_join(bad_fit, by = "country") %>%
gapminder |>
semi_join(bad_fit, by = "country") |>
ggplot(aes(year, lifeExp, colour = country)) +
geom_line()
```
@@ -377,15 +377,15 @@ So far you've seen how to use it with a grouped data frame.
When applied to a grouped data frame, `nest()` keeps the grouping columns as is, and bundles everything else into the list-column:
```{r}
gapminder %>%
group_by(country, continent) %>%
gapminder |>
group_by(country, continent) |>
nest()
```
You can also use it on an ungrouped data frame, specifying which columns you want to nest:
```{r}
gapminder %>%
gapminder |>
nest(data = c(year:gdpPercap))
```
@@ -402,15 +402,15 @@ df <- tribble(
"d,e,f,g"
)
df %>%
df |>
mutate(x2 = stringr::str_split(x1, ","))
```
`unnest()` knows how to handle these lists of vectors:
```{r}
df %>%
mutate(x2 = stringr::str_split(x1, ",")) %>%
df |>
mutate(x2 = stringr::str_split(x1, ",")) |>
unnest(x2)
```
@@ -427,7 +427,7 @@ sim <- tribble(
"rpois", list(lambda = 10)
)
sim %>%
sim |>
mutate(sims = invoke_map(f, params, n = 10))
```
@@ -440,8 +440,8 @@ One restriction of `summarise()` is that it only works with summary functions th
That means that you can't use it with functions like `quantile()` that return a vector of arbitrary length:
```{r, error = TRUE}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise(q = quantile(mpg))
```
You can, however, wrap the result in a list!
This obeys the contract of `summarise()`, because each summary is now a list (a vector) of length 1.
```{r}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise(q = list(quantile(mpg)))
```
@@ -458,9 +458,9 @@ To make useful results with unnest, you'll also need to capture the probabilitie
```{r}
probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
mtcars %>%
group_by(cyl) %>%
summarise(p = list(probs), q = list(quantile(mpg, probs))) %>%
mtcars |>
group_by(cyl) |>
summarise(p = list(probs), q = list(quantile(mpg, probs))) |>
unnest(c(p, q))
```
@@ -486,7 +486,7 @@ The advantage of this structure is that it generalises in a straightforward way
Now if you want to iterate over names and values in parallel, you can use `map2()`:
```{r}
df %>%
df |>
mutate(
smry = map2_chr(name, value, ~ stringr::str_c(.x, ": ", .y[1]))
)
@@ -503,9 +503,9 @@ df %>%
Why isn't that helpful here?
```{r}
mtcars %>%
group_by(cyl) %>%
summarise(q = list(quantile(mpg))) %>%
mtcars |>
group_by(cyl) |>
summarise(q = list(quantile(mpg))) |>
unnest(q)
```
@@ -513,8 +513,8 @@ df %>%
Why might it be useful?
```{r, eval = FALSE}
mtcars %>%
group_by(cyl) %>%
mtcars |>
group_by(cyl) |>
summarise_all(list(list))
```
@@ -542,7 +542,7 @@ df <- tribble(
runif(5)
)
df %>% mutate(
df |> mutate(
type = map_chr(x, typeof),
length = map_int(x, length)
)
@@ -561,7 +561,7 @@ df <- tribble(
list(a = 1, b = 2),
list(a = 2, c = 4)
)
df %>% mutate(
df |> mutate(
a = map_dbl(x, "a"),
b = map_dbl(x, "b", .null = NA_real_)
)
@@ -573,7 +573,7 @@ df %>% mutate(
For example, in the following very simple example we repeat the first row four times (because the first element of `y` has length four), and the second row once:
```{r}
tibble(x = 1:2, y = list(1:4, 1)) %>% unnest(y)
tibble(x = 1:2, y = list(1:4, 1)) |> unnest(y)
```
This means that you can't simultaneously unnest two columns that contain a different number of elements:
@@ -587,7 +587,7 @@ df1 <- tribble(
2, "c", 3
)
df1
df1 %>% unnest(c(y, z))
df1 |> unnest(c(y, z))
# Doesn't work because y and z have different number of elements
df2 <- tribble(
@@ -596,7 +596,7 @@ df2 <- tribble(
2, c("b", "c"), 3
)
df2
df2 %>% unnest(c(y, z))
df2 |> unnest(c(y, z))
```
The same principle applies when unnesting list-columns of data frames.
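As a minimal sketch of that principle (`df3` below is hypothetical, and assumes tibble and tidyr are attached as elsewhere in this chapter): each row's data frame is unnested independently, so rows may contribute different numbers of output rows, but within a row the pieces must be compatible.

```{r}
# Each row's data frame is stacked, repeating the other columns:
# row 1 contributes two rows, row 2 contributes one.
df3 <- tibble(
  x = 1:2,
  y = list(tibble(a = 1:2), tibble(a = 3))
)
df3 |> unnest(y)
```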

View File

@@ -109,9 +109,9 @@ df <- data.frame(
)
df[1:4] %>% sapply(class) %>% str()
df[1:2] %>% sapply(class) %>% str()
df[3:4] %>% sapply(class) %>% str()
df[1:4] |> sapply(class) |> str()
df[1:2] |> sapply(class) |> str()
df[3:4] |> sapply(class) |> str()
```
In the next chapter, you'll learn about the purrr package, which provides a variety of alternatives. In this case, you could use `map_chr()`, which always returns a character vector: if it can't, it will throw an error. Another option is the base `vapply()` function, which takes a third argument indicating what the output should look like.
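A brief sketch of those alternatives, reusing `df` from above. Whether these calls error depends on which columns have multi-element classes; making that failure loud is exactly the point:

```{r, error = TRUE}
# map_chr() requires each result to be a single string, so a length-2
# class() (e.g. for a date-time column) errors instead of silently
# producing a list or matrix.
df[1:4] |> purrr::map_chr(class)

# vapply()'s FUN.VALUE template enforces the same contract in base R.
df[1:4] |> vapply(class, FUN.VALUE = character(1))
```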