parent
1729264d91
commit
daaa861f74
|
@ -548,9 +548,11 @@ There are a few other general strategies to help you parse files:
|
||||||
frame.
|
frame.
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
df <- tibble(
|
df <- tribble(
|
||||||
x = c("1", "2", "3"),
|
~x, ~y,
|
||||||
y = c("1.21", "2.32", "4.56")
|
"1", "1.21",
|
||||||
|
"2", "2.32",
|
||||||
|
"3", "4.56"
|
||||||
)
|
)
|
||||||
df
|
df
|
||||||
|
|
||||||
|
|
|
@ -542,7 +542,12 @@ You can also perform transformations inside the model formula. For example, `log
|
||||||
Again, if you get confused about what your model is doing, you can always use `model_matrix()` to see exactly what equation `lm()` is fitting:
|
Again, if you get confused about what your model is doing, you can always use `model_matrix()` to see exactly what equation `lm()` is fitting:
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
df <- tibble(y = 1:3, x = 1:3)
|
df <- tribble(
|
||||||
|
~y, ~x,
|
||||||
|
1, 1,
|
||||||
|
2, 2,
|
||||||
|
3, 3
|
||||||
|
)
|
||||||
model_matrix(df, y ~ x^2 + x)
|
model_matrix(df, y ~ x^2 + x)
|
||||||
model_matrix(df, y ~ I(x^2) + x)
|
model_matrix(df, y ~ I(x^2) + x)
|
||||||
```
|
```
|
||||||
|
|
|
@ -295,6 +295,16 @@ tibble(
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
It's even easier with `tribble()` as it can automatically work out that you need a list:
|
||||||
|
|
||||||
|
```{r}
|
||||||
|
tribble(
|
||||||
|
~x, ~y,
|
||||||
|
1:3, "1, 2",
|
||||||
|
3:5, "3, 4, 5"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
List-columns are often most useful as an intermediate data structure. They're hard to work with directly, because most R functions work with atomic vectors or data frames, but the advantage of keeping related items together in a data frame is worth a little hassle.
|
List-columns are often most useful as an intermediate data structure. They're hard to work with directly, because most R functions work with atomic vectors or data frames, but the advantage of keeping related items together in a data frame is worth a little hassle.
|
||||||
|
|
||||||
Generally there are three parts of an effective list-column pipeline:
|
Generally there are three parts of an effective list-column pipeline:
|
||||||
|
@ -349,7 +359,11 @@ gapminder %>%
|
||||||
Some useful functions take an atomic vector and return a list. For example, in [strings] you learned about `stringr::str_split()` which takes a character vector and returns a list of character vectors. If you use that inside mutate, you'll get a list-column:
|
Some useful functions take an atomic vector and return a list. For example, in [strings] you learned about `stringr::str_split()` which takes a character vector and returns a list of character vectors. If you use that inside mutate, you'll get a list-column:
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
df <- tibble(x1 = c("a,b,c", "d,e,f,g"))
|
df <- tribble(
|
||||||
|
~x1,
|
||||||
|
"a,b,c",
|
||||||
|
"d,e,f,g"
|
||||||
|
)
|
||||||
|
|
||||||
df %>%
|
df %>%
|
||||||
mutate(x2 = stringr::str_split(x1, ","))
|
mutate(x2 = stringr::str_split(x1, ","))
|
||||||
|
@ -478,13 +492,12 @@ These are described in more detail below.
|
||||||
If you can reduce your list column to an atomic vector then it will be a regular column. For example, you can always summarise an object with its type and length, so this code will work regardless of what sort of list-column you have:
|
If you can reduce your list column to an atomic vector then it will be a regular column. For example, you can always summarise an object with its type and length, so this code will work regardless of what sort of list-column you have:
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
df <- tibble(
|
df <- tribble(
|
||||||
x = list(
|
~x,
|
||||||
letters[1:5],
|
letters[1:5],
|
||||||
1:3,
|
1:3,
|
||||||
runif(5)
|
runif(5)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
df %>% mutate(
|
df %>% mutate(
|
||||||
type = map_chr(x, typeof),
|
type = map_chr(x, typeof),
|
||||||
|
@ -497,12 +510,11 @@ This is the same basic information that you get from the default tbl print metho
|
||||||
Don't forget about the `map_*()` shortcuts - you can use `map_chr(x, "apple")` to extract the string stored in `apple` for each element of `x`. This is useful for pulling apart nested lists into regular columns. Use the `.null` argument to provide a value to use if the element is missing (instead of returning `NULL`):
|
Don't forget about the `map_*()` shortcuts - you can use `map_chr(x, "apple")` to extract the string stored in `apple` for each element of `x`. This is useful for pulling apart nested lists into regular columns. Use the `.null` argument to provide a value to use if the element is missing (instead of returning `NULL`):
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
df <- tibble(
|
df <- tribble(
|
||||||
x = list(
|
~x,
|
||||||
list(a = 1, b = 2),
|
list(a = 1, b = 2),
|
||||||
list(a = 2, c = 4)
|
list(a = 2, c = 4)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
df %>% mutate(
|
df %>% mutate(
|
||||||
a = map_dbl(x, "a"),
|
a = map_dbl(x, "a"),
|
||||||
b = map_dbl(x, "b", .null = NA_real_)
|
b = map_dbl(x, "b", .null = NA_real_)
|
||||||
|
@ -522,19 +534,19 @@ This means that you can't simultaneously unnest two columns that contain differe
|
||||||
```{r, error = TRUE}
|
```{r, error = TRUE}
|
||||||
# Ok, because y and z have the same number of elements in
|
# Ok, because y and z have the same number of elements in
|
||||||
# every row
|
# every row
|
||||||
df1 <- tibble(
|
df1 <- tribble(
|
||||||
x = 1:2,
|
~x, ~y, ~z,
|
||||||
y = list(c("a", "b"), "c"),
|
1, c("a", "b"), 1:2,
|
||||||
z = list(1:2, 3)
|
2, "c", 3
|
||||||
)
|
)
|
||||||
df1
|
df1
|
||||||
df1 %>% unnest(y, z)
|
df1 %>% unnest(y, z)
|
||||||
|
|
||||||
# Doesn't work because y and z have different numbers of elements
|
# Doesn't work because y and z have different numbers of elements
|
||||||
df2 <- tibble(
|
df2 <- tribble(
|
||||||
x = 1:2,
|
~x, ~y, ~z,
|
||||||
y = list("a", c("b", "c")),
|
1, "a", 1:2,
|
||||||
z = list(1:2, 3)
|
2, c("b", "c"), 3
|
||||||
)
|
)
|
||||||
df2
|
df2
|
||||||
df2 %>% unnest(y, z)
|
df2 %>% unnest(y, z)
|
||||||
|
|
|
@ -206,8 +206,18 @@ To help you learn how joins work, I'm going to use a visual representation:
|
||||||
knitr::include_graphics("diagrams/join-setup.png")
|
knitr::include_graphics("diagrams/join-setup.png")
|
||||||
```
|
```
|
||||||
```{r}
|
```{r}
|
||||||
(x <- tibble(key = c(1, 2, 3), val_x = c("x1", "x2", "x3")))
|
x <- tribble(
|
||||||
(y <- tibble(key = c(1, 2, 4), val_y = c("y1", "y2", "y3")))
|
~key, ~val_x,
|
||||||
|
1, "x1",
|
||||||
|
2, "x2",
|
||||||
|
3, "x3"
|
||||||
|
)
|
||||||
|
y <- tribble(
|
||||||
|
~key, ~val_y,
|
||||||
|
1, "y1",
|
||||||
|
2, "y2",
|
||||||
|
4, "y3"
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
The coloured column represents the "key" variable: these are used to match the rows between the tables. The grey column represents the "value" column that is carried along for the ride. In these examples I'll show a single key variable and single value variable, but the idea generalises in a straightforward way to multiple keys and multiple values.
|
The coloured column represents the "key" variable: these are used to match the rows between the tables. The grey column represents the "value" column that is carried along for the ride. In these examples I'll show a single key variable and single value variable, but the idea generalises in a straightforward way to multiple keys and multiple values.
|
||||||
|
@ -288,8 +298,18 @@ So far all the diagrams have assumed that the keys are unique. But that's not al
|
||||||
and a foreign key in `x`.
|
and a foreign key in `x`.
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
x <- tibble(key = c(1, 2, 2, 1), val_x = stringr::str_c("x", 1:4))
|
x <- tribble(
|
||||||
y <- tibble(key = 1:2, val_y = stringr::str_c("y", 1:2))
|
~key, ~val_x,
|
||||||
|
1, "x1",
|
||||||
|
2, "x2",
|
||||||
|
2, "x3",
|
||||||
|
1, "x4"
|
||||||
|
)
|
||||||
|
y <- tribble(
|
||||||
|
~key, ~val_y,
|
||||||
|
1, "y1",
|
||||||
|
2, "y2"
|
||||||
|
)
|
||||||
left_join(x, y, by = "key")
|
left_join(x, y, by = "key")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -302,8 +322,20 @@ So far all the diagrams have assumed that the keys are unique. But that's not al
|
||||||
```
|
```
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
x <- tibble(key = c(1, 2, 2, 3), val_x = stringr::str_c("x", 1:4))
|
x <- tribble(
|
||||||
y <- tibble(key = c(1, 2, 2, 3), val_y = stringr::str_c("y", 1:4))
|
~key, ~val_x,
|
||||||
|
1, "x1",
|
||||||
|
2, "x2",
|
||||||
|
2, "x3",
|
||||||
|
3, "x4"
|
||||||
|
)
|
||||||
|
y <- tribble(
|
||||||
|
~key, ~val_y,
|
||||||
|
1, "y1",
|
||||||
|
2, "y2",
|
||||||
|
2, "y3",
|
||||||
|
3, "y4"
|
||||||
|
)
|
||||||
left_join(x, y, by = "key")
|
left_join(x, y, by = "key")
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -543,8 +575,16 @@ The final type of two-table verb are the set operations. Generally, I use these
|
||||||
Given this simple data:
|
Given this simple data:
|
||||||
|
|
||||||
```{r}
|
```{r}
|
||||||
(df1 <- tibble(x = 1:2, y = c(1L, 1L)))
|
df1 <- tribble(
|
||||||
(df2 <- tibble(x = 1:2, y = 1:2))
|
~x, ~y,
|
||||||
|
1, 1,
|
||||||
|
2, 1
|
||||||
|
)
|
||||||
|
df2 <- tribble(
|
||||||
|
~x, ~y,
|
||||||
|
1, 1,
|
||||||
|
1, 2
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
The four possibilities are:
|
The four possibilities are:
|
||||||
|
|
|
@ -499,11 +499,12 @@ This works because every geom has a default stat; and every stat has a default g
|
||||||
is generated by counting rows.
|
is generated by counting rows.
|
||||||
|
|
||||||
```{r, warning = FALSE}
|
```{r, warning = FALSE}
|
||||||
demo <- tibble(
|
demo <- tribble(
|
||||||
a = c("bar_1", "bar_2", "bar_3"),
|
~a, ~b,
|
||||||
b = c(20, 30, 40)
|
"bar_1", 20,
|
||||||
|
"bar_2", 30,
|
||||||
|
"bar_3", 40
|
||||||
)
|
)
|
||||||
demo
|
|
||||||
|
|
||||||
ggplot(data = demo) +
|
ggplot(data = demo) +
|
||||||
geom_bar(mapping = aes(x = a, y = b), stat = "identity")
|
geom_bar(mapping = aes(x = a, y = b), stat = "identity")
|
||||||
|
|
Loading…
Reference in New Issue