render compile

This commit is contained in:
2026-05-21 01:33:30 +08:00
parent 2c2eaa0d0d
commit 204a12a58f
2379 changed files with 142545 additions and 814 deletions
+115
View File
@@ -0,0 +1,115 @@
---
title: "Lesson 8"
format: html
---
```{r}
# library() stops immediately when a package is missing; require() would only
# return FALSE and let a later line fail with a less helpful error.
library(tidyverse)
library(nycflights13)

# Remove the underscores from the column names of a small preview
# (columns 3 to 6, first three rows).
flights |>
  select(3:6) |>
  head(3) |>
  rename_with(\(nm) gsub("_", "", nm, fixed = TRUE))

# Upper-case only the 2nd to 4th column names of the same preview.
flights |>
  select(3:6) |>
  head(3) |>
  rename_with(toupper, .cols = 2:4)
# Exercise: flights on the 10th to 15th of each month with dep_delay > 100.
# Exercise: for each month, which origin has the largest total dep_delay?
flights |>
  group_by(month, origin) |>
  summarize(
    n = n(),
    # The question asks for the total delay, so add the delays up
    # instead of averaging them.
    total_dep_delay = sum(dep_delay, na.rm = TRUE),
    # Stay grouped by month so slice_max() picks one origin per month.
    .groups = "drop_last"
  ) |>
  slice_max(total_dep_delay) |>
  ungroup()
# arrange(month, origin, desc(total_dep_delay))
# For each month draw one random day, then three random flights on that day.

# Approach 1: collapse every day into one row (list-column ymddf), sample one
# day per month, expand the day again and sample three of its flights.
flights |>
  tidyr::nest(ymddf = -c(year, month, day)) |>
  group_by(year, month) |>
  slice_sample(n = 1) |>
  unnest(ymddf) |>
  group_by(year, month, day) |>
  slice_sample(n = 3) |>
  ungroup()

# Approach 2: sample the day the same way, but draw the three flights inside
# the list-column, so the result keeps one (nested) row per month.
flights |>
  tidyr::nest(ymddf = -c(year, month, day)) |>
  group_by(year, month) |>
  slice_sample(n = 1) |>
  mutate(
    ymddf = purrr::map(ymddf, \(daydf) slice_sample(daydf, n = 3))
  ) |>
  ungroup()
# For each month and origin, find the three flight numbers with the largest
# gap between their mean workday dep_delay and their mean weekend dep_delay.
flights |>
  mutate(
    # Build the date from its numeric parts; no string round-trip needed.
    date = make_date(year, month, day),
    # wday() counts from Sunday = 1 by default, so 2 to 6 is Monday to Friday.
    weekday = wday(date),
    isworkday = if_else(between(weekday, 2, 6), "Yes", "No")
  ) |>
  group_by(year, month, origin, flight, isworkday) |>
  summarize(
    mean_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop_last"
  ) |>
  tidyr::nest(diffdelaydf = c(isworkday, mean_delay)) |>
  # Keep only flights that have both a workday row and a weekend row.
  filter(purrr::map_int(diffdelaydf, nrow) > 1L) |>
  mutate(
    # Rows are ordered "No", "Yes", so diff() yields workday minus weekend.
    # NOTE(review): wrap in abs() if the direction of the gap should not matter.
    diffdelay = purrr::map_dbl(diffdelaydf, \(x) {
      x |>
        arrange(isworkday) |>
        pull(mean_delay) |>
        diff()
    })
  ) |>
  group_by(year, month, origin) |>
  slice_max(diffdelay, n = 3) |>
  ungroup()
# Day-of-week number for today.
wday(today())

# One random flight per month. The stray calls that used to sit here
# (a call to an undefined weekday() and a slice_sample() without data)
# raised errors and stopped the document from rendering.
flights |>
  group_by(month) |>
  slice_sample(n = 1) |>
  ungroup()
```
```{r}
# library() fails loudly if patchwork is not installed.
library(patchwork)

# Number of flights per month, drawn as filled points joined by a line.
p1 <- flights |>
  group_by(year, month) |>
  summarize(n = n(), .groups = "drop") |>
  ggplot(aes(month, n)) +
  geom_point(shape = 21, size = 6, color = "black", fill = "red") +
  geom_line()

# Name the plot explicitly so the saved file does not depend on whichever
# plot happened to be created or displayed last.
ggsave("./a.pdf", plot = p1)
ggsave("./a.png", plot = p1)

# patchwork: `/` stacks the two plots vertically.
p1 / p1
```
+1
View File
@@ -0,0 +1 @@
../../_extensions
Binary file not shown.
Binary file not shown.

After

Width:  |  Height:  |  Size: 271 KiB

+309
View File
@@ -0,0 +1,309 @@
---
title: "Data Transform"
format:
dwsd-revealjs:
logo: _extensions/drwater/dwsd/inst/ucaslogo.png
---
```{r}
#| echo: false
source("../../coding/_common.R")
library(nycflights13)
library(tidyverse)
```
## `tidyverse`风格数据分析总体流程
![](../../image/data-science/transform.png)
## [dplyr cheatsheet](../../image/cheatsheet/data-transformation.pdf)
```{r}
#| echo: false
# NOTE(review): dwfun::ggsavep() is a project helper that is not visible here;
# with loadit = TRUE it presumably embeds the existing SVG - confirm.
dwfun::ggsavep("../../image/cheatsheet/data-transformation.svg", loadit = TRUE)
```
## 查看数据
```{r}
# Print the flights data set (from nycflights13) to inspect it.
flights
```
## 选择列
```{r}
#| results: false
# Keep only the year, month and day columns, chosen by name.
flights |>
select(year, month, day)
```
## 选择列
```{r}
#| results: false
# Keep the contiguous run of columns from year through day.
flights |>
select(year:day)
```
## 选择列
```{r}
# Select columns by position: the 3rd through the 5th.
flights |>
select(3:5)
```
## 选择列
```{r}
# `!` negates the whole range: keep every column except year through day.
flights |>
select(!year:day)
```
## 选择列
```{r}
# Drop the columns year through day with `-`; equivalent to select(!year:day).
flights |>
select(-(year:day))
```
## 选择列
```{r}
# Keep only the columns for which is.character() returns TRUE.
flights |>
select(where(is.character))
```
## 选择列
```{r}
# First drop the character columns, then keep the remaining columns whose
# names contain an underscore.
flights |>
select(!where(is.character)) |>
select(contains("_"))
```
## 选择列
```{r}
# select() can rename while selecting: only tailnum is kept, as tail_num.
flights |>
select(tail_num = tailnum)
```
## 选择列
```{r}
# Move air_time to the front; everything() then adds the remaining columns.
flights |>
select(air_time, everything())
```
## 重命名
```{r}
# rename() keeps every column and only changes the name: tailnum -> tail_num.
flights |>
rename(tail_num = tailnum)
```
## 重命名
```{r}
# rename() also accepts positions: the 1st column becomes 年份 (year) and the
# 2nd becomes 月份 (month).
flights |>
rename(年份 = 1) |>
rename(月份 = 2)
```
## 重命名
```{r}
# Preview before renaming: first four columns, first three rows.
flights |> select(1:4) |> head(n = 3)
# Rename every column at once. rename_with() replaces the superseded
# rename_all(); the function must return one new name per column.
flights |>
  select(1:4) |>
  head(n = 3) |>
  rename_with(\(nm) c("c1", "c2", "c3", "c4"))
```
## 重命名
```{r}
# Preview before renaming: first four columns, first three rows.
flights |> select(1:4) |> head(n = 3)
# Upper-case every column name. rename_with() replaces the superseded
# rename_all() and applies the function to all columns by default.
flights |>
  select(1:4) |>
  head(n = 3) |>
  rename_with(toupper)
```
## 重命名
```{r}
# Preview before renaming: first four columns, first three rows.
flights |> select(1:4) |> head(n = 3)
# Upper-case every column name and append "_NEW". rename_with() replaces the
# superseded rename_all(); the lambda receives the vector of current names.
flights |>
  select(1:4) |>
  head(n = 3) |>
  rename_with(\(nm) paste0(toupper(nm), "_NEW"))
```
## 练习
将含有下划线的列名中的下划线去掉。
```{r}
# Starting point for the exercise: first four columns, first three rows.
flights |> select(1:4) |> head(n = 3)
```
## 练习
将`airqualitydf`中列名的单位信息去除(前5列)。
```{r}
# Read the second sheet of the air-quality workbook, then show its first five
# columns (the ones whose names are to be cleaned in this exercise).
airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2)
airqualitydf |> select(1:5)
```
## `filter`
```{r}
# Keep only the rows whose dep_delay is greater than 120.
flights |>
filter(dep_delay > 120)
```
## filter 练习
Flights that departed on January 1.
```{r}
#| echo: false
# Solution: both conditions must hold, so only rows with month 1 and day 1 remain.
flights |>
filter(month == 1 & day == 1)
```
## filter 练习
Select flights that departed in January or February.
```{r}
#| echo: false
# Solution: %in% tests membership in a set, here month 1 or month 2.
flights |>
filter(month %in% c(1, 2))
```
## filter 练习
```{r}
# Store the filtered rows in jan1; an assignment prints nothing, so this
# chunk produces no visible output.
jan1 <- flights |>
filter(month == 1 & day == 1)
```
## filter
```{r}
#| error: true
#| eval: false
# Common mistake: `=` instead of `==`. filter() rejects the named argument
# with an error, which is why this chunk is shown but not evaluated.
flights |>
filter(month = 1)
```
## filter
```{r}
# NOTE(review): this looks like an intentional pitfall demo - confirm.
# `month == 1 | 2` is evaluated as `(month == 1) | 2`; the non-zero number 2
# counts as TRUE, so the condition holds for every row and nothing is
# filtered out. The intended test is `month %in% c(1, 2)`.
flights |>
filter(month == 1 | 2)
```
## 排序
```{r}
# Sort ascending by year; later columns (month, day, dep_time) break ties.
flights |>
arrange(year, month, day, dep_time)
```
## 排序
```{r}
# desc() reverses the order: rows with the largest dep_delay come first.
flights |>
arrange(desc(dep_delay))
```
## slice
```{r}
# Two equivalent ways to take the first five rows.
flights |> head(n = 5)
flights |> slice(1:5)
```
## slice
```{r}
# The five rows with the largest dep_delay (ties are kept by default, so more
# than five rows can be returned).
flights |>
slice_max(dep_delay, n = 5)
```
## slice
```{r}
# The rows with the smallest dep_delay, sized as a share of the data:
# prop = 0.005 requests 0.5% of the rows.
flights |>
slice_min(dep_delay, prop = 0.005)
```
## 排序练习
根据`origin`、`dest`、`air_time`倒序排序。
```{r}
#| echo: false
# Solution: origin and dest ascending, air_time descending within each pair;
# the sort keys are then moved to the front so the ordering is easy to see.
flights |>
arrange(origin, dest, desc(air_time)) |>
select(origin, dest, air_time, everything())
```
## 去重
```{r}
# Remove duplicate rows, if any: with no columns named, distinct() compares
# entire rows.
flights |>
distinct()
```
## 去重
```{r}
# Find all unique origin and destination pairs; only these two columns are
# returned.
flights |>
distinct(origin, dest)
```
## 去重
```{r}
# .keep_all = TRUE keeps all columns, using the first row found for each
# origin/dest pair.
flights |>
distinct(origin, dest, .keep_all = TRUE)
```
## 欢迎讨论!{.center}
```{r}
#| results: 'asis'
# NOTE(review): rmdify::slideend() is a project helper that is not visible
# here; it presumably emits the closing slide as raw markup (the chunk uses
# results: 'asis') - confirm the meaning of each argument.
rmdify::slideend(
wechat = FALSE,
type = "public",
tel = FALSE,
thislink = "../"
)
```