diff --git a/SD/3.1_datatransform/_extensions b/SD/3.1_datatransform/_extensions new file mode 120000 index 0000000..74119e3 --- /dev/null +++ b/SD/3.1_datatransform/_extensions @@ -0,0 +1 @@ +../../_extensions \ No newline at end of file diff --git a/SD/3.1_datatransform/index.qmd b/SD/3.1_datatransform/index.qmd new file mode 100644 index 0000000..e970e8c --- /dev/null +++ b/SD/3.1_datatransform/index.qmd @@ -0,0 +1,949 @@ +--- +title: "Data Transform" +subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution +author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 +date: today +lang: zh +format: + revealjs: + theme: dark + slide-number: true + chalkboard: + buttons: true + preview-links: auto + lang: zh + toc: true + toc-depth: 1 + toc-title: 大纲 + logo: ./_extensions/inst/img/ucaslogo.png + css: ./_extensions/inst/css/revealjs.css + pointer: + key: "p" + color: "#32cd32" + pointerSize: 18 +revealjs-plugins: + - pointer +filters: + - d2 +--- + +```{r} +#| echo: false +knitr::opts_chunk$set(echo = TRUE) +source("../../coding/_common.R") +library(nycflights13) +library(tidyverse) +``` + + + +## 计数 + +```{r} +flights |> + count(origin, dest, sort = TRUE) +``` + + +## 计数-练习 + +统计每月的航班数量。 + +```{r} +#| echo: false +flights |> + count(year, month, sort = TRUE) +``` + + +## 计算新变量 + +```{r} +flights |> + mutate( + gain = dep_delay - arr_delay, + speed = distance / air_time * 60 + ) +``` + +## 计算新变量 + +```{r} +flights |> + mutate( + gain = dep_delay - arr_delay, + speed = distance / air_time * 60, + .before = 1 + ) +``` + +## 计算新变量 + +```{r} +flights |> + mutate( + gain = dep_delay - arr_delay, + speed = distance / air_time * 60, + .after = day + ) +``` + +## 计算新变量 + +```{r} +flights |> + mutate( + gain = dep_delay - arr_delay, + hours = air_time / 60, + gain_per_hour = gain / hours, + .keep = "used" + ) +``` + + + + +## 列排序 + +```{r} +flights |> + relocate(time_hour, air_time) +``` + +## 列排序 + +```{r} +#| results: false + +flights |> + relocate(year:dep_time, .after = time_hour) +flights |> + relocate(starts_with("arr"), .before = dep_time) +flights |> + select(starts_with("arr"), everything()) +``` + + +## 练习 + +计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。 + +```{r} +flights |> + filter(dest == "IAH") |> + mutate(speed = distance / air_time * 60) |> + select(year:day, dep_time, carrier, flight, speed) |> + arrange(desc(speed)) +``` + +## 练习 + +计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。 + +```{r} +#| results: false + +flights1 <- filter(flights, dest == "IAH") +flights2 <- mutate(flights1, speed = distance / air_time * 60) +flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed) +arrange(flights3, desc(speed)) +``` + +## 练习 + +计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。 + +```{r} +flights |> + filter(dest == "IAH") |> + mutate(speed = distance / air_time * 60) |> + select(year:day, dep_time, carrier, flight, speed) |> + arrange(desc(speed)) +``` + + +## 分组统计 + +```{r} + +library(tidyverse) + +mtcars %>% + group_by(cyl) %>% + summarize(n = n()) +``` + +## 分组统计 + +```{r} +flights |> + group_by(month) +``` + +## 分组统计 + +```{r} +flights |> + group_by(month) |> + summarize( + avg_delay = mean(dep_delay) + ) +``` + +## 分组统计 + +```{r} +flights |> + group_by(month) |> + summarize( + avg_delay = mean(dep_delay, na.rm = TRUE) + ) +``` + +## 分组统计 + +```{r} +flights |> + group_by(month) |> + summarize( + avg_delay = mean(dep_delay, na.rm = TRUE), + n = n() + ) +``` + +## 分组统计 + +```{r} +flights |> + group_by(dest) |> + slice_max(arr_delay, n = 1) |> + relocate(dest) +``` + +## 分组统计 + +```{r} + +flights |> + filter(dest == "IAH") |> + group_by(year, month, day) |> + summarize( + arr_delay = mean(arr_delay, na.rm = TRUE) + ) +``` + + +## 分组 + +```{r} +daily <- flights |> + group_by(year, month, day) +daily +``` + +## 分组统计 + +```{r} +daily_flights <- daily |> + summarize(n = n()) +``` + +## 分组统计 + +```{r} +#| results: false + +daily_flights <- daily |> + summarize( + n = n(), + .groups = "drop_last" + ) +``` + + + +## 删除分组 + +```{r} +daily |> ungroup() +``` + +## 删除分组 + +```{r} +daily |> + ungroup() |> + summarize( + avg_delay = mean(dep_delay, na.rm = TRUE), + flights = n() + ) +``` + +## 分组统计 + +```{r} +flights |> + summarize( + delay = mean(dep_delay, na.rm = TRUE), + n = n(), + .by = month + ) +``` + +## 分组统计 + +```{r} +flights |> + summarize( + delay = mean(dep_delay, na.rm = TRUE), + n = n(), + .by = c(origin, dest) + ) +``` + + + +## 练习 + +```{r} +df <- tibble( + x = 1:5, + y = c("a", "b", "a", "a", "b"), + z = c("K", "K", "L", "L", "K") +) +df +``` + +```{r} +#| eval: false + +df |> arrange(y) +``` + +## 练习 + +```{r} +#| echo: false +df +``` + +```{r} +#| eval: false +df |> + group_by(y) |> + summarize(mean_x = mean(x)) +``` + +## 练习 + +```{r} +#| echo: false +df +``` +```{r} +#| eval: false + +df |> + group_by(y, z) |> + summarize(mean_x = mean(x)) +``` + +## 练习 + +```{r} +#| echo: false +df +``` +```{r} +#| eval: false + +df |> + group_by(y, z) |> + summarize(mean_x = mean(x), .groups = "drop") +``` + +## 练习 + +```{r} +#| echo: false +df +``` + +```{r} +#| eval: false +df |> + group_by(y, z) |> + summarize(mean_x = mean(x)) + +df |> + group_by(y, z) |> + mutate(mean_x = mean(x)) +``` + + + + +## 练习 + +- 计算不同采样点的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。 +- 计算各小时全国的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。 +- 计算不同采样点各小时的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。 +- 计算各采样点中CO浓度小于全国平均CO浓度的占比。 +- 找出全国各采样点中CO浓度小于全国平均CO浓度的占比最高的10个采样点。 + +```{r} +airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", + sheet = 2) +``` + + +## 练习 + +按月统计dep_delay最大的3个航班的航班号(flight),用逗号连接。 + +```{r} +#| echo: false +flights |> + group_by(year, month) |> + slice_max(dep_delay, n = 3) |> + summarize(flight = paste(paste0(carrier, flight), collapse = ", ")) |> +knitr::kable() +``` + + + + +## 数据变形示意图 + +```{r} +billboard +knitr::include_graphics("../../image/tidy-data/variables.png", dpi = 270) +``` + +## 数据变形 + +```{r} +billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) +``` + +## 数据变形 + +```{r} +billboard_longer <- billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) |> + mutate( + week = parse_number(week) + ) +billboard_longer +``` + + +## 练习 + + +```{r} +#| echo: false +df <- tribble( + ~id, ~bp1, ~bp2, + "A", 100, 120, + "B", 140, 115, + "C", 120, 125 +) +df +``` + +将以上数据(`df`)转换为如下形式。 + +```{r} +#| echo: false +df |> + pivot_longer( + cols = bp1:bp2, + names_to = "measurement", + values_to = "value" + ) +``` + +## 练习 + +请转换如下`iris`数据。 + +```{r} +#| echo: false +as_tibble(head(iris, n = 3)) +cat("转为如下形式:") +iris |> + pivot_longer(cols = c(Sepal.Length, + Sepal.Width, + Petal.Length, + Petal.Width), + names_to = "flower_attr", + values_to = "attr_value") |> + head() +``` + + + + +## 数据变形示意图2 + +```{r} +who2 +knitr::include_graphics("../../image/tidy-data/multiple-names.png", dpi = 270) +``` + + +## 数据变形 + +```{r} +who2 |> + pivot_longer( + cols = !(country:year), + names_to = c("diagnosis", "gender", "age"), + names_sep = "_", + values_to = "count" + ) +``` + + +## 数据变形示意图 + +```{r} +household +knitr::include_graphics("../../image/tidy-data/names-and-values.png", dpi = 270) +``` + +## 数据变形 + +```{r} +household |> + pivot_longer( + cols = !family, + names_to = c(".value", "child"), + names_sep = "_", + values_drop_na = TRUE + ) +``` + + +## 查看数据 + +```{r} +cms_patient_experience +``` + +## 查看数据 + +```{r} +cms_patient_experience |> + distinct(measure_cd, measure_title) +``` + +## 数据变形(变宽) + +```{r} +cms_patient_experience |> + pivot_wider( + names_from = measure_cd, + values_from = prf_rate + ) +``` + +## 数据变形(变宽) + +```{r} +cms_patient_experience |> + pivot_wider( + id_cols = starts_with("org"), + names_from = measure_cd, + values_from = prf_rate + ) +``` + +## 练习 + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "B", "bp1", 140, + "B", "bp2", 115, + "A", "bp2", 120, + "A", "bp3", 105 +) +``` + + +变形成如下形式: + + +```{r} +#| echo: false +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + + +## 练习:变宽 + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "A", "bp1", 102, + "A", "bp2", 120, + "B", "bp1", 140, + "B", "bp2", 115 +) +``` + +## 练习 + +```{r} +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + +## 练习 + +```{r} +df |> + group_by(id, measurement) |> + summarize(n = n(), .groups = "drop") |> + filter(n > 1) +``` + + +## nest,套嵌数据框 + +```{r} +#| echo: false +df <- tibble(x = c(1, 1, 1, 2, 2, 3), y = 1:6, z = 6:1) +df +``` + +```{r} +df %>% nest(data = c(y, z)) +``` + +## nest,套嵌数据框 + +Specify variables to nest by (rather than variables to nest) using `.by` + +```{r} +df %>% nest(.by = x) +``` + + +## nest,套嵌数据框 + +In this case, since `...` isn't used you can specify the resulting column name with `.key` + +```{r} +df %>% nest(.by = x, .key = "cols") +``` + + +## nest,套嵌数据框 + +Use tidyselect syntax and helpers, just like in `dplyr::select()` + +```{r} +df %>% nest(data = any_of(c("y", "z"))) +``` + +## nest,套嵌数据框 + +`...` and `.by` can be used together to drop columns you no longer need, +or to include the columns you are nesting by in the inner data frame too. +This drops `z`: + +```{r} +df %>% nest(data = y, .by = x) +``` + +## nest,套嵌数据框 + +This includes `x` in the inner data frame: + +```{r} +df %>% nest(data = everything(), .by = x) +``` + +## nest,套嵌数据框 + +Multiple nesting structures can be specified at once + +```{r} +iris %>% + nest(petal = starts_with("Petal"), sepal = starts_with("Sepal")) +``` + + +## nest,套嵌数据框 + +```{r} +iris %>% + nest(width = contains("Width"), length = contains("Length")) +``` + + +## nest,套嵌数据框 + +Nesting a grouped data frame nests all variables apart from the group vars + +```{r} +fish_encounters +fish_encounters %>% + dplyr::group_by(fish) %>% + nest() +``` + +## nest,套嵌数据框 + +That is similar to `nest(.by = )`, except here the result isn't grouped + +```{r} +fish_encounters %>% + nest(.by = fish) +``` + +## nest,套嵌数据框 + +Nesting is often useful for creating per group models + +```{r} +mtcars %>% + nest(.by = cyl) %>% + dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df))) +``` + + +## 练习 + +```{r} +#| echo: false +(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", + sheet = 2)) +``` + +```{r} +airqualitydf +airqualitynestdf <- airqualitydf |> + nest(sitedf = -site) +``` + +## `nest`与`group_by`联用 + +```{r} +#| echo: false +iris %>% + group_by(Species) %>% + nest(.key = "spdf") +``` + +## unnest + +```{r} +airqualitynestdf |> unnest(sitedf) +``` + + + +## `purrr`包 + +- map():依次应用一元函数到一个序列的每个元素上,基本等同 lapply() +- map2():依次应用二元函数到两个序列的每对元素上 +- pmap():应用多元函数到多个序列的每组元素上,可以实现对数据框逐行迭代 +- map 系列默认返回列表型,可根据想要的返回类型添加后缀:_int, _dbl, _lgl, _chr, _df, 甚至可以接着对返回的数据框df做行/列合并:_dfr, _dfc +- 如果只想要函数依次作用的过程,而不需要返回结果,改用 walk 系列即可 +- 所应用的函数,有 purrr公式风格简写(匿名函数),支持一元,二元,多元函数 +- purrr 包中的其它有用函数 + +## `purrr`包 + +- `map_chr(.x, .f)`: 返回字符型向量 +- `map_lgl(.x, .f)`: 返回逻辑型向量 +- `map_dbl(.x, .f)`: 返回实数型向量 +- `map_int(.x, .f)`: 返回整数型向量 +- `map_dfr(.x, .f)`: 返回数据框列表,再 bind_rows 按行合并为一个数据框 +- `map_dfc(.x, .f)`: 返回数据框列表,再 bind_cols 按列合并为一个数据框 + + +## `purrr`包-cheatsheet + +```{r} +dwfun::ggsavep("../../image/cheatsheet/purrr.svg", loadit = TRUE) +``` + +[purrr](../../image/cheatsheet/purrr.pdf) + + +## `purrr`包 + +生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。 + +```{r} +res <- list() +for (i in 1:10) { + res[[i]] <- tibble(随机数 = rnorm(n = 100, mean = i, sd = 1)) +} +res + +``` + + + + +## `purrr`包 + +生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。 + +```{r} +1:10 |> +purrr::map(~tibble(随机数 = rnorm(n = 100, mean = .x, sd = 1))) +``` + +## purrr + +```{r} +library(purrr) +mtcars |> + split(mtcars$cyl) |> # from base R + map(\(df) lm(mpg ~ wt, data = df)) |> + map(summary) %>% + map_dbl("r.squared") +``` + + +## 练习: + +计算每月最后一个周六的航班数: + +```{r} +flights +``` + +## `tidyr` + `purrr`包 + +任务:展示不同城市间的大气指标散点图 + +```{r} +(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", + sheet = 2)) +``` + +## join + +Perform left join + +```{r} +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) +left_join(df1, df2, by = "id") +``` + +## left join + +Create sample data frames with non-matching rows + +```{r} +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) +left_join(df2, df1, by = "id") +``` + +## left join + + +Create sample data frames with multiple columns + +```{r} +df1 <- data.frame(id1 = c(1, 2, 3), id2 = c("A", "B", "C"), value1 = letters[1:3]) +df2 <- data.frame(id1 = c(2, 3, 4), id2 = c("B", "C", "D"), value2 = LETTERS[1:3]) +# Perform left join +left_join(df1, df2, by = c("id1", "id2")) +``` + +## right join + + +```{r} +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) + +# Perform right join +right_join(df1, df2, by = "id") +``` + + +## inner join + +```{r} +# Create sample data frames +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) +# Perform inner join +inner_join(df1, df2, by = "id") +``` + + +## full join + +```{r} +# Create sample data frames +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) +# Perform inner join +full_join(df1, df2, by = "id") +``` + +## semi join + +Create sample data frames + +```{r} +(df1 <- data.frame(id = 1:5, value1 = letters[1:5])) +(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3])) +# Perform semi join +semi_join(df1, df2, by = "id") +``` + + +## 练习 + +合并`airquality.xlsx`中的数据。 + + + +## 练习 + +统计各城市白天与晚上的大气质量差异,计算不同指标差异最大的10个城市。 + + + + + + +## 欢迎讨论!{.center} + + +`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")` +