第8次课
This commit is contained in:
		
							
								
								
									
										1
									
								
								SD/20240328_1_datatransform/_extensions
									
									
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								SD/20240328_1_datatransform/_extensions
									
									
									
									
									
										Symbolic link
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
../../_extensions
 | 
			
		||||
							
								
								
									
										949
									
								
								SD/20240328_1_datatransform/index.qmd
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										949
									
								
								SD/20240328_1_datatransform/index.qmd
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,949 @@
 | 
			
		||||
---
 | 
			
		||||
title: "Data Transform"
 | 
			
		||||
subtitle: 《区域水环境污染数据分析实践》<br>Data analysis practice of regional water environment pollution
 | 
			
		||||
author: 苏命、王为东<br>中国科学院大学资源与环境学院<br>中国科学院生态环境研究中心
 | 
			
		||||
date: today
 | 
			
		||||
lang: zh
 | 
			
		||||
format:
 | 
			
		||||
  revealjs:
 | 
			
		||||
    theme: dark
 | 
			
		||||
    slide-number: true
 | 
			
		||||
    chalkboard:
 | 
			
		||||
      buttons: true
 | 
			
		||||
    preview-links: auto
 | 
			
		||||
    lang: zh
 | 
			
		||||
    toc: true
 | 
			
		||||
    toc-depth: 1
 | 
			
		||||
    toc-title: 大纲
 | 
			
		||||
    logo: ./_extensions/inst/img/ucaslogo.png
 | 
			
		||||
    css: ./_extensions/inst/css/revealjs.css
 | 
			
		||||
    pointer:
 | 
			
		||||
      key: "p"
 | 
			
		||||
      color: "#32cd32"
 | 
			
		||||
      pointerSize: 18
 | 
			
		||||
revealjs-plugins:
 | 
			
		||||
  - pointer
 | 
			
		||||
filters:
 | 
			
		||||
  - d2
 | 
			
		||||
---
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
knitr::opts_chunk$set(echo = TRUE)
 | 
			
		||||
source("../../coding/_common.R")
 | 
			
		||||
library(nycflights13)
 | 
			
		||||
library(tidyverse)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 计数
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |>
 | 
			
		||||
  count(origin, dest, sort = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 计数-练习
 | 
			
		||||
 | 
			
		||||
统计每月的航班数量。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
flights |>
 | 
			
		||||
  count(year, month, sort = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 计算新变量
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    gain = dep_delay - arr_delay,
 | 
			
		||||
    speed = distance / air_time * 60
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 计算新变量
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    gain = dep_delay - arr_delay,
 | 
			
		||||
    speed = distance / air_time * 60,
 | 
			
		||||
    .before = 1
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 计算新变量
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    gain = dep_delay - arr_delay,
 | 
			
		||||
    speed = distance / air_time * 60,
 | 
			
		||||
    .after = day
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 计算新变量
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    gain = dep_delay - arr_delay,
 | 
			
		||||
    hours = air_time / 60,
 | 
			
		||||
    gain_per_hour = gain / hours,
 | 
			
		||||
    .keep = "used"
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 列排序
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  relocate(time_hour, air_time)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 列排序
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| results: false
 | 
			
		||||
 | 
			
		||||
flights |> 
 | 
			
		||||
  relocate(year:dep_time, .after = time_hour)
 | 
			
		||||
flights |> 
 | 
			
		||||
  relocate(starts_with("arr"), .before = dep_time)
 | 
			
		||||
flights |> 
 | 
			
		||||
  select(starts_with("arr"), everything())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  filter(dest == "IAH") |> 
 | 
			
		||||
  mutate(speed = distance / air_time * 60) |> 
 | 
			
		||||
  select(year:day, dep_time, carrier, flight, speed) |> 
 | 
			
		||||
  arrange(desc(speed))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| results: false
 | 
			
		||||
 | 
			
		||||
flights1 <- filter(flights, dest == "IAH")
 | 
			
		||||
flights2 <- mutate(flights1, speed = distance / air_time * 60)
 | 
			
		||||
flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
 | 
			
		||||
arrange(flights3, desc(speed))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  filter(dest == "IAH") |> 
 | 
			
		||||
  mutate(speed = distance / air_time * 60) |> 
 | 
			
		||||
  select(year:day, dep_time, carrier, flight, speed) |> 
 | 
			
		||||
  arrange(desc(speed))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
 | 
			
		||||
library(tidyverse)
 | 
			
		||||
 | 
			
		||||
mtcars %>% 
 | 
			
		||||
  group_by(cyl) %>%
 | 
			
		||||
  summarize(n = n())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  group_by(month)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  group_by(month) |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    avg_delay = mean(dep_delay)
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  group_by(month) |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    avg_delay = mean(dep_delay, na.rm = TRUE)
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  group_by(month) |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    avg_delay = mean(dep_delay, na.rm = TRUE), 
 | 
			
		||||
    n = n()
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  group_by(dest) |> 
 | 
			
		||||
  slice_max(arr_delay, n = 1) |>
 | 
			
		||||
  relocate(dest)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
 | 
			
		||||
flights |>
 | 
			
		||||
  filter(dest == "IAH") |> 
 | 
			
		||||
  group_by(year, month, day) |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    arr_delay = mean(arr_delay, na.rm = TRUE)
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 分组
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily <- flights |>  
 | 
			
		||||
  group_by(year, month, day)
 | 
			
		||||
daily
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily_flights <- daily |> 
 | 
			
		||||
  summarize(n = n())
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| results: false
 | 
			
		||||
 | 
			
		||||
daily_flights <- daily |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    n = n(), 
 | 
			
		||||
    .groups = "drop_last"
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 删除分组
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily |> ungroup()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 删除分组
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
daily |> 
 | 
			
		||||
  ungroup() |>
 | 
			
		||||
  summarize(
 | 
			
		||||
    avg_delay = mean(dep_delay, na.rm = TRUE), 
 | 
			
		||||
    flights = n()
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    delay = mean(dep_delay, na.rm = TRUE), 
 | 
			
		||||
    n = n(),
 | 
			
		||||
    .by = month
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 分组统计
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights |> 
 | 
			
		||||
  summarize(
 | 
			
		||||
    delay = mean(dep_delay, na.rm = TRUE), 
 | 
			
		||||
    n = n(),
 | 
			
		||||
    .by = c(origin, dest)
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df <- tibble(
 | 
			
		||||
  x = 1:5,
 | 
			
		||||
  y = c("a", "b", "a", "a", "b"),
 | 
			
		||||
  z = c("K", "K", "L", "L", "K")
 | 
			
		||||
)
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| eval: false
 | 
			
		||||
    
 | 
			
		||||
df |> arrange(y)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| eval: false
 | 
			
		||||
df |>
 | 
			
		||||
  group_by(y) |>
 | 
			
		||||
  summarize(mean_x = mean(x))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
```{r}
 | 
			
		||||
#| eval: false
 | 
			
		||||
    
 | 
			
		||||
df |>
 | 
			
		||||
  group_by(y, z) |>
 | 
			
		||||
  summarize(mean_x = mean(x))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
```{r}
 | 
			
		||||
#| eval: false
 | 
			
		||||
    
 | 
			
		||||
df |>
 | 
			
		||||
  group_by(y, z) |>
 | 
			
		||||
  summarize(mean_x = mean(x), .groups = "drop")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| eval: false
 | 
			
		||||
df |>
 | 
			
		||||
  group_by(y, z) |>
 | 
			
		||||
  summarize(mean_x = mean(x))
 | 
			
		||||
 | 
			
		||||
df |>
 | 
			
		||||
  group_by(y, z) |>
 | 
			
		||||
  mutate(mean_x = mean(x))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
- 计算不同采样点的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
 | 
			
		||||
- 计算各小时全国的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
 | 
			
		||||
- 计算不同采样点各小时的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
 | 
			
		||||
- 计算各采样点中CO浓度小于全国平均CO浓度的占比。
 | 
			
		||||
- 找出全国各采样点中CO浓度小于全国平均CO浓度的占比最高的10个采样点。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
 | 
			
		||||
  sheet = 2)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
按月统计dep_delay最大的3个航班的航班号(flight),用逗号连接。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
flights |>
 | 
			
		||||
  group_by(year, month) |>
 | 
			
		||||
  slice_max(dep_delay, n = 3) |>
 | 
			
		||||
  summarize(flight = paste(paste0(carrier, flight), collapse = ", ")) |>
 | 
			
		||||
knitr::kable()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 数据变形示意图
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
billboard
 | 
			
		||||
knitr::include_graphics("../../image/tidy-data/variables.png", dpi = 270)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 数据变形
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
billboard |> 
 | 
			
		||||
  pivot_longer(
 | 
			
		||||
    cols = starts_with("wk"), 
 | 
			
		||||
    names_to = "week", 
 | 
			
		||||
    values_to = "rank",
 | 
			
		||||
    values_drop_na = TRUE
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 数据变形
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
billboard_longer <- billboard |> 
 | 
			
		||||
  pivot_longer(
 | 
			
		||||
    cols = starts_with("wk"), 
 | 
			
		||||
    names_to = "week", 
 | 
			
		||||
    values_to = "rank",
 | 
			
		||||
    values_drop_na = TRUE
 | 
			
		||||
  ) |> 
 | 
			
		||||
  mutate(
 | 
			
		||||
    week = parse_number(week)
 | 
			
		||||
  )
 | 
			
		||||
billboard_longer
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df <- tribble(
 | 
			
		||||
  ~id,  ~bp1, ~bp2,
 | 
			
		||||
   "A",  100,  120,
 | 
			
		||||
   "B",  140,  115,
 | 
			
		||||
   "C",  120,  125
 | 
			
		||||
)
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
将以上数据(`df`)转换为如下形式。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df |> 
 | 
			
		||||
  pivot_longer(
 | 
			
		||||
    cols = bp1:bp2,
 | 
			
		||||
    names_to = "measurement",
 | 
			
		||||
    values_to = "value"
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
请转换如下`iris`数据。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
as_tibble(head(iris, n = 3))
 | 
			
		||||
cat("转为如下形式:")
 | 
			
		||||
iris |>
 | 
			
		||||
  pivot_longer(cols = c(Sepal.Length,
 | 
			
		||||
    Sepal.Width,
 | 
			
		||||
    Petal.Length,
 | 
			
		||||
    Petal.Width),
 | 
			
		||||
    names_to = "flower_attr",
 | 
			
		||||
    values_to = "attr_value") |>
 | 
			
		||||
  head()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 数据变形示意图2
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
who2
 | 
			
		||||
knitr::include_graphics("../../image/tidy-data/multiple-names.png", dpi = 270)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 数据变形
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
who2 |> 
 | 
			
		||||
  pivot_longer(
 | 
			
		||||
    cols = !(country:year),
 | 
			
		||||
    names_to = c("diagnosis", "gender", "age"), 
 | 
			
		||||
    names_sep = "_",
 | 
			
		||||
    values_to = "count"
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 数据变形示意图
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
household
 | 
			
		||||
knitr::include_graphics("../../image/tidy-data/names-and-values.png", dpi = 270)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 数据变形
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
household |> 
 | 
			
		||||
  pivot_longer(
 | 
			
		||||
    cols = !family, 
 | 
			
		||||
    names_to = c(".value", "child"), 
 | 
			
		||||
    names_sep = "_", 
 | 
			
		||||
    values_drop_na = TRUE
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 查看数据
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
cms_patient_experience
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 查看数据
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
cms_patient_experience |> 
 | 
			
		||||
  distinct(measure_cd, measure_title)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 数据变形(变宽)
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
cms_patient_experience |> 
 | 
			
		||||
  pivot_wider(
 | 
			
		||||
    names_from = measure_cd,
 | 
			
		||||
    values_from = prf_rate
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 数据变形(变宽)
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
cms_patient_experience |> 
 | 
			
		||||
  pivot_wider(
 | 
			
		||||
    id_cols = starts_with("org"),
 | 
			
		||||
    names_from = measure_cd,
 | 
			
		||||
    values_from = prf_rate
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df <- tribble(
 | 
			
		||||
  ~id, ~measurement, ~value,
 | 
			
		||||
  "A",        "bp1",    100,
 | 
			
		||||
  "B",        "bp1",    140,
 | 
			
		||||
  "B",        "bp2",    115, 
 | 
			
		||||
  "A",        "bp2",    120,
 | 
			
		||||
  "A",        "bp3",    105
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
变形成如下形式:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df |> 
 | 
			
		||||
  pivot_wider(
 | 
			
		||||
    names_from = measurement,
 | 
			
		||||
    values_from = value
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习:变宽
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df <- tribble(
 | 
			
		||||
  ~id, ~measurement, ~value,
 | 
			
		||||
  "A",        "bp1",    100,
 | 
			
		||||
  "A",        "bp1",    102,
 | 
			
		||||
  "A",        "bp2",    120,
 | 
			
		||||
  "B",        "bp1",    140, 
 | 
			
		||||
  "B",        "bp2",    115
 | 
			
		||||
)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df |>
 | 
			
		||||
  pivot_wider(
 | 
			
		||||
    names_from = measurement,
 | 
			
		||||
    values_from = value
 | 
			
		||||
  )
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df |> 
 | 
			
		||||
  group_by(id, measurement) |> 
 | 
			
		||||
  summarize(n = n(), .groups = "drop") |> 
 | 
			
		||||
  filter(n > 1)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
df <- tibble(x = c(1, 1, 1, 2, 2, 3), y = 1:6, z = 6:1)
 | 
			
		||||
df
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(data = c(y, z))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
Specify variables to nest by (rather than variables to nest) using `.by`
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(.by = x)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
In this case, since `...` isn't used you can specify the resulting column name with `.key`
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(.by = x, .key = "cols")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
Use tidyselect syntax and helpers, just like in `dplyr::select()`
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(data = any_of(c("y", "z")))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
`...` and `.by` can be used together to drop columns you no longer need,
 | 
			
		||||
or to include the columns you are nesting by in the inner data frame too.
 | 
			
		||||
This drops `z`:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(data = y, .by = x)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
This includes `x` in the inner data frame:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df %>% nest(data = everything(), .by = x)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
Multiple nesting structures can be specified at once
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
iris %>%
 | 
			
		||||
  nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
iris %>%
 | 
			
		||||
  nest(width = contains("Width"), length = contains("Length"))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
Nesting a grouped data frame nests all variables apart from the group vars
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
fish_encounters
 | 
			
		||||
fish_encounters %>%
 | 
			
		||||
  dplyr::group_by(fish) %>%
 | 
			
		||||
  nest()
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
That is similar to `nest(.by = )`, except here the result isn't grouped
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
fish_encounters %>%
 | 
			
		||||
  nest(.by = fish)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## nest,套嵌数据框
 | 
			
		||||
 | 
			
		||||
Nesting is often useful for creating per group models
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
mtcars %>%
 | 
			
		||||
  nest(.by = cyl) %>%
 | 
			
		||||
  dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df)))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
 | 
			
		||||
  sheet = 2))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
airqualitydf
 | 
			
		||||
airqualitynestdf <- airqualitydf |>
 | 
			
		||||
  nest(sitedf = -site)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## `nest`与`group_by`联用
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
#| echo: false
 | 
			
		||||
iris %>%
 | 
			
		||||
  group_by(Species) %>%
 | 
			
		||||
  nest(.key = "spdf")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## unnest
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
airqualitynestdf |> unnest(sitedf)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## `purrr`包
 | 
			
		||||
 | 
			
		||||
- map():依次应用一元函数到一个序列的每个元素上,基本等同 lapply()
 | 
			
		||||
- map2():依次应用二元函数到两个序列的每对元素上
 | 
			
		||||
- pmap():应用多元函数到多个序列的每组元素上,可以实现对数据框逐行迭代
 | 
			
		||||
- map 系列默认返回列表型,可根据想要的返回类型添加后缀:_int, _dbl, _lgl, _chr, _df, 甚至可以接着对返回的数据框df做行/列合并:_dfr, _dfc
 | 
			
		||||
- 如果只想要函数依次作用的过程,而不需要返回结果,改用 walk 系列即可
 | 
			
		||||
- 所应用的函数,有 purrr公式风格简写(匿名函数),支持一元,二元,多元函数
 | 
			
		||||
- purrr 包中的其它有用函数
 | 
			
		||||
 | 
			
		||||
## `purrr`包
 | 
			
		||||
 | 
			
		||||
- `map_chr(.x, .f)`: 返回字符型向量
 | 
			
		||||
- `map_lgl(.x, .f)`: 返回逻辑型向量
 | 
			
		||||
- `map_dbl(.x, .f)`: 返回实数型向量
 | 
			
		||||
- `map_int(.x, .f)`: 返回整数型向量
 | 
			
		||||
- `map_dfr(.x, .f)`: 返回数据框列表,再 bind_rows 按行合并为一个数据框
 | 
			
		||||
- `map_dfc(.x, .f)`: 返回数据框列表,再 bind_cols 按列合并为一个数据框
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## `purrr`包-cheatsheet
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
dwfun::ggsavep("../../image/cheatsheet/purrr.svg", loadit = TRUE)
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
[purrr](../../image/cheatsheet/purrr.pdf) 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## `purrr`包
 | 
			
		||||
 | 
			
		||||
生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
res <- list()
 | 
			
		||||
for (i in 1:10) {
 | 
			
		||||
  res[[i]] <- tibble(随机数 = rnorm(n = 100, mean = i, sd = 1))
 | 
			
		||||
}
 | 
			
		||||
res
 | 
			
		||||
  
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## `purrr`包
 | 
			
		||||
 | 
			
		||||
生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
1:10 |>
 | 
			
		||||
purrr::map(~tibble(随机数 = rnorm(n = 100, mean = .x, sd = 1)))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## purrr
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
library(purrr)
 | 
			
		||||
mtcars |> 
 | 
			
		||||
  split(mtcars$cyl) |>  # from base R
 | 
			
		||||
  map(\(df) lm(mpg ~ wt, data = df)) |> 
 | 
			
		||||
  map(summary) %>%
 | 
			
		||||
  map_dbl("r.squared")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习:
 | 
			
		||||
 | 
			
		||||
计算每月最后一个周六的航班数:
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
flights
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## `tidyr` + `purrr`包
 | 
			
		||||
 | 
			
		||||
任务:展示不同城市间的大气指标散点图
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
 | 
			
		||||
  sheet = 2))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## join
 | 
			
		||||
 | 
			
		||||
Perform left join
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
left_join(df1, df2, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## left join
 | 
			
		||||
 | 
			
		||||
Create sample data frames with non-matching rows
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
left_join(df2, df1, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## left join
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Create sample data frames with multiple columns
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
df1 <- data.frame(id1 = c(1, 2, 3), id2 = c("A", "B", "C"), value1 = letters[1:3])
 | 
			
		||||
df2 <- data.frame(id1 = c(2, 3, 4), id2 = c("B", "C", "D"), value2 = LETTERS[1:3])
 | 
			
		||||
# Perform left join
 | 
			
		||||
left_join(df1, df2, by = c("id1", "id2"))
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## right join
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
 | 
			
		||||
# Perform right join
 | 
			
		||||
right_join(df1, df2, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## inner join
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
# Create sample data frames
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
# Perform inner join
 | 
			
		||||
inner_join(df1, df2, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## full join
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
# Create sample data frames
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
# Perform inner join
 | 
			
		||||
full_join(df1, df2, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## semi join
 | 
			
		||||
 | 
			
		||||
Create sample data frames
 | 
			
		||||
 | 
			
		||||
```{r}
 | 
			
		||||
(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
 | 
			
		||||
(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
 | 
			
		||||
# Perform semi join
 | 
			
		||||
semi_join(df1, df2, by = "id")
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
合并`airquality.xlsx`中的数据。
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 练习
 | 
			
		||||
 | 
			
		||||
统计各城市白天与晚上的大气质量差异,计算不同指标差异最大的10个城市。
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## 欢迎讨论!{.center}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/SD/")`
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user