570 lines
7.5 KiB
Plaintext
570 lines
7.5 KiB
Plaintext
---
|
|
title: "Data Transform"
|
|
subtitle: 《区域水环境污染数据分析实践》<br>Data analysis practice of regional water environment pollution
|
|
author: 苏命、王为东<br>中国科学院大学资源与环境学院<br>中国科学院生态环境研究中心
|
|
date: today
|
|
lang: zh
|
|
format:
|
|
revealjs:
|
|
theme: dark
|
|
slide-number: true
|
|
chalkboard:
|
|
buttons: true
|
|
preview-links: auto
|
|
lang: zh
|
|
toc: true
|
|
toc-depth: 1
|
|
toc-title: 大纲
|
|
logo: ./_extensions/inst/img/ucaslogo.png
|
|
css: ./_extensions/inst/css/revealjs.css
|
|
pointer:
|
|
key: "p"
|
|
color: "#32cd32"
|
|
pointerSize: 18
|
|
revealjs-plugins:
|
|
- pointer
|
|
filters:
|
|
- d2
|
|
---
|
|
|
|
```{r}
|
|
#| echo: false
|
|
knitr::opts_chunk$set(echo = TRUE)
|
|
source("../../coding/_common.R")
|
|
library(nycflights13)
|
|
library(tidyverse)
|
|
|
|
```
|
|
|
|
|
|
## 查看数据
|
|
|
|
```{r}
|
|
flights
|
|
```
|
|
|
|
## 查看数据
|
|
|
|
```{r}
|
|
glimpse(flights)
|
|
```
|
|
|
|
## 查看数据
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
flights |>
|
|
filter(dest == "IAH") |>
|
|
group_by(year, month, day) |>
|
|
summarize(
|
|
arr_delay = mean(arr_delay, na.rm = TRUE)
|
|
)
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
flights |>
|
|
filter(dep_delay > 120)
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
# Flights that departed on January 1
|
|
flights |>
|
|
filter(month == 1 & day == 1)
|
|
|
|
# Flights that departed in January or February
|
|
flights |>
|
|
filter(month == 1 | month == 2)
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
# A shorter way to select flights that departed in January or February
|
|
flights |>
|
|
filter(month %in% c(1, 2))
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
jan1 <- flights |>
|
|
filter(month == 1 & day == 1)
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
#| error: true
|
|
|
|
flights |>
|
|
filter(month = 1)
|
|
```
|
|
|
|
## filter
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
flights |>
|
|
filter(month == 1 | 2)
|
|
```
|
|
|
|
## 排序
|
|
|
|
```{r}
|
|
flights |>
|
|
arrange(year, month, day, dep_time)
|
|
```
|
|
|
|
## 排序
|
|
|
|
```{r}
|
|
flights |>
|
|
arrange(desc(dep_delay))
|
|
```
|
|
|
|
## 去重
|
|
|
|
```{r}
|
|
# Remove duplicate rows, if any
|
|
flights |>
|
|
distinct()
|
|
|
|
# Find all unique origin and destination pairs
|
|
flights |>
|
|
distinct(origin, dest)
|
|
```
|
|
|
|
## 去重
|
|
|
|
```{r}
|
|
flights |>
|
|
distinct(origin, dest, .keep_all = TRUE)
|
|
```
|
|
|
|
## 计数
|
|
|
|
```{r}
|
|
flights |>
|
|
count(origin, dest, sort = TRUE)
|
|
```
|
|
|
|
## 计算新变量
|
|
|
|
```{r}
|
|
flights |>
|
|
mutate(
|
|
gain = dep_delay - arr_delay,
|
|
speed = distance / air_time * 60
|
|
)
|
|
```
|
|
|
|
## 计算新变量
|
|
|
|
```{r}
|
|
flights |>
|
|
mutate(
|
|
gain = dep_delay - arr_delay,
|
|
speed = distance / air_time * 60,
|
|
.before = 1
|
|
)
|
|
```
|
|
|
|
## 计算新变量
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
mutate(
|
|
gain = dep_delay - arr_delay,
|
|
speed = distance / air_time * 60,
|
|
.after = day
|
|
)
|
|
```
|
|
|
|
## 计算新变量
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
mutate(
|
|
gain = dep_delay - arr_delay,
|
|
hours = air_time / 60,
|
|
gain_per_hour = gain / hours,
|
|
.keep = "used"
|
|
)
|
|
```
|
|
|
|
## 选择列
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
select(year, month, day)
|
|
```
|
|
|
|
## 选择列
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
select(year:day)
|
|
```
|
|
|
|
## 选择列
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
select(!year:day)
|
|
```
|
|
|
|
## 选择列
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
select(where(is.character))
|
|
```
|
|
|
|
## 选择列
|
|
|
|
```{r}
|
|
flights |>
|
|
select(tail_num = tailnum)
|
|
```
|
|
|
|
## 重命名
|
|
|
|
```{r}
|
|
flights |>
|
|
rename(tail_num = tailnum)
|
|
```
|
|
|
|
## 列排序
|
|
|
|
```{r}
|
|
flights |>
|
|
relocate(time_hour, air_time)
|
|
```
|
|
|
|
## 列排序
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights |>
|
|
relocate(year:dep_time, .after = time_hour)
|
|
flights |>
|
|
relocate(starts_with("arr"), .before = dep_time)
|
|
```
|
|
|
|
## 列排序
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
# For data checking, not used in results shown in book
|
|
flights <- flights |> mutate(
|
|
dep_time = hour * 60 + minute,
|
|
arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100),
|
|
airtime2 = arr_time - dep_time,
|
|
dep_sched = dep_time + dep_delay
|
|
)
|
|
|
|
ggplot(flights, aes(x = dep_sched)) + geom_histogram(binwidth = 60)
|
|
ggplot(flights, aes(x = dep_sched %% 60)) + geom_histogram(binwidth = 1)
|
|
ggplot(flights, aes(x = air_time - airtime2)) + geom_histogram()
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
flights |>
|
|
filter(dest == "IAH") |>
|
|
mutate(speed = distance / air_time * 60) |>
|
|
select(year:day, dep_time, carrier, flight, speed) |>
|
|
arrange(desc(speed))
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
arrange(
|
|
select(
|
|
mutate(
|
|
filter(
|
|
flights,
|
|
dest == "IAH"
|
|
),
|
|
speed = distance / air_time * 60
|
|
),
|
|
year:day, dep_time, carrier, flight, speed
|
|
),
|
|
desc(speed)
|
|
)
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
flights1 <- filter(flights, dest == "IAH")
|
|
flights2 <- mutate(flights1, speed = distance / air_time * 60)
|
|
flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
|
|
arrange(flights3, desc(speed))
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
|
|
```{r}
|
|
|
|
library(tidyverse)
|
|
|
|
mtcars %>%
|
|
group_by(cyl) %>%
|
|
summarize(n = n())
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
flights |>
|
|
group_by(month)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
flights |>
|
|
group_by(month) |>
|
|
summarize(
|
|
avg_delay = mean(dep_delay)
|
|
)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
flights |>
|
|
group_by(month) |>
|
|
summarize(
|
|
avg_delay = mean(dep_delay, na.rm = TRUE)
|
|
)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
flights |>
|
|
group_by(month) |>
|
|
summarize(
|
|
avg_delay = mean(dep_delay, na.rm = TRUE),
|
|
n = n()
|
|
)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
flights |>
|
|
group_by(dest) |>
|
|
slice_max(arr_delay, n = 1) |>
|
|
relocate(dest)
|
|
```
|
|
|
|
## 分组
|
|
|
|
```{r}
|
|
daily <- flights |>
|
|
group_by(year, month, day)
|
|
daily
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
daily_flights <- daily |>
|
|
summarize(n = n())
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
#| results: false
|
|
|
|
daily_flights <- daily |>
|
|
summarize(
|
|
n = n(),
|
|
.groups = "drop_last"
|
|
)
|
|
```
|
|
|
|
## 删除分组
|
|
|
|
```{r}
|
|
daily |> ungroup()
|
|
```
|
|
|
|
## 删除分组
|
|
|
|
```{r}
|
|
daily |>
|
|
ungroup() |>
|
|
summarize(
|
|
avg_delay = mean(dep_delay, na.rm = TRUE),
|
|
flights = n()
|
|
)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
#| results: false
|
|
flights |>
|
|
summarize(
|
|
delay = mean(dep_delay, na.rm = TRUE),
|
|
n = n(),
|
|
.by = month
|
|
)
|
|
```
|
|
|
|
## 分组统计
|
|
|
|
```{r}
|
|
#| results: false
|
|
flights |>
|
|
summarize(
|
|
delay = mean(dep_delay, na.rm = TRUE),
|
|
n = n(),
|
|
.by = c(origin, dest)
|
|
)
|
|
```
|
|
|
|
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df <- tibble(
|
|
x = 1:5,
|
|
y = c("a", "b", "a", "a", "b"),
|
|
z = c("K", "K", "L", "L", "K")
|
|
)
|
|
df |>
|
|
group_by(y)
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df |>
|
|
arrange(y)
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df |>
|
|
group_by(y) |>
|
|
summarize(mean_x = mean(x))
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df |>
|
|
group_by(y, z) |>
|
|
summarize(mean_x = mean(x))
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df |>
|
|
group_by(y, z) |>
|
|
summarize(mean_x = mean(x), .groups = "drop")
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
df |>
|
|
group_by(y, z) |>
|
|
summarize(mean_x = mean(x))
|
|
|
|
df |>
|
|
group_by(y, z) |>
|
|
mutate(mean_x = mean(x))
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
batters <- Lahman::Batting |>
|
|
group_by(playerID) |>
|
|
summarize(
|
|
performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
|
|
n = sum(AB, na.rm = TRUE)
|
|
)
|
|
batters
|
|
```
|
|
|
|
## 练习
|
|
|
|
```{r}
|
|
#| warning: false
|
|
#| fig-alt: |
|
|
#| A scatterplot of number of batting performance vs. batting opportunites
|
|
#| overlaid with a smoothed line. Average performance increases sharply
|
|
#| from 0.2 at when n is 1 to 0.25 when n is ~1000. Average performance
|
|
#| continues to increase linearly at a much shallower slope reaching
|
|
#| ~0.3 when n is ~15,000.
|
|
|
|
batters |>
|
|
filter(n > 100) |>
|
|
ggplot(aes(x = n, y = performance)) +
|
|
geom_point(alpha = 1 / 10) +
|
|
geom_smooth(se = FALSE)
|
|
```
|
|
|
|
## 排序
|
|
|
|
```{r}
|
|
batters |>
|
|
arrange(desc(performance))
|
|
```
|
|
|
|
## 欢迎讨论!{.center}
|
|
|
|
|
|
`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/SD/")`
|
|
|