--- title: "Data Transform" subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 date: today lang: zh format: revealjs: theme: dark slide-number: true chalkboard: buttons: true preview-links: auto lang: zh toc: true toc-depth: 1 toc-title: 大纲 logo: ./_extensions/inst/img/ucaslogo.png css: ./_extensions/inst/css/revealjs.css pointer: key: "p" color: "#32cd32" pointerSize: 18 revealjs-plugins: - pointer filters: - d2 --- ```{r} #| echo: false knitr::opts_chunk$set(echo = TRUE) source("../../coding/_common.R") library(nycflights13) library(tidyverse) ``` ## 查看数据 ```{r} flights ``` ## 查看数据 ```{r} glimpse(flights) ``` ## 查看数据 ```{r} #| eval: false flights |> filter(dest == "IAH") |> group_by(year, month, day) |> summarize( arr_delay = mean(arr_delay, na.rm = TRUE) ) ``` ## filter ```{r} flights |> filter(dep_delay > 120) ``` ## filter ```{r} # Flights that departed on January 1 flights |> filter(month == 1 & day == 1) # Flights that departed in January or February flights |> filter(month == 1 | month == 2) ``` ## filter ```{r} # A shorter way to select flights that departed in January or February flights |> filter(month %in% c(1, 2)) ``` ## filter ```{r} jan1 <- flights |> filter(month == 1 & day == 1) ``` ## filter ```{r} #| error: true flights |> filter(month = 1) ``` ## filter ```{r} #| eval: false flights |> filter(month == 1 | 2) ``` ## 排序 ```{r} flights |> arrange(year, month, day, dep_time) ``` ## 排序 ```{r} flights |> arrange(desc(dep_delay)) ``` ## 去重 ```{r} # Remove duplicate rows, if any flights |> distinct() # Find all unique origin and destination pairs flights |> distinct(origin, dest) ``` ## 去重 ```{r} flights |> distinct(origin, dest, .keep_all = TRUE) ``` ## 计数 ```{r} flights |> count(origin, dest, sort = TRUE) ``` ## 计算新变量 ```{r} flights |> mutate( gain = dep_delay - arr_delay, speed = distance / air_time * 60 ) ``` ## 计算新变量 ```{r} flights |> mutate( gain = dep_delay - arr_delay, speed = distance / air_time * 60, .before = 1 ) ``` ## 计算新变量 ```{r} #| results: false flights |> mutate( gain = dep_delay - arr_delay, speed = distance / air_time * 60, .after = day ) ``` ## 计算新变量 ```{r} #| results: false flights |> mutate( gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours, .keep = "used" ) ``` ## 选择列 ```{r} #| results: false flights |> select(year, month, day) ``` ## 选择列 ```{r} #| results: false flights |> select(year:day) ``` ## 选择列 ```{r} #| results: false flights |> select(!year:day) ``` ## 选择列 ```{r} #| results: false flights |> select(where(is.character)) ``` ## 选择列 ```{r} flights |> select(tail_num = tailnum) ``` ## 重命名 ```{r} flights |> rename(tail_num = tailnum) ``` ## 列排序 ```{r} flights |> relocate(time_hour, air_time) ``` ## 列排序 ```{r} #| results: false flights |> relocate(year:dep_time, .after = time_hour) flights |> relocate(starts_with("arr"), .before = dep_time) ``` ## 列排序 ```{r} #| eval: false # For data checking, not used in results shown in book flights <- flights |> mutate( dep_time = hour * 60 + minute, arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100), airtime2 = arr_time - dep_time, dep_sched = dep_time + dep_delay ) ggplot(flights, aes(x = dep_sched)) + geom_histogram(binwidth = 60) ggplot(flights, aes(x = dep_sched %% 60)) + geom_histogram(binwidth = 1) ggplot(flights, aes(x = air_time - airtime2)) + geom_histogram() ``` ## 练习 ```{r} flights |> filter(dest == "IAH") |> mutate(speed = distance / air_time * 60) |> select(year:day, dep_time, carrier, flight, speed) |> arrange(desc(speed)) ``` ## 练习 ```{r} #| results: false arrange( select( mutate( filter( flights, dest == "IAH" ), speed = distance / air_time * 60 ), year:day, dep_time, carrier, flight, speed ), desc(speed) ) ``` ## 练习 ```{r} #| results: false flights1 <- filter(flights, dest == "IAH") flights2 <- mutate(flights1, speed = distance / air_time * 60) flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed) arrange(flights3, desc(speed)) ``` ## 分组统计 ```{r} library(tidyverse) mtcars %>% group_by(cyl) %>% summarize(n = n()) ``` ## 分组统计 ```{r} flights |> group_by(month) ``` ## 分组统计 ```{r} flights |> group_by(month) |> summarize( avg_delay = mean(dep_delay) ) ``` ## 分组统计 ```{r} flights |> group_by(month) |> summarize( avg_delay = mean(dep_delay, na.rm = TRUE) ) ``` ## 分组统计 ```{r} flights |> group_by(month) |> summarize( avg_delay = mean(dep_delay, na.rm = TRUE), n = n() ) ``` ## 分组统计 ```{r} flights |> group_by(dest) |> slice_max(arr_delay, n = 1) |> relocate(dest) ``` ## 分组 ```{r} daily <- flights |> group_by(year, month, day) daily ``` ## 分组统计 ```{r} daily_flights <- daily |> summarize(n = n()) ``` ## 分组统计 ```{r} #| results: false daily_flights <- daily |> summarize( n = n(), .groups = "drop_last" ) ``` ## 删除分组 ```{r} daily |> ungroup() ``` ## 删除分组 ```{r} daily |> ungroup() |> summarize( avg_delay = mean(dep_delay, na.rm = TRUE), flights = n() ) ``` ## 分组统计 ```{r} #| results: false flights |> summarize( delay = mean(dep_delay, na.rm = TRUE), n = n(), .by = month ) ``` ## 分组统计 ```{r} #| results: false flights |> summarize( delay = mean(dep_delay, na.rm = TRUE), n = n(), .by = c(origin, dest) ) ``` ## 练习 ```{r} #| eval: false df <- tibble( x = 1:5, y = c("a", "b", "a", "a", "b"), z = c("K", "K", "L", "L", "K") ) df |> group_by(y) ``` ## 练习 ```{r} #| eval: false df |> arrange(y) ``` ## 练习 ```{r} #| eval: false df |> group_by(y) |> summarize(mean_x = mean(x)) ``` ## 练习 ```{r} #| eval: false df |> group_by(y, z) |> summarize(mean_x = mean(x)) ``` ## 练习 ```{r} #| eval: false df |> group_by(y, z) |> summarize(mean_x = mean(x), .groups = "drop") ``` ## 练习 ```{r} #| eval: false df |> group_by(y, z) |> summarize(mean_x = mean(x)) df |> group_by(y, z) |> mutate(mean_x = mean(x)) ``` ## 练习 ```{r} batters <- Lahman::Batting |> group_by(playerID) |> summarize( performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE), n = sum(AB, na.rm = TRUE) ) batters ``` ## 练习 ```{r} #| warning: false #| fig-alt: | #| A scatterplot of number of batting performance vs. batting opportunites #| overlaid with a smoothed line. Average performance increases sharply #| from 0.2 at when n is 1 to 0.25 when n is ~1000. Average performance #| continues to increase linearly at a much shallower slope reaching #| ~0.3 when n is ~15,000. batters |> filter(n > 100) |> ggplot(aes(x = n, y = performance)) + geom_point(alpha = 1 / 10) + geom_smooth(se = FALSE) ``` ## 排序 ```{r} batters |> arrange(desc(performance)) ``` ## 欢迎讨论!{.center} `r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/SD/")`