diff --git a/SD/2.1_codestyle/_extensions b/SD/2.1_codestyle/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/2.1_codestyle/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/2.1_codestyle/index.qmd b/SD/2.1_codestyle/index.qmd
deleted file mode 100644
index b6b2bd5..0000000
--- a/SD/2.1_codestyle/index.qmd
+++ /dev/null
@@ -1,162 +0,0 @@
----
-title: "代码编写规则"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-# source("../../coding/_common.R")
-library(tidyverse)
-library(nycflights13)
-```
-
-## tidy data
-
-```{r}
-knitr::include_graphics("../../image/tidy-1.png", dpi = 270)
-```
-
-
-## pipe(管道) |>
-
-```{r}
-#| eval: false
-require(patchwork)
-plot(1:10)
-1:10 |> plot()
-plot(x = 1:10, y = sin(1:10))
-1:10 |> plot(y = sin(1:10))
-
-```
-
-```{r}
-#| echo: false
-#| layout-nrow: 1
-#| fig-width: 4
-#| fig-height: 3
-#| out-height: 90%
-require(patchwork)
-plot(1:10)
-1:10 |> plot()
-plot(x = 1:10, y = sin(1:10))
-1:10 |> plot(y = sin(1:10))
-
-```
-
-
-
-## pipe(管道):%>%
-
-```{r}
-#| eval: false
-#| layout-nrow: 1
-#| fig-width: 3
-#| fig-height: 4
-#| out-height: 125%
-require(magrittr)
-1:10 %>% plot()
-1:10 %>% plot(y = sin(1:10))
-sin(1:10) %>% plot(1:10, .)
-sin(1:10) |> plot(x = 1:10, y = _)
-
-```
-
-```{r}
-#| echo: false
-#| layout-nrow: 1
-#| fig-width: 3
-#| fig-height: 4
-#| out-height: 125%
-require(magrittr)
-1:10 %>% plot()
-1:10 %>% plot(y = sin(1:10))
-sin(1:10) %>% plot(1:10, .)
-sin(1:10) |> plot(x = 1:10, y = _)
-
-```
-
-
-
-
-
-
-## 代码编写规则
-
-```{r}
-#| eval: false
-
-# Strive for:
-short_flights <- flights |> filter(air_time < 60)
-# Avoid:
-SHORTFLIGHTS <- flights |> filter(air_time < 60)
-
-# Strive for
-z <- (a + b)^2 / d
-# Avoid
-z<-( a + b ) ^ 2/d
-
-# Strive for
-mean(x, na.rm = TRUE)
-# Avoid
-mean (x ,na.rm=TRUE)
-```
-
-## 练习
-
-```{r}
-#| eval: false
-flights|>filter(dest=="IAH")|>
- group_by(year,month,day)|>summarize(n=n(),
-delay=mean(arr_delay,na.rm=TRUE))|>filter(n>10)
-
-```
-
-## 练习
-
-```{r}
-#| eval: false
-flights |>
- filter(dest == "IAH") |>
- group_by(year, month, day) |>
- summarize(n = n(),
- delay = mean(arr_delay, na.rm = TRUE)) |>
- filter(n > 10)
-
-```
-
-## quarto
-
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "../")`
-
diff --git a/SD/2.2_dataimport/_extensions b/SD/2.2_dataimport/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/2.2_dataimport/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/2.2_dataimport/index.qmd b/SD/2.2_dataimport/index.qmd
deleted file mode 100644
index 0882b47..0000000
--- a/SD/2.2_dataimport/index.qmd
+++ /dev/null
@@ -1,324 +0,0 @@
----
-title: "Data import"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-library(tidyverse)
-```
-
-
-## tidyverse风格数据分析总体流程
-
-
-
-
-
-## 导入csv数据
-
-```{r}
-read_lines("../../data/students.csv") |> cat(sep = "\n")
-```
-
-## 导入csv数据
-
-
-```{r}
-read_csv("../../data/students.csv") |>
- knitr::kable()
-```
-
-
-## 读取数据
-
-```{r}
-(students <- read_csv("../../data/students.csv"))
-```
-
-
-## 读取数据
-
-```{r}
-#| message: false
-(students <- read_csv("../../data/students.csv", na = c("N/A", "")))
-```
-
-## 列名不要有空格
-
-```{r}
-students |>
- rename(
- student_id = `Student ID`,
- full_name = `Full Name`
- )
-```
-
-## `janitor`处理空格
-
-```{r}
-#| message: false
-
-students |> janitor::clean_names()
-```
-
-## `janitor`处理空格
-
-```{r}
-students |>
- janitor::clean_names() |>
- mutate(meal_plan = factor(meal_plan))
-```
-
-## `janitor`处理空格
-
-```{r}
-students <- students |>
- janitor::clean_names() |>
- mutate(
- meal_plan = factor(meal_plan),
- age = parse_number(if_else(age == "five", "5", age))
- )
-students
-```
-
-
-## 直接录入
-
-```{r}
-#| message: false
-
-read_csv(
- "The first line of metadata
- The second line of metadata
- x,y,z
- 1,2,3",
- skip = 2
-)
-```
-
-
-## 直接录入
-
-```{r}
-#| message: false
-read_csv(
- "# A comment I want to skip
- x,y,z
- 1,2,3",
- comment = "#"
-)
-```
-
-
-## 指定列名
-
-```{r}
-#| message: false
-
-read_csv(
- "1,2,3
- 4,5,6",
- col_names = c("x", "y", "z")
-)
-```
-
-## 指定列的类型
-
-```{r}
-another_csv <- "
-x,y,z
-1,2,3"
-
-read_csv(
- another_csv,
- col_types = cols(.default = col_character())
-)
-read_csv(
- another_csv,
- col_types = cols_only(x = col_character())
-)
-```
-
-
-## 练习
-
-```{r}
-#| eval: false
-
-read_csv("a,b\n1,2,3\n4,5,6")
-read_csv("a,b,c\n1,2\n1,2,3,4")
-read_csv("a,b\n\"1")
-read_csv("a,b\n1,2\na,b")
-read_csv("a;b\n1;3")
-```
-
-## 练习
-
-```{r}
-#| eval: false
-annoying <- tibble(
- `1` = 1:10,
- `2` = `1` * 2 + rnorm(length(`1`))
-)
-```
-
-
-
-## 批量读取
-
-```{r}
-#| message: false
-
-sales_files <- c("../../data/01-sales.csv",
- "../../data/02-sales.csv",
- "../../data/03-sales.csv")
-read_csv(sales_files, id = "file")
-```
-
-
-
-
-## 读取Excel,建议用`readxl`包
-
-```{r}
-(surveydf <- readxl::read_xlsx("../../data/survey.xlsx"))
-```
-
-## 读取Excel
-
-```{r}
-(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2))
-```
-
-
-
-## 批量读取
-
-```{r}
-sales_files <- list.files("../../data",
- pattern = "sales\\.csv$", full.names = TRUE)
-sales_files
-```
-
-## 写入csv
-
-```{r}
-#| warning: false
-#| message: false
-students
-write_csv(students, "students-2.csv")
-read_csv("students-2.csv")
-```
-
-## 写入Excel
-
-```{r}
-writexl::write_xlsx(students, "../../data/writexldemo.xlsx")
-```
-
-## 读取数据库,以MySQL为例
-
-```{r}
-if (FALSE) {
- conn <- cctdb::get_dbconn("nationalairquality")
- DBI::dbListTables(conn)
-}
-```
-
-
-
-## 读取数据库,以MySQL为例
-
-```{r}
-if (FALSE) {
- conn <- cctdb::get_dbconn("nationalairquality")
- metadf <- tbl(conn, "metadf") |>
- head(100) |>
- collect()
- DBI::dbDisconnect(conn)
- saveRDS(metadf, file = "../../data/metadfdemo.RDS")
-}
-metadf <- readRDS(file = "../../data/metadfdemo.RDS")
-lang <- "cn"
-metadf |>
- ggplot(aes(lon, lat)) +
-geom_point(aes(fill = Area)) +
-dwfun::theme_sci()
-```
-
-
-## 练习
-
-```{r}
-#| eval: false
-metadf <- readxl::read_xlsx("../../data/airquality.xlsx")
-dir.create("../../data/metacity2/")
-metadf |>
- nest(sitedf = -site) |>
- mutate(flag = purrr::map2(site, sitedf,
- ~ writexl::write_xlsx(.y, paste0("../../data/metacity2/", .x, ".xlsx"))))
-```
-
-
-## 练习
-
-```{r}
-#| include: false
-#| eval: false
-if (FALSE) {
- require(tidyverse)
- conn <- cctdb::get_dbconn("nationalairquality")
- metadf <- tbl(conn, "metadf") |>
- collect()
- DBI::dbDisconnect(conn)
- metanestdf <- metadf |>
- nest(citydf = -Area)
- names(metanestdf$citydf) <- metanestdf$Area
- writexl::write_xlsx(metanestdf$citydf, path = "../../data/meta_city.xlsx")
- dir.create("../../data/metacity/")
- metanestdf |>
- mutate(flag = purrr::map2(Area, citydf,
- ~ writexl::write_xlsx(.y,
- path = paste0("../../data/metacity/", .x, ".xlsx")
- )))
-}
-```
-
-1. 从“../../data/sales.xlsx”读取第9到13行的数据
-2. 从“../../data/meta_city.xlsx”读取所有的数据,并保存至“../../data/meta_city_onetable1.xlsx”
-3. 从“../../data/metacity/”读取所有的数据,并保存至“../../data/meta_city_onetable2.xlsx”
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
diff --git a/SD/2.2_dataimport/students-2.csv b/SD/2.2_dataimport/students-2.csv
deleted file mode 100644
index 3775770..0000000
--- a/SD/2.2_dataimport/students-2.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-student_id,full_name,favourite_food,meal_plan,age
-1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
-2,Barclay Lynn,French fries,Lunch only,5
-3,Jayendra Lyne,NA,Breakfast and lunch,7
-4,Leon Rossini,Anchovies,Lunch only,NA
-5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,5
-6,Güvenç Attila,Ice cream,Lunch only,6
diff --git a/SD/2.3_datatransform/_extensions b/SD/2.3_datatransform/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/2.3_datatransform/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/2.3_datatransform/index.qmd b/SD/2.3_datatransform/index.qmd
deleted file mode 100644
index 095f0c1..0000000
--- a/SD/2.3_datatransform/index.qmd
+++ /dev/null
@@ -1,327 +0,0 @@
----
-title: "Data Transform"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-library(nycflights13)
-library(tidyverse)
-
-```
-
-
-## `tidyverse`风格数据分析总体流程
-
-
-
-
-## [dplyr cheatsheet](../../image/cheatsheet/data-transformation.pdf)
-
-
-```{r}
-#| echo: false
-dwfun::ggsavep("../../image/cheatsheet/data-transformation.svg", loadit = TRUE)
-```
-
-
-
-## 查看数据
-
-```{r}
-flights
-```
-
-
-## 选择列
-
-```{r}
-#| results: false
-
-flights |>
- select(year, month, day)
-```
-
-## 选择列
-
-```{r}
-#| results: false
-
-flights |>
- select(year:day)
-```
-
-
-## 选择列
-
-```{r}
-flights |>
-select(3:5)
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(!year:day)
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(-(year:day))
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(where(is.character))
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(!where(is.character)) |>
- select(contains("_"))
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(tail_num = tailnum)
-```
-
-## 选择列
-
-```{r}
-flights |>
- select(air_time, everything())
-```
-
-
-## 重命名
-
-```{r}
-flights |>
- rename(tail_num = tailnum)
-```
-
-## 重命名
-
-```{r}
-flights |>
- rename(年份 = 1) |>
- rename(月份 = 2)
-
-```
-
-## 重命名
-
-```{r}
-flights |> select(1:4) |> head(n = 3)
-# 重命名
-flights |> select(1:4) |> head(n = 3) |>
- rename_all(~c("c1", "c2", "c3", "c4"))
-```
-
-## 重命名
-
-```{r}
-flights |> select(1:4) |> head(n = 3)
-# 重命名
-flights |> select(1:4) |> head(n = 3) |>
- rename_all(toupper)
-```
-
-
-## 重命名
-
-```{r}
-flights |> select(1:4) |> head(n = 3)
-# 重命名
-flights |> select(1:4) |> head(n = 3) |>
- rename_all(~paste0(toupper(.), "_NEW"))
-```
-
-
-## 练习
-
-将含有下划线的列名中的下划线去掉。
-
-
-```{r}
-flights |> select(1:4) |> head(n = 3)
-```
-
-
-## 练习
-
-将`airqualitydf`中列名的单位信息去除(前5列)。
-
-
-```{r}
-airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2)
-airqualitydf |> select(1:5)
-```
-
-
-
-## `filter`
-
-```{r}
-flights |>
- filter(dep_delay > 120)
-```
-
-## filter 练习
-
-Flights that departed on January 1.
-
-```{r}
-#| echo: false
-flights |>
- filter(month == 1 & day == 1)
-```
-
-## filter 练习
-
-
-Select flights that departed in January or February
-
-```{r}
-#| echo: false
-flights |>
- filter(month %in% c(1, 2))
-```
-
-## filter 练习
-
-```{r}
-jan1 <- flights |>
- filter(month == 1 & day == 1)
-```
-
-## filter
-
-```{r}
-#| error: true
-#| eval: false
-
-flights |>
- filter(month = 1)
-```
-
-## filter
-
-```{r}
-flights |>
- filter(month == 1 | 2)
-```
-
-## 排序
-
-```{r}
-flights |>
- arrange(year, month, day, dep_time)
-```
-
-## 排序
-
-```{r}
-flights |>
- arrange(desc(dep_delay))
-```
-
-## slice
-
-```{r}
-flights |> head(n = 5)
-flights |> slice(1:5)
-```
-
-## slice
-
-```{r}
-flights |>
- slice_max(dep_delay, n = 5)
-```
-
-## slice
-
-```{r}
-flights |>
- slice_min(dep_delay, prop = 0.005)
-```
-
-## 排序练习
-
-根据`origin`、`dest`、`air_time`倒序排序。
-
-```{r}
-#| echo: false
-flights |>
- arrange(origin, dest, desc(air_time)) |>
- select(origin, dest, air_time, everything())
-```
-
-
-## 去重
-
-```{r}
-# Remove duplicate rows, if any
-flights |>
- distinct()
-```
-
-## 去重
-
-```{r}
-# Find all unique origin and destination pairs
-flights |>
- distinct(origin, dest)
-```
-
-## 去重
-
-```{r}
-flights |>
- distinct(origin, dest, .keep_all = TRUE)
-```
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
-
diff --git a/SD/3.1_datatransform/_extensions b/SD/3.1_datatransform/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/3.1_datatransform/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/3.1_datatransform/index.qmd b/SD/3.1_datatransform/index.qmd
deleted file mode 100644
index e970e8c..0000000
--- a/SD/3.1_datatransform/index.qmd
+++ /dev/null
@@ -1,949 +0,0 @@
----
-title: "Data Transform"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-library(nycflights13)
-library(tidyverse)
-```
-
-
-
-## 计数
-
-```{r}
-flights |>
- count(origin, dest, sort = TRUE)
-```
-
-
-## 计数-练习
-
-统计每月的航班数量。
-
-```{r}
-#| echo: false
-flights |>
- count(year, month, sort = TRUE)
-```
-
-
-## 计算新变量
-
-```{r}
-flights |>
- mutate(
- gain = dep_delay - arr_delay,
- speed = distance / air_time * 60
- )
-```
-
-## 计算新变量
-
-```{r}
-flights |>
- mutate(
- gain = dep_delay - arr_delay,
- speed = distance / air_time * 60,
- .before = 1
- )
-```
-
-## 计算新变量
-
-```{r}
-flights |>
- mutate(
- gain = dep_delay - arr_delay,
- speed = distance / air_time * 60,
- .after = day
- )
-```
-
-## 计算新变量
-
-```{r}
-flights |>
- mutate(
- gain = dep_delay - arr_delay,
- hours = air_time / 60,
- gain_per_hour = gain / hours,
- .keep = "used"
- )
-```
-
-
-
-
-## 列排序
-
-```{r}
-flights |>
- relocate(time_hour, air_time)
-```
-
-## 列排序
-
-```{r}
-#| results: false
-
-flights |>
- relocate(year:dep_time, .after = time_hour)
-flights |>
- relocate(starts_with("arr"), .before = dep_time)
-flights |>
- select(starts_with("arr"), everything())
-```
-
-
-## 练习
-
-计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
-
-```{r}
-flights |>
- filter(dest == "IAH") |>
- mutate(speed = distance / air_time * 60) |>
- select(year:day, dep_time, carrier, flight, speed) |>
- arrange(desc(speed))
-```
-
-## 练习
-
-计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
-
-```{r}
-#| results: false
-
-flights1 <- filter(flights, dest == "IAH")
-flights2 <- mutate(flights1, speed = distance / air_time * 60)
-flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
-arrange(flights3, desc(speed))
-```
-
-## 练习
-
-计算目的地为IAH,按飞行速度排序的表格,保留year:day, `dep_time`, carrier, flight与speed列。
-
-```{r}
-flights |>
- filter(dest == "IAH") |>
- mutate(speed = distance / air_time * 60) |>
- select(year:day, dep_time, carrier, flight, speed) |>
- arrange(desc(speed))
-```
-
-
-## 分组统计
-
-```{r}
-
-library(tidyverse)
-
-mtcars %>%
- group_by(cyl) %>%
- summarize(n = n())
-```
-
-## 分组统计
-
-```{r}
-flights |>
- group_by(month)
-```
-
-## 分组统计
-
-```{r}
-flights |>
- group_by(month) |>
- summarize(
- avg_delay = mean(dep_delay)
- )
-```
-
-## 分组统计
-
-```{r}
-flights |>
- group_by(month) |>
- summarize(
- avg_delay = mean(dep_delay, na.rm = TRUE)
- )
-```
-
-## 分组统计
-
-```{r}
-flights |>
- group_by(month) |>
- summarize(
- avg_delay = mean(dep_delay, na.rm = TRUE),
- n = n()
- )
-```
-
-## 分组统计
-
-```{r}
-flights |>
- group_by(dest) |>
- slice_max(arr_delay, n = 1) |>
- relocate(dest)
-```
-
-## 分组统计
-
-```{r}
-
-flights |>
- filter(dest == "IAH") |>
- group_by(year, month, day) |>
- summarize(
- arr_delay = mean(arr_delay, na.rm = TRUE)
- )
-```
-
-
-## 分组
-
-```{r}
-daily <- flights |>
- group_by(year, month, day)
-daily
-```
-
-## 分组统计
-
-```{r}
-daily_flights <- daily |>
- summarize(n = n())
-```
-
-## 分组统计
-
-```{r}
-#| results: false
-
-daily_flights <- daily |>
- summarize(
- n = n(),
- .groups = "drop_last"
- )
-```
-
-
-
-## 删除分组
-
-```{r}
-daily |> ungroup()
-```
-
-## 删除分组
-
-```{r}
-daily |>
- ungroup() |>
- summarize(
- avg_delay = mean(dep_delay, na.rm = TRUE),
- flights = n()
- )
-```
-
-## 分组统计
-
-```{r}
-flights |>
- summarize(
- delay = mean(dep_delay, na.rm = TRUE),
- n = n(),
- .by = month
- )
-```
-
-## 分组统计
-
-```{r}
-flights |>
- summarize(
- delay = mean(dep_delay, na.rm = TRUE),
- n = n(),
- .by = c(origin, dest)
- )
-```
-
-
-
-## 练习
-
-```{r}
-df <- tibble(
- x = 1:5,
- y = c("a", "b", "a", "a", "b"),
- z = c("K", "K", "L", "L", "K")
-)
-df
-```
-
-```{r}
-#| eval: false
-
-df |> arrange(y)
-```
-
-## 练习
-
-```{r}
-#| echo: false
-df
-```
-
-```{r}
-#| eval: false
-df |>
- group_by(y) |>
- summarize(mean_x = mean(x))
-```
-
-## 练习
-
-```{r}
-#| echo: false
-df
-```
-```{r}
-#| eval: false
-
-df |>
- group_by(y, z) |>
- summarize(mean_x = mean(x))
-```
-
-## 练习
-
-```{r}
-#| echo: false
-df
-```
-```{r}
-#| eval: false
-
-df |>
- group_by(y, z) |>
- summarize(mean_x = mean(x), .groups = "drop")
-```
-
-## 练习
-
-```{r}
-#| echo: false
-df
-```
-
-```{r}
-#| eval: false
-df |>
- group_by(y, z) |>
- summarize(mean_x = mean(x))
-
-df |>
- group_by(y, z) |>
- mutate(mean_x = mean(x))
-```
-
-
-
-
-## 练习
-
-- 计算不同采样点的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
-- 计算各小时全国的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
-- 计算不同采样点各小时的平均CO浓度、最大CO浓度、最小CO浓度、中位数CO浓度(`CO_mg/m3`)。
-- 计算各采样点中CO浓度小于全国平均CO浓度的占比。
-- 找出全国各采样点中CO浓度小于全国平均CO浓度的占比最高的10个采样点。
-
-```{r}
-airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
- sheet = 2)
-```
-
-
-## 练习
-
-按月统计dep_delay最大的3个航班的航班号(flight),用逗号连接。
-
-```{r}
-#| echo: false
-flights |>
- group_by(year, month) |>
- slice_max(dep_delay, n = 3) |>
- summarize(flight = paste(paste0(carrier, flight), collapse = ", ")) |>
-knitr::kable()
-```
-
-
-
-
-## 数据变形示意图
-
-```{r}
-billboard
-knitr::include_graphics("../../image/tidy-data/variables.png", dpi = 270)
-```
-
-## 数据变形
-
-```{r}
-billboard |>
- pivot_longer(
- cols = starts_with("wk"),
- names_to = "week",
- values_to = "rank",
- values_drop_na = TRUE
- )
-```
-
-## 数据变形
-
-```{r}
-billboard_longer <- billboard |>
- pivot_longer(
- cols = starts_with("wk"),
- names_to = "week",
- values_to = "rank",
- values_drop_na = TRUE
- ) |>
- mutate(
- week = parse_number(week)
- )
-billboard_longer
-```
-
-
-## 练习
-
-
-```{r}
-#| echo: false
-df <- tribble(
- ~id, ~bp1, ~bp2,
- "A", 100, 120,
- "B", 140, 115,
- "C", 120, 125
-)
-df
-```
-
-将以上数据(`df`)转换为如下形式。
-
-```{r}
-#| echo: false
-df |>
- pivot_longer(
- cols = bp1:bp2,
- names_to = "measurement",
- values_to = "value"
- )
-```
-
-## 练习
-
-请转换如下`iris`数据。
-
-```{r}
-#| echo: false
-as_tibble(head(iris, n = 3))
-cat("转为如下形式:")
-iris |>
- pivot_longer(cols = c(Sepal.Length,
- Sepal.Width,
- Petal.Length,
- Petal.Width),
- names_to = "flower_attr",
- values_to = "attr_value") |>
- head()
-```
-
-
-
-
-## 数据变形示意图2
-
-```{r}
-who2
-knitr::include_graphics("../../image/tidy-data/multiple-names.png", dpi = 270)
-```
-
-
-## 数据变形
-
-```{r}
-who2 |>
- pivot_longer(
- cols = !(country:year),
- names_to = c("diagnosis", "gender", "age"),
- names_sep = "_",
- values_to = "count"
- )
-```
-
-
-## 数据变形示意图
-
-```{r}
-household
-knitr::include_graphics("../../image/tidy-data/names-and-values.png", dpi = 270)
-```
-
-## 数据变形
-
-```{r}
-household |>
- pivot_longer(
- cols = !family,
- names_to = c(".value", "child"),
- names_sep = "_",
- values_drop_na = TRUE
- )
-```
-
-
-## 查看数据
-
-```{r}
-cms_patient_experience
-```
-
-## 查看数据
-
-```{r}
-cms_patient_experience |>
- distinct(measure_cd, measure_title)
-```
-
-## 数据变形(变宽)
-
-```{r}
-cms_patient_experience |>
- pivot_wider(
- names_from = measure_cd,
- values_from = prf_rate
- )
-```
-
-## 数据变形(变宽)
-
-```{r}
-cms_patient_experience |>
- pivot_wider(
- id_cols = starts_with("org"),
- names_from = measure_cd,
- values_from = prf_rate
- )
-```
-
-## 练习
-
-```{r}
-df <- tribble(
- ~id, ~measurement, ~value,
- "A", "bp1", 100,
- "B", "bp1", 140,
- "B", "bp2", 115,
- "A", "bp2", 120,
- "A", "bp3", 105
-)
-```
-
-
-变形成如下形式:
-
-
-```{r}
-#| echo: false
-df |>
- pivot_wider(
- names_from = measurement,
- values_from = value
- )
-```
-
-
-## 练习:变宽
-
-```{r}
-df <- tribble(
- ~id, ~measurement, ~value,
- "A", "bp1", 100,
- "A", "bp1", 102,
- "A", "bp2", 120,
- "B", "bp1", 140,
- "B", "bp2", 115
-)
-```
-
-## 练习
-
-```{r}
-df |>
- pivot_wider(
- names_from = measurement,
- values_from = value
- )
-```
-
-## 练习
-
-```{r}
-df |>
- group_by(id, measurement) |>
- summarize(n = n(), .groups = "drop") |>
- filter(n > 1)
-```
-
-
-## nest,套嵌数据框
-
-```{r}
-#| echo: false
-df <- tibble(x = c(1, 1, 1, 2, 2, 3), y = 1:6, z = 6:1)
-df
-```
-
-```{r}
-df %>% nest(data = c(y, z))
-```
-
-## nest,套嵌数据框
-
-Specify variables to nest by (rather than variables to nest) using `.by`
-
-```{r}
-df %>% nest(.by = x)
-```
-
-
-## nest,套嵌数据框
-
-In this case, since `...` isn't used you can specify the resulting column name with `.key`
-
-```{r}
-df %>% nest(.by = x, .key = "cols")
-```
-
-
-## nest,套嵌数据框
-
-Use tidyselect syntax and helpers, just like in `dplyr::select()`
-
-```{r}
-df %>% nest(data = any_of(c("y", "z")))
-```
-
-## nest,套嵌数据框
-
-`...` and `.by` can be used together to drop columns you no longer need,
-or to include the columns you are nesting by in the inner data frame too.
-This drops `z`:
-
-```{r}
-df %>% nest(data = y, .by = x)
-```
-
-## nest,套嵌数据框
-
-This includes `x` in the inner data frame:
-
-```{r}
-df %>% nest(data = everything(), .by = x)
-```
-
-## nest,套嵌数据框
-
-Multiple nesting structures can be specified at once
-
-```{r}
-iris %>%
- nest(petal = starts_with("Petal"), sepal = starts_with("Sepal"))
-```
-
-
-## nest,套嵌数据框
-
-```{r}
-iris %>%
- nest(width = contains("Width"), length = contains("Length"))
-```
-
-
-## nest,套嵌数据框
-
-Nesting a grouped data frame nests all variables apart from the group vars
-
-```{r}
-fish_encounters
-fish_encounters %>%
- dplyr::group_by(fish) %>%
- nest()
-```
-
-## nest,套嵌数据框
-
-That is similar to `nest(.by = )`, except here the result isn't grouped
-
-```{r}
-fish_encounters %>%
- nest(.by = fish)
-```
-
-## nest,套嵌数据框
-
-Nesting is often useful for creating per group models
-
-```{r}
-mtcars %>%
- nest(.by = cyl) %>%
- dplyr::mutate(models = lapply(data, function(df) lm(mpg ~ wt, data = df)))
-```
-
-
-## 练习
-
-```{r}
-#| echo: false
-(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
- sheet = 2))
-```
-
-```{r}
-airqualitydf
-airqualitynestdf <- airqualitydf |>
- nest(sitedf = -site)
-```
-
-## `nest`与`group_by`联用
-
-```{r}
-#| echo: false
-iris %>%
- group_by(Species) %>%
- nest(.key = "spdf")
-```
-
-## unnest
-
-```{r}
-airqualitynestdf |> unnest(sitedf)
-```
-
-
-
-## `purrr`包
-
-- map():依次应用一元函数到一个序列的每个元素上,基本等同 lapply()
-- map2():依次应用二元函数到两个序列的每对元素上
-- pmap():应用多元函数到多个序列的每组元素上,可以实现对数据框逐行迭代
-- map 系列默认返回列表型,可根据想要的返回类型添加后缀:_int, _dbl, _lgl, _chr, _df, 甚至可以接着对返回的数据框df做行/列合并:_dfr, _dfc
-- 如果只想要函数依次作用的过程,而不需要返回结果,改用 walk 系列即可
-- 所应用的函数,有 purrr公式风格简写(匿名函数),支持一元,二元,多元函数
-- purrr 包中的其它有用函数
-
-## `purrr`包
-
-- `map_chr(.x, .f)`: 返回字符型向量
-- `map_lgl(.x, .f)`: 返回逻辑型向量
-- `map_dbl(.x, .f)`: 返回实数型向量
-- `map_int(.x, .f)`: 返回整数型向量
-- `map_dfr(.x, .f)`: 返回数据框列表,再 bind_rows 按行合并为一个数据框
-- `map_dfc(.x, .f)`: 返回数据框列表,再 bind_cols 按列合并为一个数据框
-
-
-## `purrr`包-cheatsheet
-
-```{r}
-dwfun::ggsavep("../../image/cheatsheet/purrr.svg", loadit = TRUE)
-```
-
-[purrr](../../image/cheatsheet/purrr.pdf)
-
-
-## `purrr`包
-
-生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。
-
-```{r}
-res <- list()
-for (i in 1:10) {
- res[[i]] <- tibble(随机数 = rnorm(n = 100, mean = i, sd = 1))
-}
-res
-
-```
-
-
-
-
-## `purrr`包
-
-生成从1到10的10组随机数,每组随机数个数为100,均值依次为1到10,标准差为1,并存储在数据框中。
-
-```{r}
-1:10 |>
-purrr::map(~tibble(随机数 = rnorm(n = 100, mean = .x, sd = 1)))
-```
-
-## purrr
-
-```{r}
-library(purrr)
-mtcars |>
- split(mtcars$cyl) |> # from base R
- map(\(df) lm(mpg ~ wt, data = df)) |>
- map(summary) %>%
- map_dbl("r.squared")
-```
-
-
-## 练习:
-
-计算每月最后一个周六的航班数:
-
-```{r}
-flights
-```
-
-## `tidyr` + `purrr`包
-
-任务:展示不同城市间的大气指标散点图
-
-```{r}
-(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx",
- sheet = 2))
-```
-
-## join
-
-Perform left join
-
-```{r}
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-left_join(df1, df2, by = "id")
-```
-
-## left join
-
-Create sample data frames with non-matching rows
-
-```{r}
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-left_join(df2, df1, by = "id")
-```
-
-## left join
-
-
-Create sample data frames with multiple columns
-
-```{r}
-df1 <- data.frame(id1 = c(1, 2, 3), id2 = c("A", "B", "C"), value1 = letters[1:3])
-df2 <- data.frame(id1 = c(2, 3, 4), id2 = c("B", "C", "D"), value2 = LETTERS[1:3])
-# Perform left join
-left_join(df1, df2, by = c("id1", "id2"))
-```
-
-## right join
-
-
-```{r}
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-
-# Perform right join
-right_join(df1, df2, by = "id")
-```
-
-
-## inner join
-
-```{r}
-# Create sample data frames
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-# Perform inner join
-inner_join(df1, df2, by = "id")
-```
-
-
-## full join
-
-```{r}
-# Create sample data frames
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-# Perform inner join
-full_join(df1, df2, by = "id")
-```
-
-## semi join
-
-Create sample data frames
-
-```{r}
-(df1 <- data.frame(id = 1:5, value1 = letters[1:5]))
-(df2 <- data.frame(id = c(2, 4, 6), value2 = LETTERS[1:3]))
-# Perform semi join
-semi_join(df1, df2, by = "id")
-```
-
-
-## 练习
-
-合并`airquality.xlsx`中的数据。
-
-
-
-## 练习
-
-统计各城市白天与晚上的大气质量差异,计算不同指标差异最大的10个城市。
-
-
-
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
-
diff --git a/SD/3.9_课后作业8/.RData b/SD/3.9_课后作业8/.RData
deleted file mode 100644
index 6f7dafd..0000000
Binary files a/SD/3.9_课后作业8/.RData and /dev/null differ
diff --git a/SD/3.9_课后作业8/_extensions b/SD/3.9_课后作业8/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/3.9_课后作业8/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/3.9_课后作业8/airqualitydf.RDS b/SD/3.9_课后作业8/airqualitydf.RDS
deleted file mode 100644
index 5d6d693..0000000
Binary files a/SD/3.9_课后作业8/airqualitydf.RDS and /dev/null differ
diff --git a/SD/3.9_课后作业8/airqualitymedianoutrow5.pdf b/SD/3.9_课后作业8/airqualitymedianoutrow5.pdf
deleted file mode 100644
index face837..0000000
Binary files a/SD/3.9_课后作业8/airqualitymedianoutrow5.pdf and /dev/null differ
diff --git a/SD/3.9_课后作业8/freq.pdf b/SD/3.9_课后作业8/freq.pdf
deleted file mode 100644
index 446ee8c..0000000
Binary files a/SD/3.9_课后作业8/freq.pdf and /dev/null differ
diff --git a/SD/3.9_课后作业8/index.qmd b/SD/3.9_课后作业8/index.qmd
deleted file mode 100644
index ffd30e0..0000000
--- a/SD/3.9_课后作业8/index.qmd
+++ /dev/null
@@ -1,66 +0,0 @@
----
-title: "课后作业8"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-resources:
- - "*.pdf"
- - "*.sas"
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-```{r}
-#| include: false
-#| cache: false
-lang <- "cn"
-require(tidyverse)
-require(learnr)
-```
-
-## 第8次课后作业
-
-1. 根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。
-2. 按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。
-
-作业模板:[第8次课后作业_模板.qmd](https://git.drwater.net/course/RWEP/raw/branch/main/SD/20240328_9_课后作业/第8次课后作业_模板.qmd)
-
-## 示例代码
-
-### 基于R的示例结果
-
-- [第8次课后作业R示例代码结果](./第8次课后作业_模板.html)
-
-### 基于SAS的示例结果
-
-- [第8次课后作业SAS示例代码](./第8次课后作业_模板.sas)
-- [第8次课后作业SAS示例结果1](./median.pdf)
-- [第8次课后作业SAS示例结果2](./freq.pdf)
-- [第8次课后作业SAS示例结果3](./airqualitymedianoutrow5.pdf)
-- [第8次课后作业SAS示例结果4](./npar1wayConover.pdf)
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
-
diff --git a/SD/3.9_课后作业8/median.pdf b/SD/3.9_课后作业8/median.pdf
deleted file mode 100644
index eafe394..0000000
Binary files a/SD/3.9_课后作业8/median.pdf and /dev/null differ
diff --git a/SD/3.9_课后作业8/metadf.RDS b/SD/3.9_课后作业8/metadf.RDS
deleted file mode 100644
index 454bc5f..0000000
Binary files a/SD/3.9_课后作业8/metadf.RDS and /dev/null differ
diff --git a/SD/3.9_课后作业8/npar1wayConover.pdf b/SD/3.9_课后作业8/npar1wayConover.pdf
deleted file mode 100644
index 2cc2998..0000000
Binary files a/SD/3.9_课后作业8/npar1wayConover.pdf and /dev/null differ
diff --git a/SD/3.9_课后作业8/testdf.RDS b/SD/3.9_课后作业8/testdf.RDS
deleted file mode 100644
index a21548f..0000000
Binary files a/SD/3.9_课后作业8/testdf.RDS and /dev/null differ
diff --git a/SD/3.9_课后作业8/第8次课后作业_模板.qmd b/SD/3.9_课后作业8/第8次课后作业_模板.qmd
deleted file mode 100644
index a6b54d6..0000000
--- a/SD/3.9_课后作业8/第8次课后作业_模板.qmd
+++ /dev/null
@@ -1,152 +0,0 @@
----
-title: 课后作业8
-author: 姓名
-format: html
----
-
-# 数据
-
-下载airquality.xlsx,并读取数据。
-
-```{r}
-#| message: false
-#| warning: false
-# 下载至临时文件
-if (FALSE) {
- tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
- download.file("https://git.drwater.net/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
- destfile = tmpxlsxpath)
- airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
- metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
- saveRDS(airqualitydf, "./airqualitydf.RDS")
- saveRDS(metadf, "./metadf.RDS")
-}
-airqualitydf <- readRDS("./airqualitydf.RDS")
-metadf <- readRDS("./metadf.RDS")
-```
-
-# 描述统计
-
-根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。
-
-```{r}
-#| message: false
-#| warning: false
-require(tidyverse)
-airqualitydf |>
- select(datetime, site, AQI) |>
- filter(!is.na(AQI)) |>
- group_by(site) |>
- summarize(AQI.median = median(AQI, na.rm = TRUE)) |>
- left_join(metadf |> select(site, city = Area)) |>
- group_by(city) |>
- filter(n() > 5) |>
- summarize(p = sum(AQI.median < quantile(airqualitydf$AQI, probs = 0.5, na.rm = TRUE)) / n()) |>
- top_n(10, p)
-
-
-airqualitydf |>
- select(datetime, site, AQI) |>
- filter(!is.na(AQI)) |>
- group_by(site) |>
- summarize(AQI.median = median(AQI, na.rm = TRUE))
-
-airqualitydf |>
- select(datetime, site, AQI) |>
- filter(!is.na(AQI)) |>
- left_join(metadf |> select(site, city = Area)) |>
- group_by(city) |>
- filter(length(unique(site)) >= 5) |>
- summarize(p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2,
- na.rm = TRUE)) / n()) |>
- slice_max(p, n = 10) |>
-knitr::kable()
-
-
-```
-
-
-# 统计检验
-
-按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。
-
-```{r}
-#| message: false
-#| warning: false
-
-if (FALSE) {
- require(infer)
- require(tidyverse)
- testdf <- airqualitydf |>
- select(datetime, site, AQI) |>
- filter(!is.na(AQI)) |>
- left_join(metadf |> select(site, city = Area)) |>
- group_by(city) |>
- filter(length(unique(site)) >= 5) |>
- mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"),
- levels = c("day", "night"))
- ) |>
- group_by(city) |>
- nest(citydf = -city) |>
- mutate(median_diff = purrr::map_dbl(citydf, ~
- .x |>
- specify(AQI ~ dayornight) |>
- calculate(stat = "diff in medians", order = c("day", "night")) |>
- pull(stat)
- )) |>
- ungroup() |>
- # slice_sample(n = 12) |>
- mutate(null_dist = purrr::map(citydf, ~
- .x |>
- specify(AQI ~ dayornight) |>
- hypothesize(null = "independence") |>
- generate(reps = 1000, type = "permute") |>
- calculate(stat = "diff in medians", order = c("day", "night"))
- )) |>
- mutate(p_value = purrr::map2_dbl(null_dist, median_diff,
- ~ get_p_value(.x, obs_stat = .y, direction = "both") |>
- pull(p_value)
- )) |>
- mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |>
- mutate(fig = purrr::pmap(list(null_dist, median_diff, city, sigdiff),
- ~ visualize(..1) +
- shade_p_value(obs_stat = ..2, direction = "both") +
- ggtitle(paste0(..3, ":", ..4)) +
- theme_sci(2, 2)
- )) |>
- arrange(p_value)
- saveRDS(testdf, "./testdf.RDS")
-}
-
-if (FALSE) {
-
-lang <- "cn"
-require(dwfun)
-require(rmdify)
-require(drwateR)
-dwfun::init()
-rmdify::rmd_init()
-
-testdf <- readRDS("./testdf.RDS")
-require(tidyverse)
-testdf |>
- select(city, median_diff, p_value, sigdiff) |>
- knitr::kable()
-testdf |>
- mutate(grp = (row_number() - 1)%/% 12) |>
- group_by(grp) |>
- nest(grpdf = -grp) |>
- ungroup() |>
-# slice(1) |>
- mutate(gp = purrr::map(grpdf,
- ~(.x |>
- pull(fig)) |>
- patchwork::wrap_plots(ncol = 3) +
- dwfun::theme_sci(5, 7))) |>
- pull(gp)
-
-
-}
-
-```
-
diff --git a/SD/3.9_课后作业8/第8次课后作业_模板.sas b/SD/3.9_课后作业8/第8次课后作业_模板.sas
deleted file mode 100644
index 9cdaf41..0000000
--- a/SD/3.9_课后作业8/第8次课后作业_模板.sas
+++ /dev/null
@@ -1,289 +0,0 @@
-options ls=256 ps=32767 nodate validmemname=extend validvarname=any;
-
-title 'The SAS System';
-
-%macro print(d);
- proc print data=&d;run;
- %mend;
-%macro printobs(d,obs);
- proc print data=&d (obs=&obs);run;
- %mend;
-%macro printfirstobsobs(d,firstobs,obs);
- proc print data=&d (firstobs=&firstobs obs=&obs);run;
- %mend;
-%macro contents(d);
- proc contents data=&d varnum;run;
- %mend;
-%macro contentsshort(d);
- proc contents data=&d varnum short;run;
- %mend;
-%macro save_dataset(d);
- data "d:&d";
- set &d;
- run;
- %mend;
-%macro load_dataset(d);
- data &d;
- set "d:&d";
- run;
- %mend;
-%macro kill;
- PROC DATASETS LIB=work KILL;RUN;quit;
- %mend;
-proc template;
- list styles;
-run;
-
-
-
-%kill;
-PROC IMPORT OUT=WORK.raw_metadf
- DATAFILE="d:airquality.xlsx"
- DBMS=EXCEL REPLACE;
- RANGE="metadf$";
- GETNAMES=YES;
- MIXED=YES;
- SCANTEXT=YES;
- USEDATE=NO;
- SCANTIME=NO;
-RUN;
- %print(raw_metadf);
- %save_dataset(raw_metadf); *ԭʼݼ;
-
-PROC IMPORT OUT=WORK.raw_airqualitydf
- DATAFILE="d:airquality.xlsx"
- DBMS=EXCEL REPLACE;
- RANGE="airqualitydf$";
- GETNAMES=YES;
- MIXED=YES;
- SCANTEXT=YES;
- USEDATE=NO; *ΪYES;
- SCANTIME=NO; *ΪYES;
-RUN;
- %print(raw_airqualitydf);
- %save_dataset(raw_airqualitydf); *ԭʼݼ;
-
-
-%kill;
-/*Ӳԭʼݼ*/
-%load_dataset(raw_metadf);
-%load_dataset(raw_airqualitydf);
-/*鿴ݼ*/
-%contents(raw_metadf);
-%contentsshort(raw_metadf);
-/*site name Area lon lat*/
-%contents(raw_airqualitydf);
-%contentsshort(raw_airqualitydf);
-/*datetime site 'CO_mg/m3'n 'CO_24h_mg/m3'n 'NO2_g/m3'n 'NO2_24h_g/m3'n 'O3_g/m3'n 'O3_24h_g/m3'n 'O3_8h_g/m3'n 'O3_8h_24h_g/m3'n 'PM10_g/m3'n 'PM10_24h_g/m3'n 'PM2#5_g/m3'n 'PM2#5_24h_g/m3'n 'SO2_g/m3'n 'SO2_24h_g/m3'n AQI PrimaryPollutant Quality Unheathful*/
-
-%printobs(raw_metadf,10);
-%printobs(raw_airqualitydf,10);
-proc sort data=raw_metadf out=metadfsorted;
- by site;
-run;
-proc sort data=raw_airqualitydf out=airqualitydfsorted;
- by site;
-run;
-/*ϲݼԤȡڡʱ䲿֣daynight*/
-data airquality;
- retain datetime date time DayNight site name Area AQI lon lat;
- length DayNight $ 5;
- merge metadfsorted airqualitydfsorted;
- by site;
- date=datepart(datetime);
- time=timepart(datetime);
- if '8:00't<=time<'20:00't then DayNight='day';
- else DayNight='night';
- format datetime e8601dt25. date yymmdd10. time time5.;
- keep site name Area lon lat datetime date time DayNight AQI;
-run;
- %printobs(airquality,100);
- %save_dataset(airquality); *ϲݼ;
-
-
-
-%kill;
-/*#################### DATA SET airquality ####################*/
-/*Ӳ̺ϲݼ*/
-%load_dataset(airquality);
-%printobs(airquality,100);
-
-/*sitenameǷһ£ֲһ£siteͳ*/
-proc sql;
- select count(distinct(site)) as count_site from airquality;
- select count(distinct(name)) as count_name from airquality;
-quit;
-/*
-count_site
-1714
-count_name
-1522
-*/
-
-
-/*#########################################################################*/
-/*ͳư죨8:00-20:00ҹ20:00-8:00пָAQIλ*/
-/*#########################################################################*/
-/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
-ods pdf file='d:means.pdf' style=sapphire dpi=1200;
-proc means data=airquality median maxdec=1;
- class site DayNight;
- var AQI;
- where AQI is not missing;
-run;
-ods pdf close;
-/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/
-
-
-/*####################################################################################################*/
-/*ͳƵвAQI30%λֵIJռȣгռߵ10УDz5ijУ*/
-/*####################################################################################################*/
-/*鿴вAQI30%λֵΪSQL֤ȥ*/
-proc univariate data=airquality noprint;
- var AQI;
- output out=airqualitystats pctlpts=30 pctlpre=P;
-run;
- %print(airqualitystats); /*42*/
-
-/*вAQI30%λֵ*/
-proc sql;
- select AQI into : xvalues separated by ',' from airquality;
- select distinct(pctl(30, &xvalues)) into : P30 from airquality;
-quit;
-/*鿴вAQI30%λֵĺֵΪ*/
- %put P30=&P30.;
-
-/*вAQI30%λֵAQIּԺϲݼвֱӷּøλͳƣȥ*/
-data airquality1;
- set airquality;
- if AQI<&P30. then quality='good';
- else quality='fair';
-run;
- %printobs(airquality1,100);
-
-/*вAQIλ*/
-proc means data=airquality median maxdec=1;
- class Area site;
- var AQI;
- where AQI is not missing;
- output out=airqualitymedian median=;
-run;
- %print(airqualitymedian);
-
-/*вAQI30%λֵAQIλּ*/
-data airqualitymedian1;
- set airqualitymedian;
- if AQI<&P30. then quality='good';
- else quality='fair';
- where _TYPE_=3;
-run;
- %print(airqualitymedian1);
-
-/*ͳƵвAQI30%λֵIJռȣ鿴*/
-/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
-ods pdf file='d:freq.pdf' style=sapphire dpi=1200;
-proc freq data=airqualitymedian1;
- table Area*quality /nocol nopercent;
-run;
-ods pdf close;
-/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/
-/*ͳƵвAQI30%λֵIJռȣƵͳƽݼ*/
-proc freq data=airqualitymedian1;
- table Area*quality /outpct out=airqualitymedianoutrow(drop=percent pct_col); *Ƶаٷֱ;
-run;
- %printobs(airqualitymedianoutrow,100);
-
-/*жԲͳƣ鿴ȥ*/
-proc means data=airqualitymedianoutrow sum maxdec=0;
- class Area;
- var COUNT;
-run;
-/*5ijУ鿴ȥ*/
-proc sql;
- select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
-quit;
-/*5ijУݼ*/
-proc sql;
- create table airqualitymedianoutrow5 as select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc;
-quit;
-/*гռߵ10У5ijУ*/
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
-ods pdf file='d:airqualitymedianoutrow5.pdf' style=sapphire dpi=1200;
- %printobs(airqualitymedianoutrow5,10);
- %printobs(airqualitymedianoutrow5,20);
- %printobs(airqualitymedianoutrow5,30);
- %print(airqualitymedianoutrow5);
-ods pdf close;
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/
-
-
-/*#####################################################*/
-/*ղͬз飬ͳưҹAQIλǷ*/
-/*#####################################################*/
-/*еsiteûAreaӦ*/
-proc sql;
- select distinct(Area),count(distinct(Area)) as count_Area from airquality;
-quit;
-proc print data=airquality;
- where Area is missing;
-run;
-proc sql;
- select distinct(site),count(distinct(site)) as count_site from airquality where Area is missing;
-quit;
-/*4site㣩ûAreaУӦ
-site count_site
-2628A 4
-3128A 4
-4034A 4
-4036A 4
-*/
-proc sort data=airquality out=airqualitysorted;
- by Area;
-run;
-/*ղͬз飬ͳưҹAQIλ鿴ȥ*/
-proc means data=airqualitysorted median maxdec=1;
- by Area;
- class DayNight;
- var AQI;
- where AQI is not missing;
-run;
-/*ͳؿҹAQIλǷ*/
-proc npar1way data=airquality median;
- class DayNight;
- var AQI;
-run;
-/*ղͬз飬ͳưҹAQIλǷ*/
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
-ods pdf file='d:npar1waymedian.pdf' style=sapphire dpi=1200;
-proc npar1way data=airqualitysorted median;
- class DayNight;
- var AQI;
- by Area;
- where Area is not missing;
-run;
-ods pdf close;
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
-/*
-Using Wilcoxon scores in the linear rank statistic for two-sample data produces the rank sum statistic of the Mann-Whitney-Wilcoxon test.
-Using Wilcoxon scores in the one-way ANOVA statistic produces the Kruskal-Wallis test.
-Wilcoxon scores are locally most powerful for location shifts of a logistic distribution.
-*//*
-Using median scores in the linear rank statistic for two-sample data produces the two-sample median test.
-The one-way ANOVA statistic with median scores is equivalent to the Brown-Mood test.
-Median scores are particularly powerful for distributions that are symmetric and heavy-tailed.*/
-
-/*
-Scores for Linear Rank and One-Way ANOVA Tests
-For each score type that you specify, PROC NPAR1WAY computes a one-way ANOVA statistic and also a linear rank statistic for two-sample data. The following score types are used primarily to test for differences in location: Wilcoxon, median, Van der Waerden (normal), and Savage. The following scores types are used to test for scale differences: Siegel-Tukey, Ansari-Bradley, Klotz, and Mood. Conover scores can be used to test for differences in both location and scale. This section gives formulas for the score types available in PROC NPAR1WAY. For further information about the formulas and the applicability of each score, see Randles and Wolfe (1979), Gibbons and Chakraborti (2010), Conover (1999), and Hollander and Wolfe (1999).
-In addition to the score types described in this section, you can specify the SCORES=DATA option to use the input data observations as scores. This enables you to produce a wide variety of tests. You can construct any scores by using the DATA step, and then you can use PROC NPAR1WAY to compute the corresponding linear rank and one-way ANOVA tests for these scores. You can also analyze raw (unscored) data by using the SCORES=DATA option; for two-sample data, the corresponding exact test is a permutation test that is known as Pitmans test.
-*/
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
-ods pdf file='d:npar1wayConover.pdf' style=sapphire dpi=1200;
-proc npar1way data=airqualitysorted Conover;
- class DayNight;
- var AQI;
- by Area;
- where Area is not missing;
-run;
-/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/
-/*Conover scores can be used to test for differences in both location and scale.*/
diff --git a/SD/4.1_datavisualize/_extensions b/SD/4.1_datavisualize/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/4.1_datavisualize/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/4.1_datavisualize/index.qmd b/SD/4.1_datavisualize/index.qmd
deleted file mode 100644
index e203666..0000000
--- a/SD/4.1_datavisualize/index.qmd
+++ /dev/null
@@ -1,3938 +0,0 @@
----
-title: "数据可视化"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
-knitr:
- opts_chunk:
- dev: "svg"
- retina: 3
-execute:
- freeze: auto
- cache: true
- echo: true
- fig-width: 5
- fig-height: 6
----
-
-```{r}
-#| include: false
-#| cache: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-require(learnr)
-library(tidyverse)
-library(palmerpenguins)
-library(ggthemes)
-```
-
-
-
-## {background-image="../../img/concepts/tidyverse-packages-ggplot.png" background-position="center" background-size="100%"}
-
-
-
-## The ggplot2 Package
-
-
-
-... is an **R package to visualize data** created by Hadley Wickham in 2005
-
-```{r}
-#| label: ggplot-package-install-2
-#| eval: false
-# install.packages("ggplot2")
-library(ggplot2)
-```
-
-
-
-::: fragment
-... is part of the [`{tidyverse}`](https://www.tidyverse.org/)
-
-```{r}
-#| label: tidyverse-package-install-2
-#| eval: false
-# install.packages("tidyverse")
-library(tidyverse)
-```
-:::
-
-# The Grammar of {ggplot2}
-
-
-
-## The Grammar of {ggplot2}
-
-
-
-
- Component |
- Function |
- Explanation |
-
-
- Data |
- ggplot(data) |
- *The raw data that you want to visualise.* |
-
-
- Aesthetics |
- aes() |
- *Aesthetic mappings between variables and visual properties.* |
-
- Geometries |
- geom_*() |
- *The geometric shapes representing the data.* |
-
-
-
-
-
-## The Grammar of {ggplot2}
-
-
-
-
-
- Component |
- Function |
- Explanation |
-
-
- Data |
- ggplot(data) |
- *The raw data that you want to visualise.* |
-
-
- Aesthetics |
- aes() |
- *Aesthetic mappings between variables and visual properties.* |
-
- Geometries |
- geom_*() |
- *The geometric shapes representing the data.* |
-
-
- Statistics |
- stat_*() |
- *The statistical transformations applied to the data.* |
-
-
- Scales |
- scale_*() |
- *Maps between the data and the aesthetic dimensions.* |
-
-
- Coordinate System |
- coord_*() |
- *Maps data into the plane of the data rectangle.* |
-
-
- Facets |
- facet_*() |
- *The arrangement of the data into a grid of plots.* |
-
-
- Visual Themes |
- theme() / theme_*() |
- *The overall visual defaults of a plot.* |
-
-
-
-
-
-## The Data
-
-Bike sharing counts in London, UK, powered by [TfL Open Data](https://tfl.gov.uk/modes/cycling/santander-cycles)
-
-::: incremental
-- covers the years 2015 and 2016
-- incl. weather data acquired from [freemeteo.com](https://freemeteo.com)
-- prepared by Hristo Mavrodiev for [Kaggle](https://www.kaggle.com/hmavrodiev/london-bike-sharing-dataset)
-:::
-
-
-
-::: fragment
-```{r}
-#| label: data-import
-bikes <- readr::read_csv("../../data/ggplot2/london-bikes-custom.csv",
- ## or: "https://raw.githubusercontent.com/z3tt/graphic-design-ggplot2/main/data/london-bikes-custom.csv"
- col_types = "Dcfffilllddddc"
-)
-
-bikes$season <- forcats::fct_inorder(bikes$season)
-```
-:::
-
-------------------------------------------------------------------------
-
-```{r}
-#| label: data-table
-#| echo: false
-#| purl: false
-library(tidyverse)
-tibble(
- Variable = names(bikes),
- Description = c(
- "Date encoded as `YYYY-MM-DD`", "`day` (6:00am–5:59pm) or `night` (6:00pm–5:59am)", "`2015` or `2016`", "`1` (January) to `12` (December)", "`winter`, `spring`, `summer`, or `autumn`", "Sum of reported bikes rented", "`TRUE` being Monday to Friday and no bank holiday", "`TRUE` being Saturday or Sunday", "`TRUE` being a bank holiday in the UK", "Average air temperature (°C)", "Average feels like temperature (°C)", "Average air humidity (%)", "Average wind speed (km/h)", "Most common weather type"
- ),
- Class = c(
- "date", "character", "factor", "factor", "factor", "integer", "logical", "logical", "logical", "double", "double", "double", "double", "character"
- )
- ) %>%
- kableExtra::kbl(
- booktabs = TRUE, longtable = TRUE
- ) %>%
- kableExtra::kable_styling(
- font_size = 20
- ) %>%
- kableExtra::kable_minimal(
- "hover", full_width = TRUE, position = "left", html_font = "Spline Sans Mono"
- )
-```
-
-## `ggplot2::ggplot()`
-
-```{r}
-#| label: ggplot-function
-#| eval: false
-#| echo: false
-#?ggplot
-```
-
-{fig-alt="The help page of the ggplot() function." fig-width="175%"}
-
-## Data
-
-```{r}
-#| label: setup-ggplot-slides
-#| include: false
-#| purl: false
-library(ggplot2)
-theme_set(theme_grey(base_size = 14))
-```
-
-```{r}
-#| label: ggplot-data
-#| output-location: column
-ggplot(data = bikes)
-```
-
-## Aesthetic Mapping(视觉映射):`aes(.)`
-
-
-
-= link variables to graphical properties
-
-::: incremental
-- positions (`x`, `y`)
-- colors (`color`, `fill`)
-- shapes (`shape`, `linetype`)
-- size (`size`)
-- transparency (`alpha`)
-- groupings (`group`)
-:::
-
-## Aesthetic Mapping(视觉映射):`aes(.)`
-
-```{r}
-#| label: ggplot-aesthetics-outside
-#| output-location: column
-#| code-line-numbers: "2|1,2"
-ggplot(data = bikes) +
- aes(x = temp_feel, y = count)
-```
-
-## aesthetics
-
-`aes()` outside as component
-
-```{r}
-#| label: ggplot-aesthetics-outside-comp
-#| eval: false
-ggplot(data = bikes) +
- aes(x = temp_feel, y = count)
-```
-
-
-
-::: fragment
-`aes()` inside, explicit matching
-
-```{r}
-#| label: ggplot-aesthetics-inside
-#| eval: false
-ggplot(data = bikes, mapping = aes(x = temp_feel, y = count))
-```
-
-
-:::
-
-::: fragment
-`aes()` inside, implicit matching
-
-```{r}
-#| label: ggplot-aesthetics-inside-implicit
-#| eval: false
-ggplot(bikes, aes(temp_feel, count))
-```
-
-
-:::
-
-::: fragment
-`aes()` inside, mixed matching
-
-```{r}
-#| label: ggplot-aesthetics-inside-mix
-#| eval: false
-ggplot(bikes, aes(x = temp_feel, y = count))
-```
-:::
-
-# Geometrical Layers
-
-## Geometries(几何图层):geom_*
-
-
-
-= interpret aesthetics as graphical representations
-
-::: incremental
-- points
-- lines
-- polygons
-- text labels
-- ...
-:::
-
-## Geometries(几何图层):geom_*
-
-```{r}
-#| label: geom-point
-#| output-location: column
-#| code-line-numbers: "1,2,3,4|5"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point()
-```
-
-## Visual Properties of Layers(图层属性)
-
-```{r}
-#| label: geom-point-properties
-#| output-location: column
-#| code-line-numbers: "5,6,7,8,9,10,11|6,7,8,9,10"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- color = "#28a87d",
- alpha = .5,
- shape = "X",
- stroke = 1,
- size = 4
- )
-```
-
-## Setting vs Mapping of Visual Properties
-
-::: {layout-ncol="2"}
-```{r}
-#| label: geom-point-properties-set
-#| fig-height: 3.5
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- color = "#28a87d",
- alpha = .5
- )
-```
-
-::: fragment
-```{r}
-#| label: geom-point-properties-map
-#| fig-height: 3.5
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- aes(color = season),
- alpha = .5
- )
-```
-:::
-:::
-
-## Mapping Expressions
-
-```{r}
-#| label: geom-point-aes-expression
-#| output-location: column
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- aes(color = temp_feel > 20),
- alpha = .5
- )
-```
-
-## Filter Data
-
-```{r}
-#| label: geom-point-aes-expression-exercise-na
-#| output-location: column
-#| code-line-numbers: "2"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = temp, y = temp_feel)
- ) +
- geom_point(
- aes(color = weather_type == "clear",
- size = count),
- shape = 18,
- alpha = .5
- )
-```
-
-## Filter Data
-
-```{r}
-#| label: geom-point-aes-expression-exercise-na-pipe
-#| output-location: column
-#| code-line-numbers: "2"
-ggplot(
- bikes %>% filter(!is.na(weather_type)),
- aes(x = temp, y = temp_feel)
- ) +
- geom_point(
- aes(color = weather_type == "clear",
- size = count),
- shape = 18,
- alpha = .5
- )
-```
-
-```{r}
-#| label: reset-theme
-#| include: false
-#| purl: false
-theme_set(theme_grey(base_size = 14))
-```
-
-## Local vs. Global(应用至当前图层或所有图层)
-
-::: {layout-ncol="2"}
-```{r}
-#| label: geom-point-aes-geom
-#| code-line-numbers: "3,6"
-#| fig-height: 3.2
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- aes(color = season),
- alpha = .5
- )
-```
-
-::: fragment
-```{r}
-#| label: geom-point-aes-global
-#| code-line-numbers: "3,4"
-#| fig-height: 3.2
-ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season)
- ) +
- geom_point(
- alpha = .5
- )
-```
-:::
-:::
-
-## Adding More Layers
-
-```{r}
-#| label: geom-smooth
-#| output-location: column
-#| code-line-numbers: "9,10,11"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season)
- ) +
- geom_point(
- alpha = .5
- ) +
- geom_smooth(
- method = "lm"
- )
-```
-
-## Global Color Encoding
-
-```{r}
-#| label: geom-smooth-aes-global
-#| output-location: column
-#| code-line-numbers: "3,4,9,10,11"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season)
- ) +
- geom_point(
- alpha = .5
- ) +
- geom_smooth(
- method = "lm"
- )
-```
-
-## Local Color Encoding
-
-```{r}
-#| label: geom-smooth-aes-fixed
-#| output-location: column
-#| code-line-numbers: "6,9,10,11"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- aes(color = season),
- alpha = .5
- ) +
- geom_smooth(
- method = "lm"
- )
-```
-
-## The \`group\` Aesthetic
-
-```{r}
-#| label: geom-smooth-aes-grouped
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point(
- aes(color = season),
- alpha = .5
- ) +
- geom_smooth(
- aes(group = day_night),
- method = "lm"
- )
-```
-
-## Set Both as Global Aesthetics
-
-```{r}
-#| label: geom-smooth-aes-global-grouped
-#| output-location: column
-#| code-line-numbers: "4,5"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season,
- group = day_night)
- ) +
- geom_point(
- alpha = .5
- ) +
- geom_smooth(
- method = "lm"
- )
-```
-
-## Overwrite Global Aesthetics
-
-```{r}
-#| label: geom-smooth-aes-global-grouped-overwrite
-#| output-location: column
-#| code-line-numbers: "4,12"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season,
- group = day_night)
- ) +
- geom_point(
- alpha = .5
- ) +
- geom_smooth(
- method = "lm",
- color = "black"
- )
-```
-
-
-
-# Statistical Layers
-
-
-## \`stat_\*()\` and \`geom_\*()\`
-
-::: {layout-ncol="2"}
-```{r}
-#| label: stat-geom
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = temp_feel, y = count)) +
- stat_smooth(geom = "smooth")
-```
-
-```{r}
-#| label: geom-stat
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = temp_feel, y = count)) +
- geom_smooth(stat = "smooth")
-```
-:::
-
-
-## \`stat_\*()\` and \`geom_\*()\`
-
-::: {layout-ncol="2"}
-```{r}
-#| label: stat-geom-2
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = season)) +
- stat_count(geom = "bar")
-```
-
-```{r}
-#| label: geom-stat-2
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = season)) +
- geom_bar(stat = "count")
-```
-:::
-
-
-## \`stat_\*()\` and \`geom_\*()\`
-
-::: {layout-ncol="2"}
-```{r}
-#| label: stat-geom-3
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = date, y = temp_feel)) +
- stat_identity(geom = "point")
-```
-
-```{r}
-#| label: geom-stat-3
-#| fig-height: 5.1
-#| code-line-numbers: "2"
-ggplot(bikes, aes(x = date, y = temp_feel)) +
- geom_point(stat = "identity")
-```
-:::
-
-## Statistical Summaries
-
-```{r}
-#| label: stat-summary
-#| output-location: column
-#| code-line-numbers: "5|3"
-ggplot(
- bikes,
- aes(x = season, y = temp_feel)
- ) +
- stat_summary()
-```
-
-
-## Statistical Summaries
-
-```{r}
-#| label: stat-summary-defaults
-#| output-location: column
-#| code-line-numbers: "6,7"
-ggplot(
- bikes,
- aes(x = season, y = temp_feel)
- ) +
- stat_summary(
- fun.data = mean_se, ## the default
- geom = "pointrange" ## the default
- )
-```
-
-
-## Statistical Summaries
-
-```{r}
-#| label: stat-summary-median
-#| output-location: column
-#| code-line-numbers: "5|5,6,11|6,7,8,9,10,11|7,8"
-ggplot(
- bikes,
- aes(x = season, y = temp_feel)
- ) +
- geom_boxplot() +
- stat_summary(
- fun = mean,
- geom = "point",
- color = "#28a87d",
- size = 3
- )
-```
-
-
-## Statistical Summaries
-
-```{r}
-#| label: stat-summary-custom
-#| output-location: column
-#| code-line-numbers: "5,6,7,8,9|7,8"
-ggplot(
- bikes,
- aes(x = season, y = temp_feel)
- ) +
- stat_summary(
- fun = mean,
- fun.max = function(y) mean(y) + sd(y),
- fun.min = function(y) mean(y) - sd(y)
- )
-```
-
-
-
-# Extending a ggplot
-
-## Store a ggplot as Object
-
-```{r}
-#| label: ggplot-object
-#| code-line-numbers: "1,16"
-g <-
- ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season,
- group = day_night)
- ) +
- geom_point(
- alpha = .5
- ) +
- geom_smooth(
- method = "lm",
- color = "black"
- )
-
-class(g)
-```
-
-## Inspect a ggplot Object
-
-```{r}
-#| label: ggplot-object-data
-g$data
-```
-
-## Inspect a ggplot Object
-
-```{r}
-#| label: ggplot-object-mapping
-g$mapping
-```
-
-## Extend a ggplot Object: Add Layers
-
-```{r}
-#| label: ggplot-object-extend-geom
-#| output-location: column
-g +
- geom_rug(
- alpha = .2
- )
-```
-
-## Remove a Layer from the Legend
-
-```{r}
-#| label: geom-guide-none
-#| output-location: column
-#| code-line-numbers: "4"
-g +
- geom_rug(
- alpha = .2,
- show.legend = FALSE
- )
-```
-
-## Extend a ggplot Object: Add Labels
-
-```{r}
-#| label: ggplot-labs-individual
-#| output-location: column
-#| code-line-numbers: "2,3,4"
-g +
- xlab("Feels-like temperature (°F)") +
- ylab("Reported bike shares") +
- ggtitle("TfL bike sharing trends")
-```
-
-## Extend a ggplot Object: Add Labels
-
-```{r}
-#| label: ggplot-labs-bundled
-#| output-location: column
-#| code-line-numbers: "2,3,4,5,6"
-g +
- labs(
- x = "Feels-like temperature (°F)",
- y = "Reported bike shares",
- title = "TfL bike sharing trends"
- )
-```
-
-## Extend a ggplot Object: Add Labels
-
-```{r}
-#| label: ggplot-labs-bundled-color
-#| output-location: column
-#| code-line-numbers: "6"
-g <- g +
- labs(
- x = "Feels-like temperature (°F)",
- y = "Reported bike shares",
- title = "TfL bike sharing trends",
- color = "Season:"
- )
-
-g
-```
-
-## Extend a ggplot Object: Add Labels
-
-```{r}
-#| label: ggplot-labs-bundled-extended
-#| output-location: column
-#| code-line-numbers: "6,7,9"
-g +
- labs(
- x = "Feels-like temperature (°F)",
- y = "Reported bike shares",
- title = "TfL bike sharing trends",
- subtitle = "Reported bike rents versus feels-like temperature in London",
- caption = "Data: TfL",
- color = "Season:",
- tag = "Fig. 1"
- )
-```
-
-## Extend a ggplot Object: Add Labels
-
-::: {layout-ncol="2"}
-```{r}
-#| label: ggplot-labs-empty-vs-null-A
-#| fig-height: 3.6
-#| code-line-numbers: "3"
-g +
- labs(
- x = "",
- caption = "Data: TfL"
- )
-```
-
-```{r}
-#| label: ggplot-labs-empty-vs-null-B
-#| fig-height: 3.6
-#| code-line-numbers: "3"
-g +
- labs(
- x = NULL,
- caption = "Data: TfL"
- )
-```
-:::
-
-## Extend a ggplot Object: Themes
-
-::: {layout-ncol="2"}
-```{r}
-#| label: ggplot-object-extend-theme-light
-#| fig-height: 5.5
-g + theme_light()
-```
-
-::: fragment
-```{r}
-#| label: ggplot-object-extend-theme-minimal
-#| fig-height: 5.5
-g + theme_minimal()
-```
-:::
-:::
-
-## Change the Theme Base Settings
-
-```{r}
-#| label: ggplot-theme-extend-theme-base
-#| output-location: column
-#| code-line-numbers: "2,3|1,2,3,4"
-g + theme_light(
- base_size = 14
-)
-```
-
-## Set a Theme Globally
-
-```{r}
-#| label: ggplot-theme-global
-#| output-location: column
-theme_set(theme_light())
-
-g
-```
-
-## Change the Theme Base Settings
-
-```{r}
-#| label: ggplot-theme-global-base
-#| output-location: column
-#| code-line-numbers: "2,3|1,2,3,4"
-theme_set(theme_light(
- base_size = 14
-))
-
-g
-```
-
-
-## Overwrite Specific Theme Settings
-
-```{r}
-#| label: ggplot-theme-settings-individual-1
-#| output-location: column
-#| code-line-numbers: "2|3"
-g +
- theme(
- panel.grid.minor = element_blank()
- )
-```
-
-## Overwrite Specific Theme Settings
-
-```{r}
-#| label: ggplot-theme-settings-individual-2
-#| output-location: column
-#| code-line-numbers: "4"
-g +
- theme(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold")
- )
-```
-
-## Overwrite Specific Theme Settings
-
-```{r}
-#| label: ggplot-theme-settings-individual-3
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- theme(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "top"
- )
-```
-
-## Overwrite Specific Theme Settings
-
-```{r}
-#| label: ggplot-theme-settings-individual-legend-none
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- theme(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "none"
- )
-```
-
-## Overwrite Specific Theme Settings
-
-```{r}
-#| label: ggplot-theme-settings-individual-4
-#| output-location: column
-#| code-line-numbers: "6|2,3,4,6,7"
-g +
- theme(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "top",
- plot.title.position = "plot"
- )
-```
-
-## Overwrite Theme Settings Globally
-
-```{r}
-#| label: ggplot-theme-settings-global
-#| output-location: column
-#| code-line-numbers: "1|2,3,4,5|1,2,3,4,5,6"
-theme_update(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "top",
- plot.title.position = "plot"
-)
-
-g
-```
-
-## Save the Graphic
-
-```{r}
-#| label: ggplot-save
-#| eval: false
-ggsave(g, filename = "my_plot.png")
-```
-
-::: fragment
-```{r}
-#| label: ggplot-save-implicit
-#| eval: false
-ggsave("my_plot.png")
-```
-:::
-
-::: fragment
-```{r}
-#| label: ggplot-save-aspect
-#| eval: false
-ggsave("my_plot.png", width = 8, height = 5, dpi = 600)
-```
-:::
-
-::: fragment
-```{r}
-#| label: ggplot-save-vector
-#| eval: false
-ggsave("my_plot.pdf", width = 20, height = 12, unit = "cm", device = cairo_pdf)
-```
-:::
-
-::: fragment
-```{r}
-#| label: ggplot-save-cairo_pdf
-#| eval: false
-grDevices::cairo_pdf("my_plot.pdf", width = 10, height = 7)
-g
-dev.off()
-```
-:::
-
-------------------------------------------------------------------------
-
-
-
-{fig-alt="A comparison of vector and raster graphics." fig-width="150%"}
-
-# Facets(面)
-
-## Facets(面)
-
-
-
-= split variables to multiple panels
-
-::: fragment
-Facets are also known as:
-
-- small multiples
-- trellis graphs
-- lattice plots
-- conditioning
-:::
-
-------------------------------------------------------------------------
-
-::: {layout-ncol="2"}
-```{r}
-#| label: facet-types-wrap
-#| echo: false
-#| purl: false
-ggplot(bikes, aes(x = 1, y = 1)) +
- geom_text(
- aes(label = paste0("Subset for\n", stringr::str_to_title(season))),
- size = 5, family = "Cabinet Grotesk", lineheight = .9
- ) +
- facet_wrap(~stringr::str_to_title(season)) +
- ggtitle("facet_wrap()") +
- theme_bw(base_size = 24) +
- theme(
- plot.title = element_text(hjust = .5, family = "Tabular", face = "bold"),
- strip.text = element_text(face = "bold", size = 18),
- panel.grid = element_blank(),
- axis.ticks = element_blank(),
- axis.text = element_blank(),
- axis.title = element_blank(),
- plot.background = element_rect(color = "#f8f8f8", fill = "#f8f8f8"),
- plot.margin = margin(t = 3, r = 25)
- )
-```
-
-::: fragment
-```{r}
-#| label: facet-types-grid
-#| echo: false
-#| purl: false
-data <- tibble(
- x = 1, y = 1,
- day_night = c("Day", "Day", "Night", "Night"),
- year = factor(c("2015", "2016", "2015", "2016"), levels = levels(bikes$year)),
- label = c("Subset for\nDay × 2015", "Subset for\nDay × 2016",
- "Subset for\nNight × 2015", "Subset for\nNight × 2016")
-)
-
-ggplot(data, aes(x = 1, y = 1)) +
- geom_text(
- aes(label = label),
- size = 5, family = "Cabinet Grotesk", lineheight = .9
- ) +
- facet_grid(day_night ~ year) +
- ggtitle("facet_grid()") +
- theme_bw(base_size = 24) +
- theme(
- plot.title = element_text(hjust = .5, family = "Tabular", face = "bold"),
- strip.text = element_text(face = "bold", size = 18),
- panel.grid = element_blank(),
- axis.ticks = element_blank(),
- axis.text = element_blank(),
- axis.title = element_blank(),
- plot.background = element_rect(color = "#f8f8f8", fill = "#f8f8f8"),
- plot.margin = margin(t = 3, l = 25)
- )
-```
-:::
-:::
-
-## Setup
-
-```{r}
-#| label: theme-size-facets
-#| include: false
-#| purl: false
-theme_set(theme_light(base_size = 12))
-
-theme_update(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "top",
- plot.title.position = "plot"
-)
-```
-
-```{r}
-#| label: facet-setup
-#| output-location: column
-#| code-line-numbers: "1,2,3,4,5,6,7,8,9,10|12"
-g <-
- ggplot(
- bikes,
- aes(x = temp_feel, y = count,
- color = season)
- ) +
- geom_point(
- alpha = .3,
- guide = "none"
- )
-
-g
-```
-
-## Wrapped Facets
-
-```{r}
-#| label: facet-wrap
-#| output-location: column
-#| code-line-numbers: "1,2,3,4|2,4|3"
-g +
- facet_wrap(
- vars(day_night)
- )
-```
-
-## Wrapped Facets
-
-```{r}
-#| label: facet-wrap-circumflex
-#| output-location: column
-#| code-line-numbers: "3"
-g +
- facet_wrap(
- ~ day_night
- )
-```
-
-## Facet Multiple Variables
-
-```{r}
-#| label: facet-wrap-multiple
-#| output-location: column
-#| code-line-numbers: "3"
-g +
- facet_wrap(
- ~ is_workday + day_night
- )
-```
-
-## Facet Options: Cols + Rows
-
-```{r}
-#| label: facet-wrap-options-ncol
-#| output-location: column
-#| code-line-numbers: "4"
-g +
- facet_wrap(
- ~ day_night,
- ncol = 1
- )
-```
-
-## Facet Options: Free Scaling
-
-```{r}
-#| label: facet-wrap-options-scales
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- facet_wrap(
- ~ day_night,
- ncol = 1,
- scales = "free"
- )
-```
-
-## Facet Options: Free Scaling
-
-```{r}
-#| label: facet-wrap-options-freey
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- facet_wrap(
- ~ day_night,
- ncol = 1,
- scales = "free_y"
- )
-```
-
-## Facet Options: Switch Labels
-
-```{r}
-#| label: facet-wrap-options-switch
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- facet_wrap(
- ~ day_night,
- ncol = 1,
- switch = "x"
- )
-```
-
-## Gridded Facets
-
-```{r}
-#| label: facet-grid
-#| output-location: column
-#| code-line-numbers: "2,5|3,4"
-g +
- facet_grid(
- rows = vars(day_night),
- cols = vars(is_workday)
- )
-```
-
-## Gridded Facets
-
-```{r}
-#| label: facet-grid-circumflex
-#| output-location: column
-#| code-line-numbers: "3"
-g +
- facet_grid(
- day_night ~ is_workday
- )
-```
-
-## Facet Multiple Variables
-
-```{r}
-#| label: facet-grid-multiple
-#| output-location: column
-#| code-line-numbers: "3"
-g +
- facet_grid(
- day_night ~ is_workday + season
- )
-```
-
-## Facet Options: Free Scaling
-
-```{r}
-#| label: facet-grid-options-scales
-#| output-location: column
-#| code-line-numbers: "4"
-g +
- facet_grid(
- day_night ~ is_workday,
- scales = "free"
- )
-```
-
-## Facet Options: Switch Labels
-
-```{r}
-#| label: facet-grid-options-switch
-#| output-location: column
-#| code-line-numbers: "5"
-g +
- facet_grid(
- day_night ~ is_workday,
- scales = "free",
- switch = "y"
- )
-```
-
-## Facet Options: Proportional Spacing
-
-```{r}
-#| label: facet-grid-options-space
-#| output-location: column
-#| code-line-numbers: "4,5|5"
-g +
- facet_grid(
- day_night ~ is_workday,
- scales = "free",
- space = "free"
- )
-```
-
-## Facet Options: Proportional Spacing
-
-```{r}
-#| label: facet-grid-options-space-y
-#| output-location: column
-#| code-line-numbers: "4,5"
-g +
- facet_grid(
- day_night ~ is_workday,
- scales = "free_y",
- space = "free_y"
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-start
-#| output-location: column
-#| code-line-numbers: "1,2,3,4,5,6,7,8,9,10,11,12|8,9,10"
-ggplot(
- diamonds,
- aes(x = carat, y = price)
- ) +
- geom_point(
- alpha = .3
- ) +
- geom_smooth(
- method = "lm",
- se = FALSE,
- color = "dodgerblue"
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet
-#| output-location: column
-#| code-line-numbers: "13,14,15,16,17"
-ggplot(
- diamonds,
- aes(x = carat, y = price)
- ) +
- geom_point(
- alpha = .3
- ) +
- geom_smooth(
- method = "lm",
- se = FALSE,
- color = "dodgerblue"
- ) +
- facet_grid(
- cut ~ clarity,
- space = "free_x",
- scales = "free_x"
- )
-```
-
-## Diamonds Facet (Dark Theme Bonus)
-
-```{r}
-#| label: diamonds-facet-dark
-#| output-location: column
-#| code-line-numbers: "19,20,21,22"
-ggplot(
- diamonds,
- aes(x = carat, y = price)
- ) +
- geom_point(
- alpha = .3,
- color = "white"
- ) +
- geom_smooth(
- method = "lm",
- se = FALSE,
- color = "dodgerblue"
- ) +
- facet_grid(
- cut ~ clarity,
- space = "free_x",
- scales = "free_x"
- ) +
- theme_dark(
- base_size = 14
- )
-```
-
-# Scales(尺度)
-
-```{r}
-#| label: theme-size-reset
-#| include: false
-#| purl: false
-theme_set(theme_light(base_size = 14))
-
-theme_update(
- panel.grid.minor = element_blank(),
- plot.title = element_text(face = "bold"),
- legend.position = "top",
- plot.title.position = "plot"
-)
-```
-
-## Scales
-
-
-
-= translate between variable ranges and property ranges
-
-::: incremental
-- feels-like temperature ⇄ x
-- reported bike shares ⇄ y
-- season ⇄ color
-- year ⇄ shape
-- ...
-:::
-
-## Scales
-
-The `scale_*()` components control the properties of all the
aesthetic dimensions mapped to the data.
-
-
Consequently, there are `scale_*()` functions for all aesthetics such as:
-
-- **positions** via `scale_x_*()` and `scale_y_*()`
-
-- **colors** via `scale_color_*()` and `scale_fill_*()`
-
-- **sizes** via `scale_size_*()` and `scale_radius_*()`
-
-- **shapes** via `scale_shape_*()` and `scale_linetype_*()`
-
-- **transparency** via `scale_alpha_*()`
-
-## Scales
-
-The `scale_*()` components control the properties of all the
aesthetic dimensions mapped to the data.
-
-
The extensions (`*`) can be filled by e.g.:
-
-- `continuous()`, `discrete()`, `reverse()`, `log10()`, `sqrt()`, `date()` for positions
-
-- `continuous()`, `discrete()`, `manual()`, `gradient()`, `gradient2()`, `brewer()` for colors
-
-- `continuous()`, `discrete()`, `manual()`, `ordinal()`, `area()`, `date()` for sizes
-
-- `continuous()`, `discrete()`, `manual()`, `ordinal()` for shapes
-
-- `continuous()`, `discrete()`, `manual()`, `ordinal()`, `date()` for transparency
-
-------------------------------------------------------------------------
-
-](../../img/concepts/continuous_discrete.png){fig-size="120%" fig-align="center" fig-alt="Allison Horsts illustration ofthe correct use of continuous versus discrete; however, in {ggplot2} these are interpeted in a different way: as quantitative and qualitative."}
-
-## Continuous vs. Discrete in {ggplot2}
-
-::: {layout-ncol="2"}
-## Continuous:
quantitative or numerical data
-
-- height
-- weight
-- age
-- counts
-
-## Discrete:
qualitative or categorical data
-
-- species
-- sex
-- study sites
-- age group
-:::
-
-## Continuous vs. Discrete in {ggplot2}
-
-::: {layout-ncol="2"}
-## Continuous:
quantitative or numerical data
-
-- height (continuous)
-- weight (continuous)
-- age (continuous or discrete)
-- counts (discrete)
-
-## Discrete:
qualitative or categorical data
-
-- species (nominal)
-- sex (nominal)
-- study site (nominal or ordinal)
-- age group (ordinal)
-:::
-
-## Aesthetics + Scales
-
-```{r}
-#| label: scales-default-invisible
-#| output-location: column
-#| code-line-numbers: "3,4"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point()
-```
-
-## Aesthetics + Scales
-
-```{r}
-#| label: scales-default
-#| output-location: column
-#| code-line-numbers: "3,4,7,8,9|7,8,9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_date() +
- scale_y_continuous() +
- scale_color_discrete()
-```
-
-## Scales
-
-```{r}
-#| label: scales-overwrite-1
-#| output-location: column
-#| code-line-numbers: "7"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_continuous() +
- scale_y_continuous() +
- scale_color_discrete()
-```
-
-## Scales
-
-```{r}
-#| label: scales-overwrite-2
-#| output-location: column
-#| code-line-numbers: "8"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_continuous() +
- scale_y_log10() +
- scale_color_discrete()
-```
-
-## Scales
-
-```{r}
-#| label: scales-overwrite-3
-#| output-location: column
-#| code-line-numbers: "9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_continuous() +
- scale_y_log10() +
- scale_color_viridis_d()
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-trans
-#| output-location: column
-#| code-line-numbers: "8,9,10|9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- trans = "log10"
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-name
-#| output-location: column
-#| code-line-numbers: "7,8,9|8"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares"
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-breaks-seq
-#| output-location: column
-#| code-line-numbers: "9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = seq(0, 60000, by = 15000)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-breaks-short
-#| output-location: column
-#| code-line-numbers: "9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = 0:4*15000
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-breaks-irregular
-#| output-location: column
-#| code-line-numbers: "9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = c(0, 2:12*2500, 40000, 50000)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-labels
-#| output-location: column
-#| code-line-numbers: "8,10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares in thousands",
- breaks = 0:4*15000,
- labels = 0:4*15
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-labels-paste
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares in thousands",
- breaks = 0:4*15000,
- labels = paste(0:4*15000, "bikes")
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-limits
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = 0:4*15000,
- limits = c(NA, 60000)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-expand.no
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = 0:4*15000,
- expand = c(0, 0)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-expand
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = -1:5*15000,
- expand = c(.5, .5) ## c(add, mult)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-expand-add-explicit
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = -1:5*15000,
- expand = expansion(add = 2000)
- )
-```
-
-## \`scale_x\|y_continuous\`
-
-```{r}
-#| label: scales-xy-continuous-guide-none
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_y_continuous(
- name = "Reported bike shares",
- breaks = 0:4*15000,
- guide = "none"
- )
-```
-
-## \`scale_x\|y_date\`
-
-```{r}
-#| label: scales-xy-date-breaks-months
-#| output-location: column
-#| code-line-numbers: "7,10|7,8,9,10|9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_date(
- name = NULL,
- date_breaks = "4 months"
- )
-```
-
-## \`scale_x\|y_date\`
-
-```{r}
-#| label: scales-xy-date-breaks-weeks
-#| output-location: column
-#| code-line-numbers: "9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_date(
- name = NULL,
- date_breaks = "20 weeks"
- )
-```
-
-## \`scale_x\|y_date\` with \`strftime()\`
-
-```{r}
-#| label: scales-xy-date-labels
-#| output-location: column
-#| code-line-numbers: "9,10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_date(
- name = NULL,
- date_breaks = "6 months",
- date_labels = "%Y/%m/%d"
- )
-```
-
-## \`scale_x\|y_date\` with \`strftime()\`
-
-```{r}
-#| label: scales-xy-date-labels-special
-#| output-location: column
-#| code-line-numbers: "10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_x_date(
- name = NULL,
- date_breaks = "6 months",
- date_labels = "%b '%y"
- )
-```
-
-## \`scale_x\|y_discrete\`
-
-```{r}
-#| label: scales-xy-discrete
-#| output-location: column
-#| code-line-numbers: "3,6,9|6,7,8,9|7,8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- scale_x_discrete(
- name = "Period",
- labels = c("Dec-Feb", "Mar-May", "Jun-Aug", "Sep-Nov")
- )
-```
-
-## \`scale_x\|y_discrete\`
-
-```{r}
-#| label: scales-xy-discrete-expand
-#| output-location: column
-#| code-line-numbers: "8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- scale_x_discrete(
- name = "Season",
- expand = c(.5, 0) ## add, mult
- )
-```
-
-## Discrete or Continuous?
-
-```{r}
-#| label: scales-xy-fake-discrete-visible
-#| output-location: column
-#| code-line-numbers: "3,5,6,7"
-ggplot(
- bikes,
- aes(x = as.numeric(season), y = count)
- ) +
- geom_boxplot(
- aes(group = season)
- )
-```
-
-## Discrete or Continuous?
-
-```{r}
-#| label: scales-xy-fake-discrete
-#| output-location: column
-#| code-line-numbers: "9,10,11,12,13|11|12"
-ggplot(
- bikes,
- aes(x = as.numeric(season),
- y = count)
- ) +
- geom_boxplot(
- aes(group = season)
- ) +
- scale_x_continuous(
- name = "Season",
- breaks = 1:4,
- labels = levels(bikes$season)
- )
-```
-
-## Discrete or Continuous?
-
-```{r}
-#| label: scales-xy-fake-discrete-shift
-#| output-location: column
-#| code-line-numbers: "3,4"
-ggplot(
- bikes,
- aes(x = as.numeric(season) +
- as.numeric(season) / 8,
- y = count)
- ) +
- geom_boxplot(
- aes(group = season)
- ) +
- scale_x_continuous(
- name = "Season",
- breaks = 1:4,
- labels = levels(bikes$season)
- )
-```
-
-## \`scale_color\|fill_discrete\`
-
-```{r}
-#| label: scales-color-discrete-type-vector
-#| output-location: column
-#| code-line-numbers: "7,10|7,8,9,10|8,9"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_color_discrete(
- name = "Season:",
- type = c("#69b0d4", "#00CB79", "#F7B01B", "#a78f5f")
- )
-```
-
-## Inspect Assigned Colors
-
-```{r}
-#| label: scales-color-discrete-type-inspect
-#| output-location: column
-#| code-line-numbers: "1|12|14"
-g <- ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_color_discrete(
- name = "Season:",
- type = c("#3ca7d9", "#1ec99b", "#F7B01B", "#bb7e8f")
- )
-
-gb <- ggplot_build(g)
-
-gb$data[[1]][c(1:5, 200:205, 400:405), 1:5]
-```
-
-## \`scale_color\|fill_discrete\`
-
-```{r}
-#| label: scales-color-discrete-type-vector-named
-#| output-location: column
-#| code-line-numbers: "1,2,3,4,5,6|1,16"
-my_colors <- c(
- `winter` = "#3c89d9",
- `spring` = "#1ec99b",
- `summer` = "#F7B01B",
- `autumn` = "#a26e7c"
-)
-
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_color_discrete(
- name = "Season:",
- type = my_colors
- )
-```
-
-## \`scale_color\|fill_discrete\`
-
-```{r}
-#| label: scales-color-discrete-type-vector-named-shuffled
-#| output-location: column
-#| code-line-numbers: "2,5|1,16"
-my_colors_alphabetical <- c(
- `autumn` = "#a26e7c",
- `spring` = "#1ec99b",
- `summer` = "#F7B01B",
- `winter` = "#3c89d9"
-)
-
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_color_discrete(
- name = "Season:",
- type = my_colors_alphabetical
- )
-```
-
-## \`scale_color\|fill_discrete\`
-
-```{r}
-#| label: scales-color-discrete-type-palette
-#| output-location: column
-#| code-line-numbers: "1|11,12,13"
-library(RColorBrewer)
-
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = season)
- ) +
- geom_point() +
- scale_color_discrete(
- name = "Season:",
- type = brewer.pal(
- n = 4, name = "Dark2"
- )
- )
-```
-
-## \`scale_color\|fill_manual\`
-
-```{r}
-#| label: scales-color-manual-na
-#| output-location: column
-#| code-line-numbers: "4,9,10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = weather_type)
- ) +
- geom_point() +
- scale_color_manual(
- name = "Season:",
- values = brewer.pal(n = 6, name = "Pastel1"),
- na.value = "black"
- )
-```
-
-## \`scale_color\|fill_carto_d\`
-
-```{r}
-#| label: scales-color-discrete-carto
-#| output-location: column
-#| code-line-numbers: "7,8,9,10"
-ggplot(
- bikes,
- aes(x = date, y = count,
- color = weather_type)
- ) +
- geom_point() +
- rcartocolor::scale_color_carto_d(
- name = "Season:",
- palette = "Pastel",
- na.value = "black"
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-store
-#| output-location: column
-#| code-line-numbers: "1|10|20"
-facet <-
- ggplot(
- diamonds,
- aes(x = carat, y = price)
- ) +
- geom_point(
- alpha = .3
- ) +
- geom_smooth(
- aes(color = cut),
- method = "lm",
- se = FALSE
- ) +
- facet_grid(
- cut ~ clarity,
- space = "free_x",
- scales = "free_x"
- )
-
-facet
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-xy
-#| output-location: column
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = c("$0", "$10,000", "$20,000", "$30,000")
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-y-paste-format
-#| output-location: column
-#| code-line-numbers: "8,9,10,11,12,13,14"
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = paste0(
- "$", format(
- 0:3*10000,
- big.mark = ",",
- trim = TRUE
- )
- )
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-y-function
-#| output-location: column
-#| code-line-numbers: "8,9,10,11,12,13"
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = function(y) paste0(
- "$", format(
- y, big.mark = ",",
- trim = TRUE
- )
- )
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-y-dollar-format
-#| output-location: column
-#| code-line-numbers: "8"
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = scales::dollar_format()
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-color
-#| output-location: column
-#| code-line-numbers: "10,11,12,13"
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = scales::dollar_format()
- ) +
- scale_color_brewer(
- palette = "Set2",
- guide = "none"
- )
-```
-
-## Diamonds Facet
-
-```{r}
-#| label: diamonds-facet-scales-no-legend
-#| output-location: column
-#| code-line-numbers: "13,14,15"
-facet +
- scale_x_continuous(
- breaks = 0:5
- ) +
- scale_y_continuous(
- limits = c(0, 30000),
- breaks = 0:3*10000,
- labels = scales::dollar_format()
- ) +
- scale_color_brewer(
- palette = "Set2"
- ) +
- theme(
- legend.position = "none"
- )
-```
-
-# Coordinate Systems(投影)
-
-## Coordinate Systems
-
-
-
-= interpret the position aesthetics
-
-::: incremental
-- **linear coordinate systems:** preserve the geometrical shapes
- - `coord_cartesian()`
- - `coord_fixed()`
- - `coord_flip()`
-- **non-linear coordinate systems:** likely change the geometrical shapes
- - `coord_polar()`
- - `coord_map()` and `coord_sf()`
- - `coord_trans()`
-:::
-
-## Cartesian Coordinate System
-
-```{r}
-#| label: coord-cartesian
-#| output-location: column
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- coord_cartesian()
-```
-
-## Cartesian Coordinate System
-
-```{r}
-#| label: coord-cartesian-zoom
-#| output-location: column
-#| code-line-numbers: "6,7,8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- coord_cartesian(
- ylim = c(NA, 15000)
- )
-```
-
-## Changing Limits
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-cartesian-ylim
-#| fig-height: 3.5
-#| code-line-numbers: "6,7,8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- coord_cartesian(
- ylim = c(NA, 15000)
- )
-```
-
-```{r}
-#| label: scale-y-limits
-#| fig-height: 3.5
-#| code-line-numbers: "6,7,8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- scale_y_continuous(
- limits = c(NA, 15000)
- )
-```
-:::
-
-## Clipping
-
-```{r}
-#| label: coord-clip
-#| output-location: column
-#| code-line-numbers: "8"
-ggplot(
- bikes,
- aes(x = season, y = count)
- ) +
- geom_boxplot() +
- coord_cartesian(
- ylim = c(NA, 15000),
- clip = "off"
- )
-```
-
-## Clipping
-
-```{r}
-#| label: coord-clip-text
-#| output-location: column
-#| code-line-numbers: "2,3|6,7,8,9,10|12"
-ggplot(
- filter(bikes, is_holiday == TRUE),
- aes(x = temp_feel, y = count)
- ) +
- geom_point() +
- geom_text(
- aes(label = season),
- nudge_x = .3,
- hjust = 0
- ) +
- coord_cartesian(
- clip = "off"
- )
-```
-
-## ... or better use {ggrepel}
-
-```{r}
-#| label: coord-clip-text-repel
-#| output-location: column
-#| code-line-numbers: "6"
-ggplot(
- filter(bikes, is_holiday == TRUE),
- aes(x = temp_feel, y = count)
- ) +
- geom_point() +
- ggrepel::geom_text_repel(
- aes(label = season),
- nudge_x = .3,
- hjust = 0
- ) +
- coord_cartesian(
- clip = "off"
- )
-```
-
-## Remove All Padding
-
-```{r}
-#| label: coord-expand-off-clip
-#| output-location: column
-#| code-line-numbers: "7|8"
-ggplot(
- bikes,
- aes(x = temp_feel, y = count)
- ) +
- geom_point() +
- coord_cartesian(
- expand = FALSE,
- clip = "off"
- )
-```
-
-## Fixed Coordinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-fixed
-#| fig-height: 4.2
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp_feel, y = temp)
- ) +
- geom_point() +
- coord_fixed()
-```
-
-::: fragment
-```{r}
-#| label: coord-fixed-custom
-#| fig-height: 4.2
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp_feel, y = temp)
- ) +
- geom_point() +
- coord_fixed(ratio = 4)
-```
-:::
-:::
-
-## Flipped Coordinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-cartesian-comp-flip
-#| fig-height: 4.1
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = weather_type)
- ) +
- geom_bar() +
- coord_cartesian()
-```
-
-```{r}
-#| label: coord-flip
-#| fig-height: 4.1
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = weather_type)
- ) +
- geom_bar() +
- coord_flip()
-```
-:::
-
-## Flipped Coordinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-cartesian-switch-x-y
-#| fig-height: 4.1
-#| code-line-numbers: "3,6"
-ggplot(
- bikes,
- aes(y = weather_type)
- ) +
- geom_bar() +
- coord_cartesian()
-```
-
-```{r}
-#| label: coord-flip-again
-#| fig-height: 4.1
-#| code-line-numbers: "3,6"
-ggplot(
- bikes,
- aes(x = weather_type)
- ) +
- geom_bar() +
- coord_flip()
-```
-:::
-
-## Reminder: Sort Your Bars!
-
-```{r}
-#| label: forcats-sort-infreq
-#| output-location: column
-#| code-line-numbers: "3|2"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(y = fct_infreq(weather_type))
- ) +
- geom_bar()
-```
-
-## Reminder: Sort Your Bars!
-
-```{r}
-#| label: forcats-sort-infreq-rev
-#| output-location: column
-#| code-line-numbers: "3,4,5"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(y = fct_rev(
- fct_infreq(weather_type)
- ))
- ) +
- geom_bar()
-```
-
-## Circular Corrdinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-polar
-#| fig-height: 3.9
-#| code-line-numbers: "7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = weather_type,
- fill = weather_type)
- ) +
- geom_bar() +
- coord_polar()
-```
-
-::: fragment
-```{r}
-#| label: coord-cartesian-comp-polar
-#| fig-height: 3.9
-#| code-line-numbers: "7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = weather_type,
- fill = weather_type)
- ) +
- geom_bar() +
- coord_cartesian()
-```
-:::
-:::
-
-## Circular Cordinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-polar-coxcomb
-#| fig-height: 3.9
-#| code-line-numbers: "6,7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = fct_infreq(weather_type),
- fill = weather_type)
- ) +
- geom_bar(width = 1) +
- coord_polar()
-```
-
-```{r}
-#| label: coord-cartesian-comp-polar-no-padding
-#| fig-height: 3.9
-#| code-line-numbers: "6,7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = fct_infreq(weather_type),
- fill = weather_type)
- ) +
- geom_bar(width = 1) +
- coord_cartesian()
-```
-:::
-
-## Circular Corrdinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-polar-theta-x
-#| fig-height: 3.9
-#| code-line-numbers: "7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = fct_infreq(weather_type),
- fill = weather_type)
- ) +
- geom_bar() +
- coord_polar(theta = "x")
-```
-
-```{r}
-#| label: coord-polar-theta-y
-#| fig-height: 3.9
-#| code-line-numbers: "7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = fct_infreq(weather_type),
- fill = weather_type)
- ) +
- geom_bar() +
- coord_polar(theta = "y")
-```
-:::
-
-## Circular Corrdinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-polar-pie
-#| fig-height: 4.1
-#| code-line-numbers: "5"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = 1, fill = weather_type)
- ) +
- geom_bar(position = "stack") +
- coord_polar(theta = "y")
-```
-
-```{r}
-#| label: coord-cartesian-comp-polar-stacked
-#| fig-height: 4.1
-#| code-line-numbers: "5"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = 1, fill = weather_type)
- ) +
- geom_bar(position = "stack") +
- coord_cartesian()
-```
-:::
-
-## Circular Corrdinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: coord-polar-pie-sorted
-#| fig-height: 3.6
-#| code-line-numbers: "4,6,7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = 1,
- fill = fct_rev(fct_infreq(weather_type)))
- ) +
- geom_bar(position = "stack") +
- coord_polar(theta = "y") +
- scale_fill_discrete(name = NULL)
-```
-
-```{r}
-#| label: coord-cartesian-comp-polar-stacked-sorted
-#| fig-height: 3.6
-#| code-line-numbers: "4,6,7"
-ggplot(
- filter(bikes, !is.na(weather_type)),
- aes(x = 1,
- fill = fct_rev(fct_infreq(weather_type)))
- ) +
- geom_bar(position = "stack") +
- coord_cartesian() +
- scale_fill_discrete(name = NULL)
-```
-:::
-
-## Transform a Coordinate System
-
-```{r}
-#| label: coord-trans-log
-#| output-location: column
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp, y = count)
- ) +
- geom_point() +
- coord_trans(y = "log10")
-```
-
-## Transform a Coordinate System
-
-::: {layout-ncol="2"}
-```{r}
-#| label: trans-log-via-coord
-#| fig-height: 3.6
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp, y = count,
- group = day_night)
- ) +
- geom_point() +
- geom_smooth(method = "lm") +
- coord_trans(y = "log10")
-```
-
-::: fragment
-```{r}
-#| label: trans-log-via-scale
-#| fig-height: 3.6
-#| code-line-numbers: "6"
-ggplot(
- bikes,
- aes(x = temp, y = count,
- group = day_night)
- ) +
- geom_point() +
- geom_smooth(method = "lm") +
- scale_y_log10()
-```
-:::
-:::
-
-
-# 图形组合
-
-------------------------------------------------------------------------
-
-](../../img/layout/ah_patchwork.jpg){fig-align="center" fig-alt="Allison Horsts monster illustration of the patchwork extension package."}
-
-::: footer
-:::
-
-------------------------------------------------------------------------
-
-::: panel-tabset
-### Graphic
-
-```{r}
-#| label: patchwork-p1
-#| fig-width: 10
-#| fig-height: 5.8
-#| echo: false
-theme_std <- theme_set(theme_minimal(base_size = 18))
-theme_update(
- # text = element_text(family = "Pally"),
- panel.grid = element_blank(),
- axis.text = element_text(color = "grey50", size = 12),
- axis.title = element_text(color = "grey40", face = "bold"),
- axis.title.x = element_text(margin = margin(t = 12)),
- axis.title.y = element_text(margin = margin(r = 12)),
- axis.line = element_line(color = "grey80", size = .4),
- legend.text = element_text(color = "grey50", size = 12),
- plot.tag = element_text(size = 40, margin = margin(b = 15)),
- plot.background = element_rect(fill = "white", color = "white")
-)
-
-bikes_sorted <-
- bikes %>%
- filter(!is.na(weather_type)) %>%
- group_by(weather_type) %>%
- mutate(sum = sum(count)) %>%
- ungroup() %>%
- mutate(
- weather_type = forcats::fct_reorder(
- str_to_title(str_wrap(weather_type, 5)), sum
- )
- )
-
-p1 <- ggplot(
- bikes_sorted,
- aes(x = weather_type, y = count, color = weather_type)
- ) +
- geom_hline(yintercept = 0, color = "grey80", size = .4) +
- stat_summary(
- geom = "point", fun = "sum", size = 12
- ) +
- stat_summary(
- geom = "linerange", ymin = 0, fun.max = function(y) sum(y),
- size = 2, show.legend = FALSE
- ) +
- coord_flip(ylim = c(0, NA), clip = "off") +
- scale_y_continuous(
- expand = c(0, 0), limits = c(0, 8500000),
- labels = scales::comma_format(scale = .0001, suffix = "K")
- ) +
- scale_color_viridis_d(
- option = "magma", direction = -1, begin = .1, end = .9, name = NULL,
- guide = guide_legend(override.aes = list(size = 7))
- ) +
- labs(
- x = NULL, y = "Sum of reported bike shares", tag = "P1",
- ) +
- theme(
- axis.line.y = element_blank(),
- axis.text.y = element_text(family = "Pally", color = "grey50", face = "bold",
- margin = margin(r = 15), lineheight = .9)
- )
-
-p1
-```
-
-### Code
-
-```{r}
-#| label: patchwork-p1
-#| eval: false
-```
-:::
-
-------------------------------------------------------------------------
-
-::: panel-tabset
-### Graphic
-
-```{r}
-#| label: patchwork-p2
-#| fig-width: 10
-#| fig-height: 5.8
-#| echo: false
-p2 <- bikes_sorted %>%
- filter(season == "winter", is_weekend == TRUE, day_night == "night") %>%
- group_by(weather_type, .drop = FALSE) %>%
- mutate(id = row_number()) %>%
- ggplot(
- aes(x = weather_type, y = id, color = weather_type)
- ) +
- geom_point(size = 4.5) +
- scale_color_viridis_d(
- option = "magma", direction = -1, begin = .1, end = .9, name = NULL,
- guide = guide_legend(override.aes = list(size = 7))
- ) +
- labs(
- x = NULL, y = "Reported bike shares on\nweekend winter nights", tag = "P2",
- ) +
- coord_cartesian(ylim = c(.5, NA), clip = "off")
-
-p2
-```
-
-### Code
-
-```{r}
-#| label: patchwork-p2
-#| eval: false
-```
-:::
-
-------------------------------------------------------------------------
-
-::: panel-tabset
-### Graphic
-
-```{r}
-#| label: patchwork-p3
-#| fig-width: 10
-#| fig-height: 5.8
-#| echo: false
-my_colors <- c("#cc0000", "#000080")
-
-p3 <- bikes %>%
- group_by(week = lubridate::week(date), day_night, year) %>%
- summarize(count = sum(count)) %>%
- group_by(week, day_night) %>%
- mutate(avg = mean(count)) %>%
- ggplot(aes(x = week, y = count, group = interaction(day_night, year))) +
- geom_line(color = "grey65", size = 1) +
- geom_line(aes(y = avg, color = day_night), stat = "unique", size = 1.7) +
- annotate(
- geom = "text", label = c("Day", "Night"), color = my_colors,
- x = c(5, 18), y = c(125000, 29000), size = 8, fontface = "bold", family = "Pally"
- ) +
- scale_x_continuous(breaks = c(1, 1:10*5)) +
- scale_y_continuous(labels = scales::comma_format()) +
- scale_color_manual(values = my_colors, guide = "none") +
- labs(
- x = "Week of the Year", y = "Reported bike shares\n(cumulative # per week)", tag = "P3",
- )
-
-p3
-```
-
-### Code
-
-```{r}
-#| label: patchwork-p3
-#| eval: false
-```
-:::
-
-## {patchwork}
-
-```{r}
-#| label: patchwork-composition
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-#| code-line-numbers: "3|2,3"
-# install.packages("patchwork")
-require(patchwork)
-(p1 + p2) / p3
-```
-
-## "Collect Guides"
-
-```{r}
-#| label: patchwork-composition-guides
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-(p1 + p2) / p3 + plot_layout(guides = "collect")
-```
-
-## Apply Theming
-
-```{r}
-#| label: patchwork-composition-guides-just
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-((p1 + p2) / p3 & theme(legend.justification = "top")) +
-plot_layout(guides = "collect")
-```
-
-## Apply Theming
-
-```{r}
-#| label: patchwork-composition-legend-off
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-(p1 + p2) / p3 & theme(legend.position = "none",
- plot.background = element_rect(color = "black", size = 3))
-```
-
-## Adjust Widths and Heights
-
-```{r}
-#| label: patchwork-composition-heights-widths
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-#| code-line-numbers: "2"
-((p1 + p2) / p3 & theme(legend.position = "none")) +
- plot_layout(heights = c(.2, .1), widths = c(2, 1))
-```
-
-## Use A Custom Layout
-
-```{r}
-#| label: patchwork-composition-design
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-#| code-line-numbers: "1,2,3,4|5"
-picasso <- "
-AAAAAA#BBBB
-CCCCCCCCC##
-CCCCCCCCC##"
-(p1 + p2 + p3 & theme(legend.position = "none")) +
-plot_layout(design = picasso)
-```
-
-## Add Labels
-
-```{r}
-#| label: patchwork-composition-labs-prep
-pl1 <- p1 + labs(tag = NULL, title = "Plot One") + theme(legend.position = "none")
-pl2 <- p2 + labs(tag = NULL, title = "Plot Two") + theme(legend.position = "none")
-pl3 <- p3 + labs(tag = NULL, title = "Plot Three") + theme(legend.position = "none")
-```
-
-## Add Labels
-
-```{r}
-#| label: patchwork-composition-labs
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-#| code-line-numbers: "2"
-(pl1 + pl2) / pl3 +
- plot_annotation(tag_levels = "1",
- tag_prefix = "P",
- title = "An overarching title for all 3 plots, placed on the very top while all other titles are sitting below the tags.")
-```
-
-## Add Text
-
-::: panel-tabset
-### Graphic
-
-```{r}
-#| label: patchwork-composition-textbox-prep
-#| echo: false
-#| fig-width: 9
-#| fig-height: 4.5
-#| fig-align: "center"
-text <- tibble::tibble(
- x = 0, y = 0, label = "Lorem ipsum dolor sit amet, **consectetur adipiscing elit**, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
-)
-pt <- ggplot(text, aes(x = x, y = y)) +
- ggtext::geom_textbox(
- aes(label = label),
- box.color = NA, width = unit(23, "lines"),
- color = "grey40", size = 6.5, lineheight = 1.4
- ) +
- coord_cartesian(expand = FALSE, clip = "off") +
- theme_void()
-pt
-```
-
-### Code
-
-```{r}
-#| label: patchwork-composition-textbox-prep
-#| eval: false
-```
-:::
-
-## Add Text
-
-```{r}
-#| label: patchwork-composition-textbox
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-(p1 + pt) / p3
-```
-
-## Add Inset Plots
-
-```{r}
-#| label: patchwork-composition-inset-1
-#| fig-width: 12
-#| fig-height: 7
-#| fig-align: "center"
-pl1 + inset_element(pl2,
- l = .6, b = .1, r = 1, t = .6)
-```
-
-## Add Inset Plots
-
-```{r}
-#| label: patchwork-composition-inset-2
-#| fig-width: 12
-#| fig-height: 7
-#| fig-align: "center"
-pl1 + inset_element(pl2,
- l = .6, b = 0, r = 1, t = .5, align_to = 'full')
-```
-
-## Add Inset Plots
-
-```{r}
-#| label: patchwork-composition-inset-3
-#| fig-width: 15
-#| fig-height: 12
-#| fig-align: "center"
-(pl1 + inset_element(pl2,
- l = .6, b = .1, r = 1, t = .6) + pt) / pl3
-```
-
-
-## 练习
-
-```{r}
-library(palmerpenguins)
-library(ggthemes)
-penguins
-```
-
-## 效果
-
-```{r}
-#| echo: false
-#| warning: false
-#| fig-width: 8
-#| fig-height: 5
-penguins |>
- ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
-geom_point(aes(color = species, shape = species)) +
-geom_smooth(method = "lm") +
-labs(
- title = "Body mass and flipper length",
- subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
- x = "Flipper length (mm)",
- y = "Body mass (g)",
- color = "Species",
- shape = "Species"
-) +
-scale_color_colorblind()
-```
-
-
-```{r}
-#| include: false
-#| fig-alt: |
-#| A blank, gray plot area.
-
-ggplot(data = penguins)
-```
-
-
-```{r}
-#| include: false
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-)
-```
-
-
-```{r}
-#| include: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. The plot
-#| displays a positive, linear, and relatively strong relationship between
-#| these two variables.
-
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-) +
- geom_point()
-```
-
-
-```{r}
-#| include: false
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. The plot
-#| displays a positive, fairly linear, and relatively strong relationship
-#| between these two variables. Species (Adelie, Chinstrap, and Gentoo)
-#| are represented with different colors.
-
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
-) +
- geom_point()
-```
-
-
-```{r}
-#| include: false
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. Overlaid
-#| on the scatterplot are three smooth curves displaying the
-#| relationship between these variables for each species (Adelie,
-#| Chinstrap, and Gentoo). Different penguin species are plotted in
-#| different colors for the points and the smooth curves.
-
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
-) +
- geom_point() +
- geom_smooth(method = "lm")
-```
-
-
-```{r}
-#| include: false
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. Overlaid
-#| on the scatterplot is a single line of best fit displaying the
-#| relationship between these variables for each species (Adelie,
-#| Chinstrap, and Gentoo). Different penguin species are plotted in
-#| different colors for the points only.
-
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-) +
- geom_point(mapping = aes(color = species)) +
- geom_smooth(method = "lm")
-```
-
-
-```{r}
-#| include: false
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. Overlaid
-#| on the scatterplot is a single line of best fit displaying the
-#| relationship between these variables for each species (Adelie,
-#| Chinstrap, and Gentoo). Different penguin species are plotted in
-#| different colors and shapes for the points only.
-
-ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-) +
- geom_point(mapping = aes(color = species, shape = species)) +
- geom_smooth(method = "lm")
-```
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-) +
- geom_point(aes(color = species, shape = species)) +
- geom_smooth(method = "lm") +
- labs(
- title = "Body mass and flipper length",
- subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
- x = "Flipper length (mm)", y = "Body mass (g)",
- color = "Species", shape = "Species"
- ) +
- scale_color_colorblind()
-```
-
-
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins, colored
-#| by bill depth. A smooth curve of the relationship between body mass
-#| and flipper length is overlaid. The relationship is positive,
-#| fairly linear, and moderately strong.
-p <- ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)) +
- geom_point(aes(color = bill_depth_mm)) +
- geom_smooth()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
-) +
- geom_point() +
- geom_smooth(se = FALSE)
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分层展示
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(
- data = penguins,
- mapping = aes(x = flipper_length_mm, y = body_mass_g)
-) +
- geom_point() +
- geom_smooth()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 柱状图
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| fig-alt: |
-#| A bar chart of frequencies of species of penguins: Adelie
-#| (approximately 150), Chinstrap (approximately 90), Gentoo
-#| (approximately 125).
-
-p <- ggplot(penguins, aes(x = species)) +
- geom_bar()
-```
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 柱状图
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| fig-alt: |
-#| A bar chart of frequencies of species of penguins, where the bars are
-#| ordered in decreasing order of their heights (frequencies): Adelie
-#| (approximately 150), Gentoo (approximately 125), Chinstrap
-#| (approximately 90).
-
-p <- ggplot(penguins, aes(x = fct_infreq(species))) +
- geom_bar()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-
-## 直方图
-
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A histogram of body masses of penguins. The distribution is unimodal
-#| and right skewed, ranging between approximately 2500 to 6500 grams.
-
-p <- ggplot(penguins, aes(x = body_mass_g)) +
- geom_histogram(binwidth = 200)
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 直方图
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| layout-ncol: 2
-#| fig-width: 3
-#| fig-alt: |
-#| Two histograms of body masses of penguins, one with binwidth of 20
-#| (left) and one with binwidth of 2000 (right). The histogram with binwidth
-#| of 20 shows lots of ups and downs in the heights of the bins, creating a
-#| jagged outline. The histogram with binwidth of 2000 shows only three bins.
-
-p1 <- ggplot(penguins, aes(x = body_mass_g)) +
- geom_histogram(binwidth = 20)
-p2 <- ggplot(penguins, aes(x = body_mass_g)) +
- geom_histogram(binwidth = 2000)
-p <- p1 + p2
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 10
-#| fig-height: 5
-p
-```
-
-::::
-
-## 密度图
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| fig-alt: |
-#| A density plot of body masses of penguins. The distribution is unimodal
-#| and right skewed, ranging between approximately 2500 to 6500 grams.
-
-p <- ggplot(penguins, aes(x = body_mass_g)) +
- geom_density()
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-
-## 箱图
-
-
-:::: {.panel-tabset}
-
-### Code
-
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| Side-by-side box plots of distributions of body masses of Adelie,
-#| Chinstrap, and Gentoo penguins. The distribution of Adelie and
-#| Chinstrap penguins' body masses appear to be symmetric with
-#| medians around 3750 grams. The median body mass of Gentoo penguins
-#| is much higher, around 5000 grams, and the distribution of the
-#| body masses of these penguins appears to be somewhat right skewed.
-
-p <- ggplot(penguins,
- aes(x = species, y = body_mass_g)) +
- geom_boxplot()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A density plot of body masses of penguins by species of penguins. Each
-#| species (Adelie, Chinstrap, and Gentoo) is represented with different
-#| colored outlines for the density curves.
-
-p <- ggplot(penguins,
- aes(x = body_mass_g, color = species)) +
- geom_density(linewidth = 0.75)
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A density plot of body masses of penguins by species of penguins. Each
-#| species (Adelie, Chinstrap, and Gentoo) is represented in different
-#| colored outlines for the density curves. The density curves are also
-#| filled with the same colors, with some transparency added.
-
-p <- ggplot(penguins,
- aes(x = body_mass_g, color = species, fill = species)) +
- geom_density(alpha = 0.5)
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| fig-alt: |
-#| Bar plots of penguin species by island (Biscoe, Dream, and Torgersen)
-p <- ggplot(penguins,
- aes(x = island, fill = species)) +
- geom_bar()
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| fig-alt: |
-#| Bar plots of penguin species by island (Biscoe, Dream, and Torgersen)
-#| the bars are scaled to the same height, making it a relative frequencies
-#| plot
-
-p <- ggplot(penguins,
- aes(x = island, fill = species)) +
- geom_bar(position = "fill")
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. The plot
-#| displays a positive, linear, relatively strong relationship between
-#| these two variables.
-
-p <- ggplot(penguins,
- aes(x = flipper_length_mm, y = body_mass_g)) +
- geom_point()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分组
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. The plot
-#| displays a positive, linear, relatively strong relationship between
-#| these two variables. The points are colored based on the species of the
-#| penguins and the shapes of the points represent islands (round points are
-#| Biscoe island, triangles are Dream island, and squared are Torgersen
-#| island). The plot is very busy and it's difficult to distinguish the shapes
-#| of the points.
-
-p <- ggplot(penguins,
- aes(x = flipper_length_mm, y = body_mass_g)) +
- geom_point(aes(color = species, shape = island))
-```
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-
-## 分面
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-#| fig-width: 8
-#| fig-asp: 0.33
-#| fig-alt: |
-#| A scatterplot of body mass vs. flipper length of penguins. The shapes and
-#| colors of points represent species. Penguins from each island are on a
-#| separate facet. Within each facet, the relationship between body mass and
-#| flipper length is positive, linear, relatively strong.
-
-p <- ggplot(penguins,
- aes(x = flipper_length_mm, y = body_mass_g)) +
- geom_point(aes(color = species, shape = species)) +
- facet_wrap(~island)
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 分面
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| warning: false
-
-p <- ggplot(
- data = penguins,
- mapping = aes(
- x = bill_length_mm, y = bill_depth_mm,
- color = species, shape = species
- )
-) +
- geom_point() +
- labs(color = "Species")
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-#| layout-ncol: 2
-
-p1 <- ggplot(penguins, aes(x = island, fill = species)) +
- geom_bar(position = "fill")
-p2 <- ggplot(penguins, aes(x = species, fill = island)) +
- geom_bar(position = "fill")
-p <- p1 + p2
-```
-
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 10
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(penguins,
- aes(x = flipper_length_mm, y = body_mass_g)) +
- geom_point()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(mpg, aes(x = class)) +
-geom_bar()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-
-```{r}
-p <- ggplot(mpg, aes(x = cty, y = hwy)) +
-geom_point()
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 练习
-
-:::: {.panel-tabset}
-
-### Code
-
-```{r}
-p <- ggplot(data = mpg) +
-geom_point(mapping = aes(x = displ, y = hwy))
-```
-
-### Graphic
-
-```{r}
-#| echo: false
-#| fig-width: 8
-#| fig-height: 5
-p
-```
-
-::::
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
-
diff --git a/SD/4.1_datavisualize/mpg-plot.png b/SD/4.1_datavisualize/mpg-plot.png
deleted file mode 100644
index de3439b..0000000
Binary files a/SD/4.1_datavisualize/mpg-plot.png and /dev/null differ
diff --git a/SD/4.2_实践部分/_extensions b/SD/4.2_实践部分/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/4.2_实践部分/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/4.2_实践部分/index.qmd b/SD/4.2_实践部分/index.qmd
deleted file mode 100644
index f07e128..0000000
--- a/SD/4.2_实践部分/index.qmd
+++ /dev/null
@@ -1,103 +0,0 @@
----
-title: "数据前处理+ggplot2画图实践"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-library(tidyverse)
-```
-
-
-## 数据说明
-
-```{r}
-mpg
-```
-
-## 完成图1
-
-
-```{r}
-#| echo: false
-mpg |>
-ggplot(aes(x = displ, y = hwy, color = class)) +
- geom_point()
-
-```
-
-## 完成图2
-
-
-```{r}
-#| echo: false
-ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
- geom_point() +
- geom_smooth(aes(linetype = drv))
-```
-
-## 完成图3
-
-
-```{r}
-#| echo: false
-ggplot(mpg, aes(x = displ, y = hwy)) +
- geom_point(aes(color = class)) +
- geom_smooth()
-```
-
-## 完成图4
-
-
-```{r}
-#| echo: false
-ggplot(mpg, aes(x = displ, y = hwy)) +
- geom_point() +
- facet_wrap(~cyl)
-
-```
-
-## 完成图5
-
-
-```{r}
-#| echo: false
-ggplot(mpg, aes(x = drv, fill = class)) +
-geom_bar(position = "fill")
-```
-
-
-## 综合实践:三维荧光数据处理
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
diff --git a/SD/4.9_课后作业9/_extensions b/SD/4.9_课后作业9/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/4.9_课后作业9/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/4.9_课后作业9/index.qmd b/SD/4.9_课后作业9/index.qmd
deleted file mode 100644
index 410fa9f..0000000
--- a/SD/4.9_课后作业9/index.qmd
+++ /dev/null
@@ -1,48 +0,0 @@
----
-title: "课后作业9"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-```{r}
-#| include: false
-#| cache: false
-lang <- "cn"
-require(tidyverse)
-require(learnr)
-```
-
-## 第9次课后作业
-
-自选数据集,使用R语言开展不同因子(如年份、季节、处理方式等)间某指标的差异分析,采用图表方式形成简要报告。
-
-作业模板:[第9次课后作业_模板.qmd](https://git.drwater.net/course/RWEP/raw/branch/main/SD/20240402_9_课后作业/第9次课后作业_模板.qmd)
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
diff --git a/SD/4.9_课后作业9/第9次课后作业_模板.qmd b/SD/4.9_课后作业9/第9次课后作业_模板.qmd
deleted file mode 100644
index 90dde6c..0000000
--- a/SD/4.9_课后作业9/第9次课后作业_模板.qmd
+++ /dev/null
@@ -1,8 +0,0 @@
----
-title: 课后作业9
-author: 姓名
-format: html
----
-
-要求:自选数据集,使用R语言开展不同因子间(如年份、季节、处理方式等)某指标的差异分析,采用图表+文字说明等方式形成简要报告。
-
diff --git a/SD/5.1_model/_extensions b/SD/5.1_model/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/5.1_model/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/5.1_model/images/10-Fold-CV.svg b/SD/5.1_model/images/10-Fold-CV.svg
deleted file mode 100644
index 5be8f3f..0000000
--- a/SD/5.1_model/images/10-Fold-CV.svg
+++ /dev/null
@@ -1,158 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/Ac_2tads.jpg b/SD/5.1_model/images/Ac_2tads.jpg
deleted file mode 100644
index cacbfc5..0000000
Binary files a/SD/5.1_model/images/Ac_2tads.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/Hatching-process.jpg b/SD/5.1_model/images/Hatching-process.jpg
deleted file mode 100644
index 15a8821..0000000
Binary files a/SD/5.1_model/images/Hatching-process.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/bad_workflow.png b/SD/5.1_model/images/bad_workflow.png
deleted file mode 100644
index f4fff10..0000000
Binary files a/SD/5.1_model/images/bad_workflow.png and /dev/null differ
diff --git a/SD/5.1_model/images/cap.png b/SD/5.1_model/images/cap.png
deleted file mode 100644
index 658645b..0000000
Binary files a/SD/5.1_model/images/cap.png and /dev/null differ
diff --git a/SD/5.1_model/images/confusion-matrix-accuracy.png b/SD/5.1_model/images/confusion-matrix-accuracy.png
deleted file mode 100644
index 9cd48ae..0000000
Binary files a/SD/5.1_model/images/confusion-matrix-accuracy.png and /dev/null differ
diff --git a/SD/5.1_model/images/confusion-matrix-sensitivity.png b/SD/5.1_model/images/confusion-matrix-sensitivity.png
deleted file mode 100644
index 6bb0fd3..0000000
Binary files a/SD/5.1_model/images/confusion-matrix-sensitivity.png and /dev/null differ
diff --git a/SD/5.1_model/images/confusion-matrix-specificity.png b/SD/5.1_model/images/confusion-matrix-specificity.png
deleted file mode 100644
index bef51a2..0000000
Binary files a/SD/5.1_model/images/confusion-matrix-specificity.png and /dev/null differ
diff --git a/SD/5.1_model/images/confusion-matrix.png b/SD/5.1_model/images/confusion-matrix.png
deleted file mode 100644
index 1a2771c..0000000
Binary files a/SD/5.1_model/images/confusion-matrix.png and /dev/null differ
diff --git a/SD/5.1_model/images/fe_venn.svg b/SD/5.1_model/images/fe_venn.svg
deleted file mode 100644
index b32e41a..0000000
--- a/SD/5.1_model/images/fe_venn.svg
+++ /dev/null
@@ -1,69 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/fe_venn_info.svg b/SD/5.1_model/images/fe_venn_info.svg
deleted file mode 100644
index 2b73585..0000000
--- a/SD/5.1_model/images/fe_venn_info.svg
+++ /dev/null
@@ -1,74 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/good_workflow.png b/SD/5.1_model/images/good_workflow.png
deleted file mode 100644
index 56a39c1..0000000
Binary files a/SD/5.1_model/images/good_workflow.png and /dev/null differ
diff --git a/SD/5.1_model/images/grid_points.svg b/SD/5.1_model/images/grid_points.svg
deleted file mode 100644
index ea06c10..0000000
--- a/SD/5.1_model/images/grid_points.svg
+++ /dev/null
@@ -1,80 +0,0 @@
-
-
diff --git a/SD/5.1_model/images/initial-split.svg b/SD/5.1_model/images/initial-split.svg
deleted file mode 100644
index 8426ec9..0000000
--- a/SD/5.1_model/images/initial-split.svg
+++ /dev/null
@@ -1,54 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/ml_illustration.jpg b/SD/5.1_model/images/ml_illustration.jpg
deleted file mode 100644
index 490632c..0000000
Binary files a/SD/5.1_model/images/ml_illustration.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/model-optimization.svg b/SD/5.1_model/images/model-optimization.svg
deleted file mode 100644
index f690500..0000000
--- a/SD/5.1_model/images/model-optimization.svg
+++ /dev/null
@@ -1,111 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/parsnip-flagger.jpg b/SD/5.1_model/images/parsnip-flagger.jpg
deleted file mode 100644
index f010c87..0000000
Binary files a/SD/5.1_model/images/parsnip-flagger.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/pointing.svg b/SD/5.1_model/images/pointing.svg
deleted file mode 100644
index 9a3dc1d..0000000
--- a/SD/5.1_model/images/pointing.svg
+++ /dev/null
@@ -1,27 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/rolling.svg b/SD/5.1_model/images/rolling.svg
deleted file mode 100644
index 4ae1759..0000000
--- a/SD/5.1_model/images/rolling.svg
+++ /dev/null
@@ -1,579 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/small_init.svg b/SD/5.1_model/images/small_init.svg
deleted file mode 100644
index 74e82be..0000000
--- a/SD/5.1_model/images/small_init.svg
+++ /dev/null
@@ -1,75 +0,0 @@
-
-
diff --git a/SD/5.1_model/images/snake.png b/SD/5.1_model/images/snake.png
deleted file mode 100644
index 6dc4b4c..0000000
Binary files a/SD/5.1_model/images/snake.png and /dev/null differ
diff --git a/SD/5.1_model/images/stack_01.png b/SD/5.1_model/images/stack_01.png
deleted file mode 100644
index 54bbdbc..0000000
Binary files a/SD/5.1_model/images/stack_01.png and /dev/null differ
diff --git a/SD/5.1_model/images/stack_02.png b/SD/5.1_model/images/stack_02.png
deleted file mode 100644
index 7544610..0000000
Binary files a/SD/5.1_model/images/stack_02.png and /dev/null differ
diff --git a/SD/5.1_model/images/stack_03.png b/SD/5.1_model/images/stack_03.png
deleted file mode 100644
index 13f7117..0000000
Binary files a/SD/5.1_model/images/stack_03.png and /dev/null differ
diff --git a/SD/5.1_model/images/stack_04.png b/SD/5.1_model/images/stack_04.png
deleted file mode 100644
index 8c02230..0000000
Binary files a/SD/5.1_model/images/stack_04.png and /dev/null differ
diff --git a/SD/5.1_model/images/stack_05.png b/SD/5.1_model/images/stack_05.png
deleted file mode 100644
index afb47db..0000000
Binary files a/SD/5.1_model/images/stack_05.png and /dev/null differ
diff --git a/SD/5.1_model/images/steve.gif b/SD/5.1_model/images/steve.gif
deleted file mode 100644
index b4b7f77..0000000
Binary files a/SD/5.1_model/images/steve.gif and /dev/null differ
diff --git a/SD/5.1_model/images/taxi.png b/SD/5.1_model/images/taxi.png
deleted file mode 100644
index e92c00d..0000000
Binary files a/SD/5.1_model/images/taxi.png and /dev/null differ
diff --git a/SD/5.1_model/images/taxi_spinning.svg b/SD/5.1_model/images/taxi_spinning.svg
deleted file mode 100644
index f178312..0000000
--- a/SD/5.1_model/images/taxi_spinning.svg
+++ /dev/null
@@ -1,34 +0,0 @@
-
diff --git a/SD/5.1_model/images/tm-org.png b/SD/5.1_model/images/tm-org.png
deleted file mode 100644
index 68ac928..0000000
Binary files a/SD/5.1_model/images/tm-org.png and /dev/null differ
diff --git a/SD/5.1_model/images/tuning-overfitting-test-1.svg b/SD/5.1_model/images/tuning-overfitting-test-1.svg
deleted file mode 100644
index 007d97d..0000000
--- a/SD/5.1_model/images/tuning-overfitting-test-1.svg
+++ /dev/null
@@ -1,1417 +0,0 @@
-
-
diff --git a/SD/5.1_model/images/tuning-overfitting-train-1.svg b/SD/5.1_model/images/tuning-overfitting-train-1.svg
deleted file mode 100644
index 11b29b7..0000000
--- a/SD/5.1_model/images/tuning-overfitting-train-1.svg
+++ /dev/null
@@ -1,1427 +0,0 @@
-
-
diff --git a/SD/5.1_model/images/what_is_ml.jpg b/SD/5.1_model/images/what_is_ml.jpg
deleted file mode 100644
index 9eb554a..0000000
Binary files a/SD/5.1_model/images/what_is_ml.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-boost.svg b/SD/5.1_model/images/whole-game-boost.svg
deleted file mode 100644
index 20bf116..0000000
--- a/SD/5.1_model/images/whole-game-boost.svg
+++ /dev/null
@@ -1,96 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-final-fit.jpg b/SD/5.1_model/images/whole-game-final-fit.jpg
deleted file mode 100644
index 14b42ae..0000000
Binary files a/SD/5.1_model/images/whole-game-final-fit.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-final-performance.jpg b/SD/5.1_model/images/whole-game-final-performance.jpg
deleted file mode 100644
index 6b4305e..0000000
Binary files a/SD/5.1_model/images/whole-game-final-performance.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-final-resamples.svg b/SD/5.1_model/images/whole-game-final-resamples.svg
deleted file mode 100644
index 36e4777..0000000
--- a/SD/5.1_model/images/whole-game-final-resamples.svg
+++ /dev/null
@@ -1,143 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-final.svg b/SD/5.1_model/images/whole-game-final.svg
deleted file mode 100644
index 5613cb8..0000000
--- a/SD/5.1_model/images/whole-game-final.svg
+++ /dev/null
@@ -1,141 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-logistic.svg b/SD/5.1_model/images/whole-game-logistic.svg
deleted file mode 100644
index 6e0b82d..0000000
--- a/SD/5.1_model/images/whole-game-logistic.svg
+++ /dev/null
@@ -1,86 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-model-1.jpg b/SD/5.1_model/images/whole-game-model-1.jpg
deleted file mode 100644
index a43441c..0000000
Binary files a/SD/5.1_model/images/whole-game-model-1.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-model-n.jpg b/SD/5.1_model/images/whole-game-model-n.jpg
deleted file mode 100644
index 53fccc2..0000000
Binary files a/SD/5.1_model/images/whole-game-model-n.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-resamples.jpg b/SD/5.1_model/images/whole-game-resamples.jpg
deleted file mode 100644
index bd0ac85..0000000
Binary files a/SD/5.1_model/images/whole-game-resamples.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-select.jpg b/SD/5.1_model/images/whole-game-select.jpg
deleted file mode 100644
index 15080da..0000000
Binary files a/SD/5.1_model/images/whole-game-select.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-select.svg b/SD/5.1_model/images/whole-game-select.svg
deleted file mode 100644
index 1c99074..0000000
--- a/SD/5.1_model/images/whole-game-select.svg
+++ /dev/null
@@ -1,114 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-split-short.svg b/SD/5.1_model/images/whole-game-split-short.svg
deleted file mode 100644
index cb09c11..0000000
--- a/SD/5.1_model/images/whole-game-split-short.svg
+++ /dev/null
@@ -1,76 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-split.jpg b/SD/5.1_model/images/whole-game-split.jpg
deleted file mode 100644
index 1206adf..0000000
Binary files a/SD/5.1_model/images/whole-game-split.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-split.svg b/SD/5.1_model/images/whole-game-split.svg
deleted file mode 100644
index 159763c..0000000
--- a/SD/5.1_model/images/whole-game-split.svg
+++ /dev/null
@@ -1,81 +0,0 @@
-
-
-
diff --git a/SD/5.1_model/images/whole-game-transparent-model-1.jpg b/SD/5.1_model/images/whole-game-transparent-model-1.jpg
deleted file mode 100644
index 767c3d9..0000000
Binary files a/SD/5.1_model/images/whole-game-transparent-model-1.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-transparent-resamples.jpg b/SD/5.1_model/images/whole-game-transparent-resamples.jpg
deleted file mode 100644
index 83009b0..0000000
Binary files a/SD/5.1_model/images/whole-game-transparent-resamples.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-transparent-select.jpg b/SD/5.1_model/images/whole-game-transparent-select.jpg
deleted file mode 100644
index 9376279..0000000
Binary files a/SD/5.1_model/images/whole-game-transparent-select.jpg and /dev/null differ
diff --git a/SD/5.1_model/images/whole-game-transparent-split.jpg b/SD/5.1_model/images/whole-game-transparent-split.jpg
deleted file mode 100644
index be2baf3..0000000
Binary files a/SD/5.1_model/images/whole-game-transparent-split.jpg and /dev/null differ
diff --git a/SD/5.1_model/index.qmd b/SD/5.1_model/index.qmd
deleted file mode 100644
index 7f33daa..0000000
--- a/SD/5.1_model/index.qmd
+++ /dev/null
@@ -1,1427 +0,0 @@
----
-title: "模型构建"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
-knitr:
- opts_chunk:
- dev: "svg"
- retina: 3
-execute:
- freeze: auto
- cache: true
- echo: true
- fig-width: 5
- fig-height: 6
----
-
-# tidymodels主要步骤
-
-```{r}
-#| echo: false
-hexes <- function(..., size = 64) {
- x <- c(...)
- x <- sort(unique(x), decreasing = TRUE)
- right <- (seq_along(x) - 1) * size
- res <- glue::glue(
- '{.absolute top=-20 right= width="" height=""}',
- .open = "<", .close = ">"
- )
- paste0(res, collapse = " ")
-}
-
-knitr::opts_chunk$set(
- digits = 3,
- comment = "#>",
- dev = 'svglite'
-)
-
-# devtools::install_github("gadenbuie/countdown")
-# library(countdown)
-library(ggplot2)
-theme_set(theme_bw())
-options(cli.width = 70, ggplot2.discrete.fill = c("#7e96d5", "#de6c4e"))
-
-train_color <- "#1a162d"
-test_color <- "#cd4173"
-data_color <- "#767381"
-assess_color <- "#84cae1"
-splits_pal <- c(data_color, train_color, test_color)
-
-```
-
-
-
-
-
-## 何为tidymodels? {background-image="images/tm-org.png" background-size="80%"}
-
-```{r load-tm}
-#| message: true
-#| echo: true
-#| warning: true
-library(tidymodels)
-```
-
-## 整体思路
-
-```{r diagram-split, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-split.jpg")
-```
-
-## 整体思路
-
-```{r diagram-model-1, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-model-1.jpg")
-```
-
-:::notes
-Stress that we are **not** fitting a model on the entire training set other than for illustrative purposes in deck 2.
-:::
-
-## 整体思路
-
-```{r diagram-model-n, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-model-n.jpg")
-```
-
-## 整体思路
-
-```{r, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-resamples.jpg")
-```
-
-## 整体思路
-
-```{r, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-select.jpg")
-```
-
-## 整体思路
-
-```{r diagram-final-fit, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-final-fit.jpg")
-```
-
-## 整体思路
-
-```{r diagram-final-performance, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-final-performance.jpg")
-```
-
-## 相关包的安装
-
-```{r load-pkgs}
-#| eval: false
-
-# Install the packages for the workshop
-pkgs <-
- c("bonsai", "doParallel", "embed", "finetune", "lightgbm", "lme4",
- "plumber", "probably", "ranger", "rpart", "rpart.plot", "rules",
- "splines2", "stacks", "text2vec", "textrecipes", "tidymodels",
- "vetiver", "remotes")
-
-install.packages(pkgs)
-```
-
-. . .
-
-
-
-
-
-
-## Data on Chicago taxi trips
-
-```{r taxi-print}
-library(tidymodels)
-taxi
-```
-
-## 数据分割与使用
-
-对于机器学习,我们通常将数据分成训练集和测试集:
-
-. . .
-
-- 训练集用于估计模型参数。
-- 测试集用于独立评估模型性能。
-
-. . .
-
-在训练过程中不要使用测试集。
-
-
-. . .
-
-```{r test-train-split}
-#| echo: false
-#| fig.width: 12
-#| fig.height: 3
-#|
-set.seed(123)
-library(forcats)
-require(tidymodels)
-require(tidyverse)
-one_split <- taxi |>
- dplyr::slice(1:30) |>
- rsample::initial_split() |>
- generics::tidy() |>
- tibble::add_row(Row = 1:30, Data = "Original") |>
- dplyr::mutate(Data = case_when(
- Data == "Analysis" ~ "Training",
- Data == "Assessment" ~ "Testing",
- TRUE ~ Data
- )) |>
- dplyr::mutate(Data = factor(Data, levels = c("Original", "Training", "Testing")))
-all_split <-
- ggplot(one_split, aes(x = Row, y = fct_rev(Data), fill = Data)) +
- geom_tile(color = "white",
- linewidth = 1) +
- scale_fill_manual(values = splits_pal, guide = "none") +
- theme_minimal() +
- theme(axis.text.y = element_text(size = rel(2)),
- axis.text.x = element_blank(),
- legend.position = "top",
- panel.grid = element_blank()) +
- coord_equal(ratio = 1) +
- labs(x = NULL, y = NULL)
-all_split
-```
-
-## The initial split
-
-```{r taxi-split}
-set.seed(123)
-taxi_split <- initial_split(taxi)
-taxi_split
-```
-
-## Accessing the data
-
-```{r taxi-train-test}
-taxi_train <- training(taxi_split)
-taxi_test <- testing(taxi_split)
-```
-
-## The training set
-
-```{r taxi-train}
-taxi_train
-```
-
-## 练习
-
-```{r taxi-split-prop}
-set.seed(123)
-taxi_split <- initial_split(taxi, prop = 0.8)
-taxi_train <- training(taxi_split)
-taxi_test <- testing(taxi_split)
-
-nrow(taxi_train)
-nrow(taxi_test)
-```
-
-## Stratification
-
-Use `strata = tip`
-
-```{r taxi-split-prop-strata}
-set.seed(123)
-taxi_split <- initial_split(taxi, prop = 0.8, strata = tip)
-taxi_split
-```
-
-## Stratification
-
-Stratification often helps, with very little downside
-
-```{r taxi-tip-pct-by-split, echo = FALSE}
-bind_rows(
- taxi_train %>% mutate(split = "train"),
- taxi_test %>% mutate(split = "test")
-) %>%
- ggplot(aes(x = split, fill = tip)) +
- geom_bar(position = "fill")
-```
-
-## 模型类型
-
-
-模型多种多样
-
-- `lm` for linear model
-
-- `glm` for generalized linear model (e.g. logistic regression)
-
-- `glmnet` for regularized regression
-
-- `keras` for regression using TensorFlow
-
-- `stan` for Bayesian regression
-
-- `spark` for large data sets
-
-
-## 指定模型
-
-```{r}
-#| echo: false
-library(tidymodels)
-
-set.seed(123)
-
-taxi_split <- initial_split(taxi, prop = 0.8, strata = tip)
-taxi_train <- training(taxi_split)
-taxi_test <- testing(taxi_split)
-```
-
-```{r logistic-reg}
-logistic_reg()
-```
-
-
-:::notes
-Models have default engines
-:::
-
-
-
-## To specify a model
-
-```{r logistic-reg-glmnet}
-logistic_reg() %>%
- set_engine("glmnet")
-```
-
-. . .
-
-
-```{r logistic-reg-stan}
-logistic_reg() %>%
- set_engine("stan")
-```
-
-
-::: columns
-::: {.column width="40%"}
-- Choose a model
-- Specify an engine
-- Set the [mode]{.underline}
-:::
-
-::: {.column width="60%"}
-
-:::
-:::
-
-
-## To specify a model
-
-```{r decision-tree}
-decision_tree()
-```
-
-:::notes
-Some models have a default mode
-:::
-
-## To specify a model
-
-```{r decision-tree-classification}
-decision_tree() %>%
- set_mode("classification")
-```
-
-. . .
-
-
-
-::: r-fit-text
-All available models are listed at
-:::
-
-
-## Workflows
-
-```{r good-workflow}
-#| echo: false
-#| out-width: '70%'
-#| fig-align: 'center'
-knitr::include_graphics("images/good_workflow.png")
-```
-
-
-## 为什么要使用 `workflow()`?
-
-
-- 与基本的 R 工具相比,工作流能更好地处理新的因子水平
-
-. . .
-
-- 除了公式之外,还可以使用其他的预处理器(更多关于高级 tidymodels 中的特征工程!)
-
-. . .
-
-- 在使用多个模型时,它们可以帮助组织工作
-
-. . .
-
-- [最重要的是]{.underline},工作流涵盖了整个建模过程:`fit()` 和 `predict()` 不仅适用于实际的模型拟合,还适用于预处理步骤
-
-::: notes
-工作流比基本的 R 处理水平更好的两种方式:
-
-- 强制要求在预测时不允许出现新的水平(这是一个可选的检查,可以关闭)
-
-- 恢复在拟合时存在但在预测时缺失的水平(例如,“新”数据中没有该水平的实例)
-:::
-
-
-## A model workflow
-
-```{r tree-spec}
-tree_spec <-
- decision_tree(cost_complexity = 0.002) %>%
- set_mode("classification")
-
-tree_spec %>%
- fit(tip ~ ., data = taxi_train)
-```
-
-## A model workflow
-
-```{r tree-wflow}
-tree_spec <-
- decision_tree(cost_complexity = 0.002) %>%
- set_mode("classification")
-
-workflow() %>%
- add_formula(tip ~ .) %>%
- add_model(tree_spec) %>%
- fit(data = taxi_train)
-```
-
-## A model workflow
-
-```{r tree-wflow-fit}
-tree_spec <-
- decision_tree(cost_complexity = 0.002) %>%
- set_mode("classification")
-
-workflow(tip ~ ., tree_spec) %>%
- fit(data = taxi_train)
-```
-
-## 预测
-
-How do you use your new `tree_fit` model?
-
-```{r tree-wflow-fit-2}
-tree_spec <-
- decision_tree(cost_complexity = 0.002) %>%
- set_mode("classification")
-
-tree_fit <-
- workflow(tip ~ ., tree_spec) %>%
- fit(data = taxi_train)
-```
-
-## 练习
-
-*Run:*
-
-`predict(tree_fit, new_data = taxi_test)`
-
-
-
-. . .
-
-
-*Run:*
-
-`augment(tree_fit, new_data = taxi_test)`
-
-*What do you get?*
-
-
-## tidymodels 的预测
-
-- 预测结果始终在一个 **tibble** 内
-- 列名和类型可读性强
-- `new_data` 中的行数和输出中的行数**相同**
-
-## 理解模型
-
-如何 **理解**`tree_fit` 模型?
-
-```{r plot-tree-fit-4}
-#| echo: false
-#| fig-align: center
-#| fig-width: 8
-#| fig-height: 5
-#| out-width: 100%
-library(rpart.plot)
-tree_fit %>%
- extract_fit_engine() %>%
- rpart.plot(roundint = FALSE)
-```
-
-## Evaluating models: 预测值
-
-```{r}
-#| echo: false
-library(tidymodels)
-
-set.seed(123)
-taxi_split <- initial_split(taxi, prop = 0.8, strata = tip)
-taxi_train <- training(taxi_split)
-taxi_test <- testing(taxi_split)
-
-tree_spec <- decision_tree(cost_complexity = 0.0001, mode = "classification")
-taxi_wflow <- workflow(tip ~ ., tree_spec)
-taxi_fit <- fit(taxi_wflow, taxi_train)
-```
-
-```{r taxi-fit-augment}
-augment(taxi_fit, new_data = taxi_train) %>%
- relocate(tip, .pred_class, .pred_yes, .pred_no)
-```
-
-## Confusion matrix
-
-
-
-## Confusion matrix
-
-```{r conf-mat}
-augment(taxi_fit, new_data = taxi_train) %>%
- conf_mat(truth = tip, estimate = .pred_class)
-```
-
-## Confusion matrix
-
-```{r conf-mat-plot}
-augment(taxi_fit, new_data = taxi_train) %>%
- conf_mat(truth = tip, estimate = .pred_class) %>%
- autoplot(type = "heatmap")
-```
-
-## Metrics for model performance
-
-::: columns
-::: {.column width="60%"}
-```{r acc}
-augment(taxi_fit, new_data = taxi_train) %>%
- accuracy(truth = tip, estimate = .pred_class)
-```
-:::
-
-::: {.column width="40%"}
-
-:::
-:::
-
-## 二分类模型评估
-
-模型的敏感性(Sensitivity)和特异性(Specificity)是评估二分类模型性能的重要指标:
-
-- **敏感性**(Sensitivity),也称为真阳性率,衡量了模型正确识别正类别样本的能力。公式为真阳性数除以真阳性数加上假阴性数:
-
-$$
-\text{Sensitivity} = \frac{\text{True Positives}}{\text{True Positives} + \text{False Negatives}}
-$$
-
-
-- **特异性**(Specificity),也称为真阴性率,衡量了模型正确识别负类别样本的能力。公式为真阴性数除以真阴性数加上假阳性数:
-
-$$
-\text{Specificity} = \frac{\text{True Negatives}}{\text{True Negatives} + \text{False Positives}}
-$$
-
-在评估模型时,我们希望敏感性和特异性都很高。高敏感性表示模型能够捕获真正的正类别样本,高特异性表示模型能够准确排除负类别样本。
-
-
-## Metrics for model performance
-
-::: columns
-::: {.column width="60%"}
-```{r sens}
-augment(taxi_fit, new_data = taxi_train) %>%
- sensitivity(truth = tip, estimate = .pred_class)
-```
-:::
-
-::: {.column width="40%"}
-
-:::
-:::
-
-
-## Metrics for model performance
-
-::: columns
-::: {.column width="60%"}
-```{r sens-2}
-#| code-line-numbers: "3-6"
-augment(taxi_fit, new_data = taxi_train) %>%
- sensitivity(truth = tip, estimate = .pred_class)
-```
-
-
-
-```{r spec}
-augment(taxi_fit, new_data = taxi_train) %>%
- specificity(truth = tip, estimate = .pred_class)
-```
-:::
-
-::: {.column width="40%"}
-
-:::
-:::
-
-## Metrics for model performance
-
-We can use `metric_set()` to combine multiple calculations into one
-
-```{r taxi-metrics}
-taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
-
-augment(taxi_fit, new_data = taxi_train) %>%
- taxi_metrics(truth = tip, estimate = .pred_class)
-```
-
-## Metrics for model performance
-
-```{r taxi-metrics-grouped}
-taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
-
-augment(taxi_fit, new_data = taxi_train) %>%
- group_by(local) %>%
- taxi_metrics(truth = tip, estimate = .pred_class)
-```
-
-
-## Varying the threshold
-
-```{r}
-#| label: thresholds
-#| echo: false
-
-augment(taxi_fit, new_data = taxi_train) %>%
- roc_curve(truth = tip, .pred_yes) %>%
- filter(is.finite(.threshold)) %>%
- pivot_longer(c(specificity, sensitivity), names_to = "statistic", values_to = "value") %>%
- rename(`event threshold` = .threshold) %>%
- ggplot(aes(x = `event threshold`, y = value, col = statistic, group = statistic)) +
- geom_line() +
- scale_color_brewer(palette = "Dark2") +
- labs(y = NULL) +
- coord_equal() +
- theme(legend.position = "top")
-```
-
-## ROC 曲线
-
-- ROC(Receiver Operating Characteristic)曲线用于评估二分类模型的性能,特别是在不同的阈值下比较模型的敏感性和特异性。
-- ROC曲线的横轴是假阳性率(False Positive Rate,FPR),纵轴是真阳性率(True Positive Rate,TPR)。在ROC曲线上,每个点对应于一个特定的阈值。通过改变阈值,我们可以观察到模型在不同条件下的表现。
-- ROC曲线越接近左上角(0,1)点,说明模型的性能越好,因为这表示在较低的假阳性率下,模型能够获得较高的真阳性率。ROC曲线下面积(Area Under the ROC Curve,AUC)也是评估模型性能的一种指标,AUC值越大表示模型性能越好。
-
-
-
-## ROC curve plot
-
-```{r roc-curve}
-#| fig-width: 6
-#| fig-height: 6
-#| output-location: "column"
-
-augment(taxi_fit, new_data = taxi_train) %>%
- roc_curve(truth = tip, .pred_yes) %>%
- autoplot()
-```
-
-
-## 过度拟合
-
-
-
-## 过度拟合
-
-
-
-
-## Cross-validation {background-color="white" background-image="https://www.tmwr.org/premade/resampling.svg" background-size="80%"}
-
-## Cross-validation
-
-
-
-## Cross-validation
-
-
-
-## Cross-validation
-
-```{r vfold-cv}
-vfold_cv(taxi_train) # v = 10 is default
-```
-
-## Cross-validation
-
-What is in this?
-
-```{r taxi-splits}
-taxi_folds <- vfold_cv(taxi_train)
-taxi_folds$splits[1:3]
-```
-
-::: notes
-Talk about a list column, storing non-atomic types in dataframe
-:::
-
-## Cross-validation
-
-```{r vfold-cv-v}
-vfold_cv(taxi_train, v = 5)
-```
-
-## Cross-validation
-
-```{r vfold-cv-strata}
-vfold_cv(taxi_train, strata = tip)
-```
-
-. . .
-
-Stratification often helps, with very little downside
-
-## Cross-validation
-
-We'll use this setup:
-
-```{r taxi-folds}
-set.seed(123)
-taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
-taxi_folds
-```
-
-. . .
-
-Set the seed when creating resamples
-
-
-## Fit our model to the resamples
-
-```{r fit-resamples}
-taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
-taxi_res
-```
-
-## Evaluating model performance
-
-```{r collect-metrics}
-taxi_res %>%
- collect_metrics()
-```
-
-::: notes
-collect_metrics() 是一套 collect_*() 函数之一,可用于处理调参结果的列。调参结果中以 . 为前缀的大多数列都有对应的 collect_*() 函数,可以进行常见摘要选项的汇总。
-:::
-
-. . .
-
-We can reliably measure performance using only the **training** data 🎉
-
-## Comparing metrics
-
-How do the metrics from resampling compare to the metrics from training and testing?
-
-```{r calc-roc-auc}
-#| echo: false
-taxi_training_roc_auc <-
- taxi_fit %>%
- augment(taxi_train) %>%
- roc_auc(tip, .pred_yes) %>%
- pull(.estimate) %>%
- round(digits = 2)
-
-taxi_testing_roc_auc <-
- taxi_fit %>%
- augment(taxi_test) %>%
- roc_auc(tip, .pred_yes) %>%
- pull(.estimate) %>%
- round(digits = 2)
-```
-
-::: columns
-::: {.column width="50%"}
-```{r collect-metrics-2}
-taxi_res %>%
- collect_metrics() %>%
- select(.metric, mean, n)
-```
-:::
-
-::: {.column width="50%"}
-The ROC AUC previously was
-
-- `r taxi_training_roc_auc` for the training set
-- `r taxi_testing_roc_auc` for test set
-:::
-:::
-
-. . .
-
-Remember that:
-
-⚠️ the training set gives you overly optimistic metrics
-
-⚠️ the test set is precious
-
-## Evaluating model performance
-
-```{r save-predictions}
-# Save the assessment set results
-ctrl_taxi <- control_resamples(save_pred = TRUE)
-taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
-
-taxi_res
-```
-
-## Evaluating model performance
-
-```{r collect-predictions}
-# Save the assessment set results
-taxi_preds <- collect_predictions(taxi_res)
-taxi_preds
-```
-
-## Evaluating model performance
-
-```{r taxi-metrics-by-id}
-taxi_preds %>%
- group_by(id) %>%
- taxi_metrics(truth = tip, estimate = .pred_class)
-```
-
-## Where are the fitted models?
-
-```{r taxi-res}
-taxi_res
-```
-
-
-## Bootstrapping
-
-
-
-## Bootstrapping
-
-```{r bootstraps}
-set.seed(3214)
-bootstraps(taxi_train)
-```
-
-
-## Monte Carlo Cross-Validation
-
-```{r mc-cv}
-set.seed(322)
-mc_cv(taxi_train, times = 10)
-```
-
-## Validation set
-
-```{r validation-split}
-set.seed(853)
-taxi_val_split <- initial_validation_split(taxi, strata = tip)
-validation_set(taxi_val_split)
-```
-
-
-## Create a random forest model
-
-```{r rf-spec}
-rf_spec <- rand_forest(trees = 1000, mode = "classification")
-rf_spec
-```
-
-## Create a random forest model
-
-```{r rf-wflow}
-rf_wflow <- workflow(tip ~ ., rf_spec)
-rf_wflow
-```
-
-## Evaluating model performance
-
-```{r collect-metrics-rf}
-ctrl_taxi <- control_resamples(save_pred = TRUE)
-
-# Random forest uses random numbers so set the seed first
-
-set.seed(2)
-rf_res <- fit_resamples(rf_wflow, taxi_folds, control = ctrl_taxi)
-collect_metrics(rf_res)
-```
-
-## The whole game - status update
-
-```{r diagram-select, echo = FALSE}
-#| fig-align: "center"
-
-knitr::include_graphics("images/whole-game-transparent-select.jpg")
-```
-
-## The final fit
-
-```{r final-fit}
-# taxi_split has train + test info
-final_fit <- last_fit(rf_wflow, taxi_split)
-
-final_fit
-```
-
-## 何为`final_fit`?
-
-```{r collect-metrics-final-fit}
-collect_metrics(final_fit)
-```
-
-. . .
-
-These are metrics computed with the **test** set
-
-## 何为`final_fit`?
-
-```{r collect-predictions-final-fit}
-collect_predictions(final_fit)
-```
-
-## 何为`final_fit`?
-
-```{r extract-workflow}
-extract_workflow(final_fit)
-```
-
-. . .
-
-Use this for **prediction** on new data, like for deploying
-
-
-
-
-## Tuning models - Specifying tuning parameters
-
-
-```{r}
-#| label: tag-for-tuning
-#| code-line-numbers: "1|"
-
-rf_spec <- rand_forest(min_n = tune()) %>%
- set_mode("classification")
-
-rf_wflow <- workflow(tip ~ ., rf_spec)
-rf_wflow
-```
-
-## Try out multiple values
-
-`tune_grid()` works similar to `fit_resamples()` but covers multiple parameter values:
-
-```{r}
-#| label: rf-tune_grid
-#| code-line-numbers: "2|3-4|5|"
-
-set.seed(22)
-rf_res <- tune_grid(
- rf_wflow,
- taxi_folds,
- grid = 5
-)
-```
-
-## Compare results
-
-Inspecting results and selecting the best-performing hyperparameter(s):
-
-```{r}
-#| label: rf-results
-
-show_best(rf_res)
-
-best_parameter <- select_best(rf_res)
-best_parameter
-```
-
-`collect_metrics()` and `autoplot()` are also available.
-
-## The final fit
-
-```{r}
-#| label: rf-finalize
-
-rf_wflow <- finalize_workflow(rf_wflow, best_parameter)
-
-final_fit <- last_fit(rf_wflow, taxi_split)
-
-collect_metrics(final_fit)
-```
-
-# 实践部分
-
-
-## 数据
-
-```{r}
-require(tidyverse)
-sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |>
- select(SITE_ID,
- lon = LON_DD,
- lat = LAT_DD,
- name = LAKENAME,
- area = LAKEAREA,
- zmax = DEPTHMAX
- ) |>
- group_by(SITE_ID) |>
- summarize(lon = mean(lon, na.rm = TRUE),
- lat = mean(lat, na.rm = TRUE),
- name = unique(name),
- area = mean(area, na.rm = TRUE),
- zmax = mean(zmax, na.rm = TRUE))
-
-
-visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
- select(SITE_ID,
- date = DATE_PROFILE,
- year = YEAR,
- visit = VISIT_NO
- ) |>
- distinct()
-
-
-
-waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
- select(SITE_ID,
- date = DATE_PROFILE,
- depth = DEPTH,
- temp = TEMP_FIELD,
- do = DO_FIELD,
- ph = PH_FIELD,
- cond = COND_FIELD,
- )
-
-sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |>
- select(SITE_ID,
- date = DATE_SECCHI,
- sd = SECMEAN,
- clear_to_bottom = CLEAR_TO_BOTTOM
- )
-
-trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |>
- select(SITE_ID,
- visit = VISIT_NO,
- tp = PTL,
- tn = NTL,
- chla = CHLA) |>
- left_join(visitdf, by = c("SITE_ID", "visit")) |>
- select(-year, -visit) |>
- group_by(SITE_ID, date) |>
- summarize(tp = mean(tp, na.rm = TRUE),
- tn = mean(tn, na.rm = TRUE),
- chla = mean(chla, na.rm = TRUE)
- )
-
-
-
-phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |>
- select(SITE_ID,
- date = DATEPHYT,
- depth = SAMPLE_DEPTH,
- phyta = DIVISION,
- genus = GENUS,
- species = SPECIES,
- tax = TAXANAME,
- abund = ABUND) |>
- mutate(phyta = gsub(" .*$", "", phyta)) |>
- filter(!is.na(genus)) |>
- group_by(SITE_ID, date, depth, phyta, genus) |>
- summarize(abund = sum(abund, na.rm = TRUE)) |>
- nest(phytodf = -c(SITE_ID, date))
-
-envdf <- waterchemdf |>
- filter(depth < 2) |>
- select(-depth) |>
- group_by(SITE_ID, date) |>
- summarise_all(~mean(., na.rm = TRUE)) |>
- ungroup() |>
- left_join(sddf, by = c("SITE_ID", "date")) |>
- left_join(trophicdf, by = c("SITE_ID", "date"))
-
-nla <- envdf |>
- left_join(phytodf) |>
- left_join(sitedf, by = "SITE_ID") |>
- filter(!purrr::map_lgl(phytodf, is.null)) |>
- mutate(cyanophyta = purrr::map(phytodf, ~ .x |>
- dplyr::filter(phyta == "Cyanophyta") |>
- summarize(cyanophyta = sum(abund, na.rm = TRUE))
- )) |>
- unnest(cyanophyta) |>
- select(-phyta) |>
- mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
-
-
-# library(rmdify)
-# library(dwfun)
-# dwfun::init()
-
-```
-
-
-## 数据
-
-```{r}
-skimr::skim(nla)
-```
-
-
-
-## 简单模型
-
-```{r}
-nla |>
- filter(tp > 1) |>
- ggplot(aes(tn, tp)) +
-geom_point() +
-geom_smooth(method = "lm") +
-scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
- labels = scales::trans_format("log10", scales::math_format(10^.x))) +
-scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
- labels = scales::trans_format("log10", scales::math_format(10^.x)))
-
-m1 <- lm(log10(tp) ~ log10(tn), data = nla)
-
-summary(m1)
-
-
-```
-
-## 复杂指标
-
-```{r}
-nla |>
- filter(tp > 1) |>
- ggplot(aes(tp, cyanophyta)) +
-geom_point() +
-geom_smooth(method = "lm") +
-scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
- labels = scales::trans_format("log10", scales::math_format(10^.x))) +
-scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
- labels = scales::trans_format("log10", scales::math_format(10^.x)))
-
-m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
-
-summary(m2)
-
-
-```
-
-
-
-
-## tidymodels - Data split
-
-```{r}
-(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax))
-(nla_train <- training(nla_split))
-(nla_test <- testing(nla_split))
-
-```
-
-## tidymodels - recipe
-
-```{r}
-nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom")
-# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
-nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
- recipes::step_string2factor(all_nominal()) |>
- recipes::step_nzv(all_nominal()) |>
- recipes::step_log(chla, cyanophyta, base = 10) |>
- recipes::step_normalize(all_numeric_predictors()) |>
- prep()
-nla_recipe
-```
-
-## tidymodels - cross validation
-
-```{r}
-nla_cv <- recipes::bake(
- nla_recipe,
- new_data = training(nla_split)
- ) |>
- rsample::vfold_cv(v = 10)
-nla_cv
-```
-
-## tidymodels - Model specification
-
-```{r}
-xgboost_model <- parsnip::boost_tree(
- mode = "regression",
- trees = 1000,
- min_n = tune(),
- tree_depth = tune(),
- learn_rate = tune(),
- loss_reduction = tune()
-) |>
- set_engine("xgboost", objective = "reg:squarederror")
-xgboost_model
-```
-
-
-## tidymodels - Grid specification
-
-```{r}
-# grid specification
-xgboost_params <- dials::parameters(
- min_n(),
- tree_depth(),
- learn_rate(),
- loss_reduction()
-)
-xgboost_params
-```
-
-## tidymodels - Grid specification
-
-```{r}
-xgboost_grid <- dials::grid_max_entropy(
- xgboost_params,
- size = 60
-)
-knitr::kable(head(xgboost_grid))
-```
-
-## tidymodels - Workflow
-
-```{r}
-xgboost_wf <- workflows::workflow() |>
- add_model(xgboost_model) |>
- add_formula(nla_formula)
-xgboost_wf
-```
-
-
-## tidymodels - Tune
-
-```{r}
-#| cache: true
-# hyperparameter tuning
-if (FALSE) {
- xgboost_tuned <- tune::tune_grid(
- object = xgboost_wf,
- resamples = nla_cv,
- grid = xgboost_grid,
- metrics = yardstick::metric_set(rmse, rsq, mae),
- control = tune::control_grid(verbose = TRUE)
- )
-saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
-}
-xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
-```
-
-## tidymodels - Best model
-
-```{r}
-xgboost_tuned |>
- tune::show_best(metric = "rmse") |>
- knitr::kable()
-```
-
-
-## tidymodels - Best model
-
-```{r}
-xgboost_tuned |>
- collect_metrics()
-```
-
-
-## tidymodels - Best model
-
-```{r}
-#| fig-width: 9
-#| fig-height: 5
-#| out-width: "100%"
-xgboost_tuned |>
- autoplot()
-```
-
-
-## tidymodels - Best model
-
-
-```{r}
-xgboost_best_params <- xgboost_tuned |>
- tune::select_best("rmse")
-
-knitr::kable(xgboost_best_params)
-```
-
-
-## tidymodels - Final model
-
-```{r}
-xgboost_model_final <- xgboost_model |>
- finalize_model(xgboost_best_params)
-xgboost_model_final
-```
-
-
-## tidymodels - Train evaluation
-
-
-```{r}
-(train_processed <- bake(nla_recipe, new_data = nla_train))
-```
-
-## tidymodels - Train data
-
-```{r}
-train_prediction <- xgboost_model_final |>
- # fit the model on all the training data
- fit(
- formula = nla_formula,
- data = train_processed
- ) |>
- # predict the sale prices for the training data
- predict(new_data = train_processed) |>
- bind_cols(nla_train |>
- mutate(.obs = log10(cyanophyta)))
-xgboost_score_train <-
- train_prediction |>
- yardstick::metrics(.obs, .pred) |>
- mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
-knitr::kable(xgboost_score_train)
-
-```
-
-## tidymodels - train evaluation
-
-```{r}
-#| fig-width: 5
-#| fig-height: 3
-#| out-width: "80%"
-train_prediction |>
- ggplot(aes(.pred, .obs)) +
-geom_point() +
-geom_smooth(method = "lm")
-
-
-```
-
-
-## tidymodels - test data
-
-
-```{r}
-test_processed <- bake(nla_recipe, new_data = nla_test)
-
-test_prediction <- xgboost_model_final |>
- # fit the model on all the training data
- fit(
- formula = nla_formula,
- data = train_processed
- ) |>
- # use the training model fit to predict the test data
- predict(new_data = test_processed) |>
- bind_cols(nla_test |>
- mutate(.obs = log10(cyanophyta)))
-
-# measure the accuracy of our model using `yardstick`
-xgboost_score <- test_prediction |>
- yardstick::metrics(.obs, .pred) |>
- mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
-
-knitr::kable(xgboost_score)
-```
-
-
-## tidymodels - evaluation
-
-```{r}
-#| fig-width: 5
-#| fig-height: 3
-#| out-width: "80%"
-cyanophyta_prediction_residual <- test_prediction |>
- arrange(.pred) %>%
- mutate(residual_pct = (.obs - .pred) / .pred) |>
- select(.pred, residual_pct)
-
-cyanophyta_prediction_residual |>
-ggplot(aes(x = .pred, y = residual_pct)) +
- geom_point() +
- xlab("Predicted Cyanophyta") +
- ylab("Residual (%)")
-```
-
-
-
-
-## tidymodels - test evaluation
-
-```{r}
-#| fig-width: 5
-#| fig-height: 3
-#| out-width: "80%"
-test_prediction |>
- ggplot(aes(.pred, .obs)) +
-geom_point() +
-geom_smooth(method = "lm", colour = "black")
-
-```
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`
diff --git a/SD/5.1_model/mpg-plot.png b/SD/5.1_model/mpg-plot.png
deleted file mode 100644
index de3439b..0000000
Binary files a/SD/5.1_model/mpg-plot.png and /dev/null differ
diff --git a/SD/5.1_model/xgboost_tuned.RDS b/SD/5.1_model/xgboost_tuned.RDS
deleted file mode 100644
index 5d0fe6a..0000000
Binary files a/SD/5.1_model/xgboost_tuned.RDS and /dev/null differ
diff --git a/SD/5.2_大数据分析工具/_extensions b/SD/5.2_大数据分析工具/_extensions
deleted file mode 120000
index 74119e3..0000000
--- a/SD/5.2_大数据分析工具/_extensions
+++ /dev/null
@@ -1 +0,0 @@
-../../_extensions
\ No newline at end of file
diff --git a/SD/5.2_大数据分析工具/index.qmd b/SD/5.2_大数据分析工具/index.qmd
deleted file mode 100644
index db77fce..0000000
--- a/SD/5.2_大数据分析工具/index.qmd
+++ /dev/null
@@ -1,166 +0,0 @@
----
-title: "大数据分析工具"
-subtitle: 《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution
-author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心
-date: today
-lang: zh
-format:
- revealjs:
- theme: dark
- slide-number: true
- chalkboard:
- buttons: true
- preview-links: auto
- lang: zh
- toc: true
- toc-depth: 1
- toc-title: 大纲
- logo: ./_extensions/inst/img/ucaslogo.png
- css: ./_extensions/inst/css/revealjs.css
- pointer:
- key: "p"
- color: "#32cd32"
- pointerSize: 18
-revealjs-plugins:
- - pointer
-filters:
- - d2
----
-
-```{r}
-#| echo: false
-knitr::opts_chunk$set(echo = TRUE)
-source("../../coding/_common.R")
-library(nycflights13)
-library(tidyverse)
-
-```
-
-## 匹配数字
-
-### 匹配数字:
-
-- \d:匹配任意数字字符。
-- \d+:匹配一个或多个数字字符。
-- [0-9]: 匹配数字
-
-### 匹配字母:
-
-- \w:匹配任意字母、数字或下划线字符。
-- \w+:匹配一个或多个字母、数字或下划线字符。
-
-## 匹配数字
-
-### 匹配空白字符:
-
-- \s:匹配任意空白字符,包括空格、制表符、换行符等。
-- \s+:匹配一个或多个空白字符。
-
-### 匹配特定字符:
-
-- [abc]:匹配字符 a、b 或 c 中的任意一个。
-- [a-z]:匹配任意小写字母。
-- [A-Z]:匹配任意大写字母。
-- [0-9]:匹配任意数字。
-
-## 匹配数字
-
-### 匹配重复次数:
-
-- {n}:匹配前一个字符恰好 n 次。
-- {n,}:匹配前一个字符至少 n 次。
-- {n,m}:匹配前一个字符至少 n 次,但不超过 m 次。
-
-### 匹配边界:
-
-- ^:匹配字符串的开头。
-- $:匹配字符串的结尾。
-
-## 匹配数字
-
-### 匹配特殊字符:
-
-- \:转义特殊字符,使其按字面意义匹配。
-- .:匹配任意单个字符。
-- |:表示“或”关系,匹配两个或多个表达式之一。
-
-#### 匹配次数:
-
-- *:匹配前一个字符零次或多次。
-- +:匹配前一个字符一次或多次。
-- ?:匹配前一个字符零次或一次。
-
-## 匹配数字
-
-### 分组和捕获:
-
-- ():将一系列模式组合成一个单元,可与特殊字符一起使用。
-
-### 预定义字符集:
-
-- \d:任意数字,相当于 [0-9]。
-- \w:任意字母、数字或下划线字符,相当于 [a-zA-Z0-9_]。
-- \s:任意空白字符,相当于 [ \t\n\r\f\v]。
-
-
-
-## 实例
-
-```{r}
-library(babynames)
-(x <- c("apple", "apppple", "abc123def"))
-x[str_detect(x, "[0-9]")]
-x[str_detect(x, "abc[0-9]+")]
-x[str_detect(x, "pp")]
-x[str_detect(x, "p{4}")]
-x[str_detect(x, "p{4}")]
-x[str_detect("apple", "ap*")]
-x[str_detect("apple", "app*")]
-x[str_detect("apple", "a..le")]
-```
-
-## 练习
-
-
-找出`babyname`中名字含有ar的行
-
-```{r}
-#| echo: false
-babynames |>
- filter(str_detect(name, "ar"))
-```
-
-## 练习
-
-
-找出`babyname`中名字含有ar或者以ry结尾的行。
-
-```{r}
-#| echo: false
-babynames |>
- filter(str_detect(name, "ar"))
-```
-
-
-
-
-## GNU/Linux服务器
-
-- `ssh`, `scp`
-- `bash`
- - grep
- - sed
- - awk
- - find
- - xargs
-- `Editor`
- - `Virtual Studio Code`
- - `Vim`
- - `Emacs`
-
-
-
-## 欢迎讨论!{.center}
-
-
-`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")`