2024-03-25 21:48:30 +08:00
|
|
|
|
---
|
2024-03-28 08:15:02 +08:00
|
|
|
|
title: 课后作业8
|
2024-03-25 21:48:30 +08:00
|
|
|
|
author: 姓名
|
|
|
|
|
format: html
|
|
|
|
|
---
|
|
|
|
|
|
2024-04-05 12:19:23 +08:00
|
|
|
|
# 数据
|
|
|
|
|
|
|
|
|
|
下载airquality.xlsx,并读取数据。
|
2024-03-25 21:48:30 +08:00
|
|
|
|
|
|
|
|
|
```{r}
|
2024-04-05 12:19:23 +08:00
|
|
|
|
#| message: false
|
|
|
|
|
#| warning: false
|
2024-03-28 12:25:13 +08:00
|
|
|
|
# 下载至临时文件
|
2024-04-05 12:19:23 +08:00
|
|
|
|
if (FALSE) {
|
|
|
|
|
tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx")
|
|
|
|
|
download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx",
|
|
|
|
|
destfile = tmpxlsxpath)
|
|
|
|
|
airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2)
|
|
|
|
|
metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1)
|
|
|
|
|
saveRDS(airqualitydf, "./airqualitydf.RDS")
|
|
|
|
|
saveRDS(metadf, "./metadf.RDS")
|
|
|
|
|
}
|
|
|
|
|
airqualitydf <- readRDS("./airqualitydf.RDS")
|
|
|
|
|
metadf <- readRDS("./metadf.RDS")
|
2024-03-25 21:48:30 +08:00
|
|
|
|
```
|
|
|
|
|
|
2024-04-05 12:19:23 +08:00
|
|
|
|
# 描述统计
|
|
|
|
|
|
|
|
|
|
根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。
|
2024-03-25 21:48:30 +08:00
|
|
|
|
|
2024-04-02 02:04:22 +08:00
|
|
|
|
```{r}
|
2024-04-05 12:19:23 +08:00
|
|
|
|
#| message: false
|
|
|
|
|
#| warning: false
|
2024-04-02 02:04:22 +08:00
|
|
|
|
require(tidyverse)
|
|
|
|
|
airqualitydf |>
|
|
|
|
|
select(datetime, site, AQI) |>
|
|
|
|
|
filter(!is.na(AQI)) |>
|
|
|
|
|
group_by(site) |>
|
|
|
|
|
summarize(AQI.median = median(AQI, na.rm = TRUE)) |>
|
|
|
|
|
left_join(metadf |> select(site, city = Area)) |>
|
|
|
|
|
group_by(city) |>
|
|
|
|
|
filter(n() > 5) |>
|
|
|
|
|
summarize(p = sum(AQI.median < quantile(airqualitydf$AQI, probs = 0.5, na.rm = TRUE)) / n()) |>
|
|
|
|
|
top_n(10, p)
|
|
|
|
|
|
2024-03-25 21:48:30 +08:00
|
|
|
|
|
2024-04-02 02:04:22 +08:00
|
|
|
|
airqualitydf |>
|
|
|
|
|
select(datetime, site, AQI) |>
|
|
|
|
|
filter(!is.na(AQI)) |>
|
|
|
|
|
group_by(site) |>
|
|
|
|
|
summarize(AQI.median = median(AQI, na.rm = TRUE))
|
2024-03-25 21:48:30 +08:00
|
|
|
|
|
2024-04-02 02:04:22 +08:00
|
|
|
|
airqualitydf |>
|
|
|
|
|
select(datetime, site, AQI) |>
|
|
|
|
|
filter(!is.na(AQI)) |>
|
|
|
|
|
left_join(metadf |> select(site, city = Area)) |>
|
|
|
|
|
group_by(city) |>
|
|
|
|
|
filter(length(unique(site)) >= 5) |>
|
|
|
|
|
summarize(p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2,
|
|
|
|
|
na.rm = TRUE)) / n()) |>
|
2024-04-05 12:19:23 +08:00
|
|
|
|
slice_max(p, n = 10) |>
|
|
|
|
|
knitr::kable()
|
|
|
|
|
|
|
|
|
|
|
2024-04-02 02:04:22 +08:00
|
|
|
|
```
|
2024-03-28 12:25:13 +08:00
|
|
|
|
|
|
|
|
|
|
2024-04-05 12:19:23 +08:00
|
|
|
|
# 统计检验
|
|
|
|
|
|
|
|
|
|
按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。
|
2024-04-02 02:04:22 +08:00
|
|
|
|
|
|
|
|
|
```{r}
|
2024-04-05 12:19:23 +08:00
|
|
|
|
#| message: false
|
|
|
|
|
#| warning: false
|
2024-04-02 02:04:22 +08:00
|
|
|
|
|
|
|
|
|
if (FALSE) {
|
2024-04-05 12:19:23 +08:00
|
|
|
|
require(infer)
|
|
|
|
|
require(tidyverse)
|
|
|
|
|
testdf <- airqualitydf |>
|
|
|
|
|
select(datetime, site, AQI) |>
|
|
|
|
|
filter(!is.na(AQI)) |>
|
|
|
|
|
left_join(metadf |> select(site, city = Area)) |>
|
|
|
|
|
group_by(city) |>
|
|
|
|
|
filter(length(unique(site)) >= 5) |>
|
|
|
|
|
mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"),
|
|
|
|
|
levels = c("day", "night"))
|
|
|
|
|
) |>
|
|
|
|
|
group_by(city) |>
|
|
|
|
|
nest(citydf = -city) |>
|
|
|
|
|
mutate(median_diff = purrr::map_dbl(citydf, ~
|
|
|
|
|
.x |>
|
|
|
|
|
specify(AQI ~ dayornight) |>
|
|
|
|
|
calculate(stat = "diff in medians", order = c("day", "night")) |>
|
|
|
|
|
pull(stat)
|
|
|
|
|
)) |>
|
|
|
|
|
ungroup() |>
|
|
|
|
|
# slice_sample(n = 12) |>
|
|
|
|
|
mutate(null_dist = purrr::map(citydf, ~
|
|
|
|
|
.x |>
|
|
|
|
|
specify(AQI ~ dayornight) |>
|
|
|
|
|
hypothesize(null = "independence") |>
|
|
|
|
|
generate(reps = 1000, type = "permute") |>
|
|
|
|
|
calculate(stat = "diff in medians", order = c("day", "night"))
|
|
|
|
|
)) |>
|
|
|
|
|
mutate(p_value = purrr::map2_dbl(null_dist, median_diff,
|
|
|
|
|
~ get_p_value(.x, obs_stat = .y, direction = "both") |>
|
|
|
|
|
pull(p_value)
|
|
|
|
|
)) |>
|
|
|
|
|
mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |>
|
|
|
|
|
mutate(fig = purrr::pmap(list(null_dist, median_diff, city, sigdiff),
|
|
|
|
|
~ visualize(..1) +
|
|
|
|
|
shade_p_value(obs_stat = ..2, direction = "both") +
|
|
|
|
|
ggtitle(paste0(..3, ":", ..4)) +
|
|
|
|
|
theme_sci(2, 2)
|
|
|
|
|
)) |>
|
|
|
|
|
arrange(p_value)
|
|
|
|
|
saveRDS(testdf, "./testdf.RDS")
|
|
|
|
|
}
|
2024-04-02 02:04:22 +08:00
|
|
|
|
|
2024-04-05 12:19:23 +08:00
|
|
|
|
lang <- "cn"
|
|
|
|
|
require(dwfun)
|
|
|
|
|
require(rmdify)
|
|
|
|
|
require(drwateR)
|
|
|
|
|
dwfun::init()
|
|
|
|
|
rmdify::rmd_init()
|
2024-04-02 02:04:22 +08:00
|
|
|
|
|
2024-04-05 12:19:23 +08:00
|
|
|
|
testdf <- readRDS("./testdf.RDS")
|
|
|
|
|
require(tidyverse)
|
2024-04-02 02:04:22 +08:00
|
|
|
|
testdf |>
|
2024-04-05 12:19:23 +08:00
|
|
|
|
select(city, median_diff, p_value, sigdiff) |>
|
|
|
|
|
knitr::kable()
|
|
|
|
|
testdf |>
|
|
|
|
|
mutate(grp = (row_number() - 1)%/% 12) |>
|
|
|
|
|
group_by(grp) |>
|
|
|
|
|
nest(grpdf = -grp) |>
|
|
|
|
|
ungroup() |>
|
|
|
|
|
# slice(1) |>
|
|
|
|
|
mutate(gp = purrr::map(grpdf,
|
|
|
|
|
~(.x |>
|
|
|
|
|
pull(fig)) |>
|
|
|
|
|
patchwork::wrap_plots(ncol = 3) +
|
|
|
|
|
dwfun::theme_sci(5, 7))) |>
|
|
|
|
|
pull(gp)
|
2024-04-02 02:04:22 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|