diff --git a/SD/20240328_9_课后作业/airqualitymedianoutrow5.pdf b/SD/20240328_9_课后作业/airqualitymedianoutrow5.pdf new file mode 100644 index 0000000..face837 Binary files /dev/null and b/SD/20240328_9_课后作业/airqualitymedianoutrow5.pdf differ diff --git a/SD/20240328_9_课后作业/freq.pdf b/SD/20240328_9_课后作业/freq.pdf new file mode 100644 index 0000000..446ee8c Binary files /dev/null and b/SD/20240328_9_课后作业/freq.pdf differ diff --git a/SD/20240328_9_课后作业/index.qmd b/SD/20240328_9_课后作业/index.qmd index e566d0d..637df89 100644 --- a/SD/20240328_9_课后作业/index.qmd +++ b/SD/20240328_9_课后作业/index.qmd @@ -4,6 +4,9 @@ subtitle: 《区域水环境污染数据分析实践》
Data analysis practic author: 苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 date: today lang: zh +resources: + - "*.pdf" + - "*.sas" format: revealjs: theme: dark @@ -42,6 +45,19 @@ require(learnr) 作业模板:[第8次课后作业_模板.qmd](https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/main/SD/20240328_9_课后作业/第8次课后作业_模板.qmd) +## 示例代码 + +### 基于R的示例结果 + +- [第8次课后作业R示例代码结果](./第8次课后作业_模板.html) + +### 基于SAS的示例结果 + +- [第8次课后作业SAS示例代码](./第8次课后作业_模板.sas) +- [第8次课后作业SAS示例结果1](./median.pdf) +- [第8次课后作业SAS示例结果2](./freq.pdf) +- [第8次课后作业SAS示例结果3](./airqualitymedianoutrow5.pdf) +- [第8次课后作业SAS示例结果4](./npar1wayConover.pdf) ## 欢迎讨论!{.center} diff --git a/SD/20240328_9_课后作业/median.pdf b/SD/20240328_9_课后作业/median.pdf new file mode 100644 index 0000000..eafe394 Binary files /dev/null and b/SD/20240328_9_课后作业/median.pdf differ diff --git a/SD/20240328_9_课后作业/npar1wayConover.pdf b/SD/20240328_9_课后作业/npar1wayConover.pdf new file mode 100644 index 0000000..2cc2998 Binary files /dev/null and b/SD/20240328_9_课后作业/npar1wayConover.pdf differ diff --git a/SD/20240328_9_课后作业/第8次课后作业_模板.qmd b/SD/20240328_9_课后作业/第8次课后作业_模板.qmd index 4a09163..dfc598c 100644 --- a/SD/20240328_9_课后作业/第8次课后作业_模板.qmd +++ b/SD/20240328_9_课后作业/第8次课后作业_模板.qmd @@ -4,24 +4,34 @@ author: 姓名 format: html --- -# 下载airquality.xlsx,并读取数据 +# 数据 + +下载airquality.xlsx,并读取数据。 ```{r} -#| eval: false -#| execute: false +#| message: false +#| warning: false # 下载至临时文件 -tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx") -download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx", - destfile = tmpxlsxpath) -airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2) -metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1) +if (FALSE) { + tmpxlsxpath <- file.path(tempdir(), "airquality.xlsx") + download.file("https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/PUB/data/airquality.xlsx", + destfile = tmpxlsxpath) + airqualitydf <- readxl::read_xlsx(tmpxlsxpath, sheet = 2) + metadf <- readxl::read_xlsx(tmpxlsxpath, sheet = 1) + saveRDS(airqualitydf, "./airqualitydf.RDS") + saveRDS(metadf, "./metadf.RDS") +} +airqualitydf <- readRDS("./airqualitydf.RDS") +metadf <- readRDS("./metadf.RDS") ``` -# 根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。 +# 描述统计 + +根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。 ```{r} -#| eval: false -#| execute: false +#| message: false +#| warning: false require(tidyverse) airqualitydf |> select(datetime, site, AQI) |> @@ -49,70 +59,90 @@ airqualitydf |> filter(length(unique(site)) >= 5) |> summarize(p = sum(AQI < quantile(airqualitydf$AQI, probs = 0.2, na.rm = TRUE)) / n()) |> - slice_max(p, n = 10) -``` - - -# 按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。 - -```{r} -#| eval: false - -if (FALSE) { - -require(infer) -testdf <- airqualitydf |> - select(datetime, site, AQI) |> - filter(!is.na(AQI)) |> - left_join(metadf |> select(site, city = Area)) |> - group_by(city) |> - filter(length(unique(site)) >= 5) |> - mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"), - levels = c("day", "night")) - ) |> - group_by(city) |> - nest(citydf = -city) |> - mutate(median_diff = purrr::map_dbl(citydf, ~ - .x |> - specify(AQI ~ dayornight) |> - calculate(stat = "diff in medians", order = c("day", "night")) |> - pull(stat) -)) |> - ungroup() |> -# slice_sample(n = 12) |> - mutate(null_dist = purrr::map(citydf, ~ - .x |> - specify(AQI ~ dayornight) |> - hypothesize(null = "independence") |> - generate(reps = 1000, type = "permute") |> - calculate(stat = "diff in medians", order = c("day", "night")) -)) |> - mutate(fig = purrr::pmap(list(null_dist, median_diff, city), - ~ visualize(..1) + - shade_p_value(obs_stat = ..2, direction = "both") + - ggtitle(..3) - )) |> - mutate(p_value = purrr::map2_dbl(null_dist, median_diff, - ~ get_p_value(.x, obs_stat = .y, direction = "both") |> - pull(p_value) - )) |> - arrange(p_value) |> - mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) - -testdf |> - select(city, sigdiff) |> + slice_max(p, n = 10) |> knitr::kable() - lang <- "cn" -(testdf |> - slice_sample(n = 9) |> - pull(fig)) |> -patchwork::wrap_plots(ncol = 3) + -dwfun::theme_sci(5, 5) - -dwfun::ggsavep("./testdf.pdf") - -} + +``` + + +# 统计检验 + +按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。 + +```{r} +#| message: false +#| warning: false + +if (FALSE) { + require(infer) + require(tidyverse) + testdf <- airqualitydf |> + select(datetime, site, AQI) |> + filter(!is.na(AQI)) |> + left_join(metadf |> select(site, city = Area)) |> + group_by(city) |> + filter(length(unique(site)) >= 5) |> + mutate(dayornight = factor(ifelse(between(hour(datetime), 8, 20), "day", "night"), + levels = c("day", "night")) + ) |> + group_by(city) |> + nest(citydf = -city) |> + mutate(median_diff = purrr::map_dbl(citydf, ~ + .x |> + specify(AQI ~ dayornight) |> + calculate(stat = "diff in medians", order = c("day", "night")) |> + pull(stat) + )) |> + ungroup() |> + # slice_sample(n = 12) |> + mutate(null_dist = purrr::map(citydf, ~ + .x |> + specify(AQI ~ dayornight) |> + hypothesize(null = "independence") |> + generate(reps = 1000, type = "permute") |> + calculate(stat = "diff in medians", order = c("day", "night")) + )) |> + mutate(p_value = purrr::map2_dbl(null_dist, median_diff, + ~ get_p_value(.x, obs_stat = .y, direction = "both") |> + pull(p_value) + )) |> + mutate(sigdiff = ifelse(p_value < 0.01, "显著差异", "无显著差异")) |> + mutate(fig = purrr::pmap(list(null_dist, median_diff, city, sigdiff), + ~ visualize(..1) + + shade_p_value(obs_stat = ..2, direction = "both") + + ggtitle(paste0(..3, ":", ..4)) + + theme_sci(2, 2) + )) |> + arrange(p_value) + saveRDS(testdf, "./testdf.RDS") +} + +lang <- "cn" +require(dwfun) +require(rmdify) +require(drwateR) +dwfun::init() +rmdify::rmd_init() + +testdf <- readRDS("./testdf.RDS") +require(tidyverse) +testdf |> + select(city, median_diff, p_value, sigdiff) |> + knitr::kable() +testdf |> + mutate(grp = (row_number() - 1)%/% 12) |> + group_by(grp) |> + nest(grpdf = -grp) |> + ungroup() |> +# slice(1) |> + mutate(gp = purrr::map(grpdf, + ~(.x |> + pull(fig)) |> + patchwork::wrap_plots(ncol = 3) + + dwfun::theme_sci(5, 7))) |> + pull(gp) + ``` diff --git a/SD/20240328_9_课后作业/第8次课后作业_模板.sas b/SD/20240328_9_课后作业/第8次课后作业_模板.sas new file mode 100644 index 0000000..9cdaf41 --- /dev/null +++ b/SD/20240328_9_课后作业/第8次课后作业_模板.sas @@ -0,0 +1,289 @@ +options ls=256 ps=32767 nodate validmemname=extend validvarname=any; + +title 'The SAS System'; + +%macro print(d); + proc print data=&d;run; + %mend; +%macro printobs(d,obs); + proc print data=&d (obs=&obs);run; + %mend; +%macro printfirstobsobs(d,firstobs,obs); + proc print data=&d (firstobs=&firstobs obs=&obs);run; + %mend; +%macro contents(d); + proc contents data=&d varnum;run; + %mend; +%macro contentsshort(d); + proc contents data=&d varnum short;run; + %mend; +%macro save_dataset(d); + data "d:&d"; + set &d; + run; + %mend; +%macro load_dataset(d); + data &d; + set "d:&d"; + run; + %mend; +%macro kill; + PROC DATASETS LIB=work KILL;RUN;quit; + %mend; +proc template; + list styles; +run; + + + +%kill; +PROC IMPORT OUT=WORK.raw_metadf + DATAFILE="d:airquality.xlsx" + DBMS=EXCEL REPLACE; + RANGE="metadf$"; + GETNAMES=YES; + MIXED=YES; + SCANTEXT=YES; + USEDATE=NO; + SCANTIME=NO; +RUN; + %print(raw_metadf); + %save_dataset(raw_metadf); *ԭʼݼ; + +PROC IMPORT OUT=WORK.raw_airqualitydf + DATAFILE="d:airquality.xlsx" + DBMS=EXCEL REPLACE; + RANGE="airqualitydf$"; + GETNAMES=YES; + MIXED=YES; + SCANTEXT=YES; + USEDATE=NO; *ΪYES; + SCANTIME=NO; *ΪYES; +RUN; + %print(raw_airqualitydf); + %save_dataset(raw_airqualitydf); *ԭʼݼ; + + +%kill; +/*Ӳԭʼݼ*/ +%load_dataset(raw_metadf); +%load_dataset(raw_airqualitydf); +/*鿴ݼ*/ +%contents(raw_metadf); +%contentsshort(raw_metadf); +/*site name Area lon lat*/ +%contents(raw_airqualitydf); +%contentsshort(raw_airqualitydf); +/*datetime site 'CO_mg/m3'n 'CO_24h_mg/m3'n 'NO2_g/m3'n 'NO2_24h_g/m3'n 'O3_g/m3'n 'O3_24h_g/m3'n 'O3_8h_g/m3'n 'O3_8h_24h_g/m3'n 'PM10_g/m3'n 'PM10_24h_g/m3'n 'PM2#5_g/m3'n 'PM2#5_24h_g/m3'n 'SO2_g/m3'n 'SO2_24h_g/m3'n AQI PrimaryPollutant Quality Unheathful*/ + +%printobs(raw_metadf,10); +%printobs(raw_airqualitydf,10); +proc sort data=raw_metadf out=metadfsorted; + by site; +run; +proc sort data=raw_airqualitydf out=airqualitydfsorted; + by site; +run; +/*ϲݼԤȡڡʱ䲿֣daynight*/ +data airquality; + retain datetime date time DayNight site name Area AQI lon lat; + length DayNight $ 5; + merge metadfsorted airqualitydfsorted; + by site; + date=datepart(datetime); + time=timepart(datetime); + if '8:00't<=time<'20:00't then DayNight='day'; + else DayNight='night'; + format datetime e8601dt25. date yymmdd10. time time5.; + keep site name Area lon lat datetime date time DayNight AQI; +run; + %printobs(airquality,100); + %save_dataset(airquality); *ϲݼ; + + + +%kill; +/*#################### DATA SET airquality ####################*/ +/*Ӳ̺ϲݼ*/ +%load_dataset(airquality); +%printobs(airquality,100); + +/*sitenameǷһ£ֲһ£siteͳ*/ +proc sql; + select count(distinct(site)) as count_site from airquality; + select count(distinct(name)) as count_name from airquality; +quit; +/* +count_site +1714 +count_name +1522 +*/ + + +/*#########################################################################*/ +/*ͳư죨8:00-20:00ҹ20:00-8:00пָAQIλ*/ +/*#########################################################################*/ +/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/ +ods pdf file='d:means.pdf' style=sapphire dpi=1200; +proc means data=airquality median maxdec=1; + class site DayNight; + var AQI; + where AQI is not missing; +run; +ods pdf close; +/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@*/ + + +/*####################################################################################################*/ +/*ͳƵвAQI30%λֵIJռȣгռߵ10УDz5ijУ*/ +/*####################################################################################################*/ +/*鿴вAQI30%λֵΪSQL֤ȥ*/ +proc univariate data=airquality noprint; + var AQI; + output out=airqualitystats pctlpts=30 pctlpre=P; +run; + %print(airqualitystats); /*42*/ + +/*вAQI30%λֵ*/ +proc sql; + select AQI into : xvalues separated by ',' from airquality; + select distinct(pctl(30, &xvalues)) into : P30 from airquality; +quit; +/*鿴вAQI30%λֵĺֵΪ*/ + %put P30=&P30.; + +/*вAQI30%λֵAQIּԺϲݼвֱӷּøλͳƣȥ*/ +data airquality1; + set airquality; + if AQI<&P30. then quality='good'; + else quality='fair'; +run; + %printobs(airquality1,100); + +/*вAQIλ*/ +proc means data=airquality median maxdec=1; + class Area site; + var AQI; + where AQI is not missing; + output out=airqualitymedian median=; +run; + %print(airqualitymedian); + +/*вAQI30%λֵAQIλּ*/ +data airqualitymedian1; + set airqualitymedian; + if AQI<&P30. then quality='good'; + else quality='fair'; + where _TYPE_=3; +run; + %print(airqualitymedian1); + +/*ͳƵвAQI30%λֵIJռȣ鿴*/ +/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/ +ods pdf file='d:freq.pdf' style=sapphire dpi=1200; +proc freq data=airqualitymedian1; + table Area*quality /nocol nopercent; +run; +ods pdf close; +/*@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@*/ +/*ͳƵвAQI30%λֵIJռȣƵͳƽݼ*/ +proc freq data=airqualitymedian1; + table Area*quality /outpct out=airqualitymedianoutrow(drop=percent pct_col); *Ƶаٷֱ; +run; + %printobs(airqualitymedianoutrow,100); + +/*жԲͳƣ鿴ȥ*/ +proc means data=airqualitymedianoutrow sum maxdec=0; + class Area; + var COUNT; +run; +/*5ijУ鿴ȥ*/ +proc sql; + select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc; +quit; +/*5ijУݼ*/ +proc sql; + create table airqualitymedianoutrow5 as select *,sum(COUNT) as total_COUNT from airqualitymedianoutrow group by Area having calculated total_COUNT>=5 order by quality desc,PCT_ROW desc,COUNT desc; +quit; +/*гռߵ10У5ijУ*/ +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ +ods pdf file='d:airqualitymedianoutrow5.pdf' style=sapphire dpi=1200; + %printobs(airqualitymedianoutrow5,10); + %printobs(airqualitymedianoutrow5,20); + %printobs(airqualitymedianoutrow5,30); + %print(airqualitymedianoutrow5); +ods pdf close; +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@*/ + + +/*#####################################################*/ +/*ղͬз飬ͳưҹAQIλǷ*/ +/*#####################################################*/ +/*еsiteûAreaӦ*/ +proc sql; + select distinct(Area),count(distinct(Area)) as count_Area from airquality; +quit; +proc print data=airquality; + where Area is missing; +run; +proc sql; + select distinct(site),count(distinct(site)) as count_site from airquality where Area is missing; +quit; +/*4site㣩ûAreaУӦ +site count_site +2628A 4 +3128A 4 +4034A 4 +4036A 4 +*/ +proc sort data=airquality out=airqualitysorted; + by Area; +run; +/*ղͬз飬ͳưҹAQIλ鿴ȥ*/ +proc means data=airqualitysorted median maxdec=1; + by Area; + class DayNight; + var AQI; + where AQI is not missing; +run; +/*ͳؿҹAQIλǷ*/ +proc npar1way data=airquality median; + class DayNight; + var AQI; +run; +/*ղͬз飬ͳưҹAQIλǷ*/ +/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/ +ods pdf file='d:npar1waymedian.pdf' style=sapphire dpi=1200; +proc npar1way data=airqualitysorted median; + class DayNight; + var AQI; + by Area; + where Area is not missing; +run; +ods pdf close; +/*@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/ +/* +Using Wilcoxon scores in the linear rank statistic for two-sample data produces the rank sum statistic of the Mann-Whitney-Wilcoxon test. +Using Wilcoxon scores in the one-way ANOVA statistic produces the Kruskal-Wallis test. +Wilcoxon scores are locally most powerful for location shifts of a logistic distribution. +*//* +Using median scores in the linear rank statistic for two-sample data produces the two-sample median test. +The one-way ANOVA statistic with median scores is equivalent to the Brown-Mood test. +Median scores are particularly powerful for distributions that are symmetric and heavy-tailed.*/ + +/* +Scores for Linear Rank and One-Way ANOVA Tests +For each score type that you specify, PROC NPAR1WAY computes a one-way ANOVA statistic and also a linear rank statistic for two-sample data. The following score types are used primarily to test for differences in location: Wilcoxon, median, Van der Waerden (normal), and Savage. The following scores types are used to test for scale differences: Siegel-Tukey, Ansari-Bradley, Klotz, and Mood. Conover scores can be used to test for differences in both location and scale. This section gives formulas for the score types available in PROC NPAR1WAY. For further information about the formulas and the applicability of each score, see Randles and Wolfe (1979), Gibbons and Chakraborti (2010), Conover (1999), and Hollander and Wolfe (1999). +In addition to the score types described in this section, you can specify the SCORES=DATA option to use the input data observations as scores. This enables you to produce a wide variety of tests. You can construct any scores by using the DATA step, and then you can use PROC NPAR1WAY to compute the corresponding linear rank and one-way ANOVA tests for these scores. You can also analyze raw (unscored) data by using the SCORES=DATA option; for two-sample data, the corresponding exact test is a permutation test that is known as Pitmans test. +*/ +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/ +ods pdf file='d:npar1wayConover.pdf' style=sapphire dpi=1200; +proc npar1way data=airqualitysorted Conover; + class DayNight; + var AQI; + by Area; + where Area is not missing; +run; +/*@@@@@@@@@@@@@@@@@@@@@@@@@@@ PDF @@@@@@@@@@@@@@@@@@@@@@@@@*/ +/*Conover scores can be used to test for differences in both location and scale.*/ diff --git a/_quarto.yml b/_quarto.yml index c6b52a9..efd4286 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -24,7 +24,7 @@ website: page-navigation: true page-footer: "Copyright 2024, [Ming Su](https://drwater.rcees.ac.cn)" navbar: - background: "light" + background: "grey" search: true right: - icon: house diff --git a/coding/lesson9.qmd b/coding/lesson9.qmd new file mode 100644 index 0000000..56c1017 --- /dev/null +++ b/coding/lesson9.qmd @@ -0,0 +1,80 @@ +--- +title: 第9次课练习 +--- + +```{r} +require(tidyverse) +# require(ggplot2) +require(palmerpenguins) + +penguins |> + select(species, flipper_length_mm, body_mass_g, + bill_length_mm, sex) |> + filter(!is.na(sex)) |> + ggplot(aes( + x = species, + y = body_mass_g + )) + +geom_jitter(colour = "white", + fill = "gray80", size = 1, shape = 21, width = 0.4) + +geom_violin(fill = "orange", alpha = 0.4) + +stat_summary(fun = mean) + +scale_size_continuous(range = c(.5, 2)) + +# ggsci::scale_fill_d3() + +ylab("Body mass (g)") + +ggtitle("Body mass and flipper length") + +theme(axis.text.x = element_text(angle = 90), + panel.background = element_rect(fill = NA) +) + +ggplot2::last_plot() + + +ggsave("../img/penguins.pdf", width = 4, heigh = 3) + ++ +dwfun::theme_sci(5, 5) + + +p1 <- ggplot(penguins, aes(x = island, fill = species)) + + geom_bar(position = "fill") +p2 <- ggplot(penguins, aes(x = species, fill = island)) + + geom_bar(position = "fill") + +require(patchwork) +((p1 / p2) | p1) + + + + +``` + + + +# 消光系数 + +```{r} +licordata <- readRDS("../data/licordata.RDS") +require(tidyverse) + +licordata |> + group_by(date, site) |> + nest(.key = "lightdf") |> + mutate(m = purrr::map( + lightdf, + ~ (lm(log(UP + 1) ~ depth, data = .x)) + )) |> + mutate(k = purrr::map_dbl(m, ~ + -coef(.x)[2])) |> + ggplot(aes(date, k)) + +geom_point(aes(fill = site)) + +Iz = I0 * exp(-zk) + + + + + + +``` + diff --git a/data/writexldemo.xlsx b/data/writexldemo.xlsx index f4fa197..8906d98 100644 Binary files a/data/writexldemo.xlsx and b/data/writexldemo.xlsx differ diff --git a/img/penguins.pdf b/img/penguins.pdf new file mode 100644 index 0000000..1959dd8 Binary files /dev/null and b/img/penguins.pdf differ