L10

L8
add L7.qmd
2026-06-04 15:55:13 +08:00 · 2026-05-28 13:08:20 +08:00 · 2026-05-26 13:43:04 +08:00 · 2026-05-26 13:42:53 +08:00
23 changed files with 674809 additions and 2 deletions
@@ -45809,3 +45809,43 @@ barcante2020cyanobacteria
    year = {2008}
 }

+@Article{tatters2025benthic,
+    title       = {Benthic cyanobacterial accumulations and associated cyanotoxins in coastal urban stormwater pond networks},
+    author      = {Tatters, Avery O. and Clevenger, Courtney and Strangman, Wendy K. and Oehrle, Stuart and Kudela, Raphael M. and Aukamp, Jessica and Wan, Yongshan},
+    year        = 2025,
+    journal     = {Harmful Algae},
+    publisher   = {Elsevier BV},
+    volume      = 144,
+    pages       = 102833,
+    url         = {http://dx.doi.org/10.1016/j.hal.2025.102833},
+    issn        = {1568-9883},
+    doi         = {10.1016/j.hal.2025.102833}
+}
+
+@Article{maurer2024temporal,
+    title       = {Temporal Dynamics of Cyanobacterial Bloom Community Composition and Toxin Production from Urban Lakes},
+    author      = {Maurer, Julie A. and Xia, Runjie and Kim, Andrew M. and Oblie, Nana and Hefferan, Sierra and Xie, Hannuo and Slitt, Angela and Jenkins, Bethany D. and Bertin, Matthew J.},
+    year        = 2024,
+    journal     = {ACS ES&amp;T Water},
+    publisher   = {American Chemical Society (ACS)},
+    volume      = 4,
+    pages       = {3423–3432},
+    url         = {http://dx.doi.org/10.1021/acsestwater.4c00266},
+    issn        = {2690-0637},
+    doi         = {10.1021/acsestwater.4c00266},
+    number      = 8}
+
+@article{paerl2016duelling,
+   author = {Paerl, Hans W. and Otten, Timothy G.},
+   title = {Duelling ‘CyanoHABs’: unravelling the environmental drivers controlling dominance and succession among diazotrophic and non-N2-fixing harmful cyanobacteria},
+   journal = {Environmental Microbiology},
+   volume = {18},
+   number = {2},
+   pages = {316-324},
+   doi = {10.1111/1462-2920.13035},
+   url = {https://enviromicro-journals.onlinelibrary.wiley.com/doi/abs/10.1111/1462-2920.13035},
+   eprint = {https://enviromicro-journals.onlinelibrary.wiley.com/doi/pdf/10.1111/1462-2920.13035},
+   year = {2016}
+}
+
+
@@ -34,4 +34,4 @@ website:
          - text: Codes
            url: https://git.drwater.net/{{< var projtype >}}/{{< var reponame >}}/src/branch/{{< var branch >}}
          - text: Issue
-            url: https://git.drwater.net/manuscript/{{< var reponame >}}/issues
+            url: https://git.drwater.net/{{< var projtype >}}/{{< var reponame >}}/issues
@@ -33,4 +33,4 @@ website:
          - text: Codes
            url: https://git.drwater.net/{{< var projtype >}}/{{< var reponame >}}/src/branch/{{< var branch >}}
          - text: Issue
-            url: https://git.drwater.net/manuscript/{{< var reponame >}}/issues
+            url: https://git.drwater.net/{{< var projtype >}}/{{< var reponame >}}/issues
@@ -0,0 +1,617 @@
+# 实践部分
+
+
+## Kriging
+
+### Code
+
+```{r}
+#| echo: true
+#| eval: false
+#| out-width: 50%
+require(sf)
+require(sp)
+require(gstat)
+require(raster)
+require(ggplot2)
+# 读取太湖边界
+lake_boundary <- st_read("../../data/taihu.shp") |>
+  sf::st_make_valid()
+main_water <- st_difference(
+  lake_boundary[lake_boundary$Name == "boundary", ],
+  st_union(lake_boundary[lake_boundary$Name != "boundary", ])
+)
+
+get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
+  insf <- indf |>
+    sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
+    dplyr::select(-c("long", "lat")) |>
+    dplyr::rename(Y = tidyselect::all_of(Yvarname))
+  if (is.null(grid)) {
+    # 1. 将sf边界转为terra格式
+    v <- terra::vect(boundsf)
+    # 2. 创建基础网格
+    base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
+    # 3. 创建高分辨率区域（例如边界附近）
+    buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
+    hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
+    # 4. 合并网格
+    final_grid <- merge(base_grid, hi_res_grid)
+    # 5. 转为点并裁剪
+    grid <- terra::as.points(final_grid) |>
+      sf::st_as_sf() |>
+      sf::st_filter(main_water)
+  }
+  insp <- sf::as_Spatial(insf)
+  fit <- automap::autofitVariogram(Y ~ 1, insp)
+  # 克里金插值
+  m <- gstat::gstat(
+    formula = Y ~ 1,
+    data = insp,
+    model = fit$var_model
+  )
+  predsf <- predict(m, grid) |>
+    sf::st_as_sf()
+  outsf <- predsf |>
+    st_coordinates() |>
+    as.data.frame() |>
+    cbind(pred = predsf$var1.pred)
+  return(outsf)
+}
+
+
+wqdf <- readxl::read_xlsx("../data/wqdata.xlsx")
+
+if (FALSE) {
+  wqsf <- wqdf |>
+    nest(datedf = -date) |>
+    dplyr::mutate(
+      krigingsf = purrr::map(datedf, \(x) {
+        get_kriging(x, "conc", boundsf = main_water)
+      })
+    )
+  saveRDS(wqsf, "./wqsf.rds")
+}
+
+wqsf <- readRDS("./wqsf.rds")
+
+wqsf |>
+  unnest(krigingsf) |>
+  ggplot(aes(X, Y)) +
+  geom_contour_filled(aes(z = pred), bins = 20) +
+  geom_sf(
+    data = main_water,
+    aes(x = NULL, y = NULL),
+    fill = NA,
+    colour = "black",
+    linewidth = 1
+  ) +
+  # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
+  # ggsci::scale_fill_aaas() +
+  coord_sf() +
+  theme_void()
+```
+
+### Output
+
+```{r}
+#| echo: false
+#| out-width: 80%
+#| message: false
+#| fig-width: 6
+#| fig-height: 3
+require(sf)
+require(sp)
+require(gstat)
+require(raster)
+require(ggplot2)
+# 读取太湖边界
+lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |>
+  sf::st_make_valid()
+main_water <- st_difference(
+  lake_boundary[lake_boundary$Name == "boundary", ],
+  st_union(lake_boundary[lake_boundary$Name != "boundary", ])
+)
+
+get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
+  insf <- indf |>
+    sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
+    dplyr::select(-c("long", "lat")) |>
+    dplyr::rename(Y = tidyselect::all_of(Yvarname))
+  if (is.null(grid)) {
+    # 1. 将sf边界转为terra格式
+    v <- terra::vect(boundsf)
+    # 2. 创建基础网格
+    base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
+    # 3. 创建高分辨率区域（例如边界附近）
+    buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
+    hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
+    # 4. 合并网格
+    final_grid <- merge(base_grid, hi_res_grid)
+    # 5. 转为点并裁剪
+    grid <- terra::as.points(final_grid) |>
+      sf::st_as_sf() |>
+      sf::st_filter(main_water)
+  }
+  insp <- sf::as_Spatial(insf)
+  fit <- automap::autofitVariogram(Y ~ 1, insp)
+  # 克里金插值
+  m <- gstat::gstat(
+    formula = Y ~ 1,
+    data = insp,
+    model = fit$var_model
+  )
+  predsf <- predict(m, grid) |>
+    sf::st_as_sf()
+  outsf <- predsf |>
+    st_coordinates() |>
+    as.data.frame() |>
+    cbind(pred = predsf$var1.pred)
+  return(outsf)
+}
+
+
+wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx")
+
+if (FALSE) {
+  wqsf <- wqdf |>
+    nest(datedf = -date) |>
+    dplyr::mutate(
+      krigingsf = purrr::map(datedf, \(x) {
+        get_kriging(x, "conc", boundsf = main_water)
+      })
+    )
+  saveRDS(wqsf, "./wqsf.rds")
+}
+wqsf <- readRDS("./wqsf.rds")
+
+wqsf |>
+  unnest(krigingsf) |>
+  ggplot(aes(X, Y)) +
+  geom_contour_filled(aes(z = pred), bins = 20) +
+  geom_sf(
+    data = main_water,
+    aes(x = NULL, y = NULL),
+    fill = NA,
+    colour = "black",
+    linewidth = 1
+  ) +
+  # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
+  # ggsci::scale_fill_aaas() +
+  coord_sf() +
+  theme_void()
+```
+
+:::
+
+
+## 数据
+
+```{r}
+require(tidyverse)
+sitedf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
+) |>
+  select(
+    SITE_ID,
+    lon = LON_DD,
+    lat = LAT_DD,
+    name = LAKENAME,
+    area = LAKEAREA,
+    zmax = DEPTHMAX
+  ) |>
+  group_by(SITE_ID) |>
+  summarize(
+    lon = mean(lon, na.rm = TRUE),
+    lat = mean(lat, na.rm = TRUE),
+    name = unique(name),
+    area = mean(area, na.rm = TRUE),
+    zmax = mean(zmax, na.rm = TRUE)
+  )
+
+
+visitdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
+) |>
+  select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
+  distinct()
+
+
+waterchemdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATE_PROFILE,
+    depth = DEPTH,
+    temp = TEMP_FIELD,
+    do = DO_FIELD,
+    ph = PH_FIELD,
+    cond = COND_FIELD,
+  )
+
+sddf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATE_SECCHI,
+    sd = SECMEAN,
+    clear_to_bottom = CLEAR_TO_BOTTOM
+  )
+
+trophicdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
+) |>
+  select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
+  left_join(visitdf, by = c("SITE_ID", "visit")) |>
+  select(-year, -visit) |>
+  group_by(SITE_ID, date) |>
+  summarize(
+    tp = mean(tp, na.rm = TRUE),
+    tn = mean(tn, na.rm = TRUE),
+    chla = mean(chla, na.rm = TRUE)
+  )
+
+
+phytodf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATEPHYT,
+    depth = SAMPLE_DEPTH,
+    phyta = DIVISION,
+    genus = GENUS,
+    species = SPECIES,
+    tax = TAXANAME,
+    abund = ABUND
+  ) |>
+  mutate(phyta = gsub(" .*$", "", phyta)) |>
+  filter(!is.na(genus)) |>
+  group_by(SITE_ID, date, depth, phyta, genus) |>
+  summarize(abund = sum(abund, na.rm = TRUE)) |>
+  nest(phytodf = -c(SITE_ID, date))
+
+envdf <- waterchemdf |>
+  filter(depth < 2) |>
+  select(-depth) |>
+  group_by(SITE_ID, date) |>
+  summarise_all(~ mean(., na.rm = TRUE)) |>
+  ungroup() |>
+  left_join(sddf, by = c("SITE_ID", "date")) |>
+  left_join(trophicdf, by = c("SITE_ID", "date"))
+
+nla <- envdf |>
+  left_join(phytodf) |>
+  left_join(sitedf, by = "SITE_ID") |>
+  filter(!purrr::map_lgl(phytodf, is.null)) |>
+  mutate(
+    cyanophyta = purrr::map(
+      phytodf,
+      ~ .x |>
+        dplyr::filter(phyta == "Cyanophyta") |>
+        summarize(cyanophyta = sum(abund, na.rm = TRUE))
+    )
+  ) |>
+  unnest(cyanophyta) |>
+  select(-phyta) |>
+  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
+
+# library(rmdify)
+# library(dwfun)
+# dwfun::init()
+```
+
+
+## 数据
+
+```{r}
+skimr::skim(nla)
+```
+
+
+
+## 简单模型
+
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tn, tp)) +
+  geom_point() +
+  geom_smooth(method = "lm") +
+  scale_x_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  )
+
+m1 <- lm(log10(tp) ~ log10(tn), data = nla)
+
+summary(m1)
+```
+
+## 复杂指标
+
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tp, cyanophyta)) +
+  geom_point() +
+  geom_smooth(method = "lm") +
+  scale_x_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  )
+
+m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
+
+summary(m2)
+```
+
+
+
+
+## tidymodels - Data split
+
+```{r}
+require(tidymodels)
+(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax))
+(nla_train <- training(nla_split))
+(nla_test <- testing(nla_split))
+```
+
+## tidymodels - recipe
+
+```{r}
+nla_formula <- as.formula(
+  "cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom"
+)
+# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
+nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
+  recipes::step_string2factor(all_nominal()) |>
+  recipes::step_nzv(all_nominal()) |>
+  recipes::step_log(chla, cyanophyta, base = 10) |>
+  recipes::step_normalize(all_numeric_predictors()) |>
+  prep()
+nla_recipe
+```
+
+## tidymodels - cross validation
+
+```{r}
+nla_cv <- recipes::bake(
+  nla_recipe,
+  new_data = training(nla_split)
+) |>
+  rsample::vfold_cv(v = 10)
+nla_cv
+```
+
+## tidymodels - Model specification
+
+```{r}
+xgboost_model <- parsnip::boost_tree(
+  mode = "regression",
+  trees = 1000,
+  min_n = tune(),
+  tree_depth = tune(),
+  learn_rate = tune(),
+  loss_reduction = tune()
+) |>
+  set_engine("xgboost", objective = "reg:squarederror")
+xgboost_model
+```
+
+
+## tidymodels - Grid specification
+
+```{r}
+# grid specification
+xgboost_params <- dials::parameters(
+  min_n(),
+  tree_depth(),
+  learn_rate(),
+  loss_reduction()
+)
+xgboost_params
+```
+
+## tidymodels - Grid specification
+
+```{r}
+xgboost_grid <- dials::grid_max_entropy(
+  xgboost_params,
+  size = 60
+)
+knitr::kable(head(xgboost_grid))
+```
+
+## tidymodels - Workflow
+
+```{r}
+xgboost_wf <- workflows::workflow() |>
+  add_model(xgboost_model) |>
+  add_formula(nla_formula)
+xgboost_wf
+```
+
+
+## tidymodels - Tune
+
+```{r}
+#| cache: true
+# hyperparameter tuning
+if (FALSE) {
+  xgboost_tuned <- tune::tune_grid(
+    object = xgboost_wf,
+    resamples = nla_cv,
+    grid = xgboost_grid,
+    metrics = yardstick::metric_set(rmse, rsq, mae),
+    control = tune::control_grid(verbose = TRUE)
+  )
+  saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
+}
+xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
+```
+
+## tidymodels - Best model
+
+```{r}
+xgboost_tuned |>
+  tune::show_best(metric = "rmse") |>
+  knitr::kable()
+```
+
+
+## tidymodels - Best model
+
+```{r}
+xgboost_tuned |>
+  collect_metrics()
+```
+
+
+## tidymodels - Best model
+
+```{r}
+#| fig-width: 9
+#| fig-height: 5
+#| out-width: "100%"
+xgboost_tuned |>
+  autoplot()
+```
+
+
+## tidymodels - Best model
+
+
+```{r}
+xgboost_best_params <- xgboost_tuned |>
+  tune::select_best(metric = "rmse")
+
+knitr::kable(xgboost_best_params)
+```
+
+
+## tidymodels - Final model
+
+```{r}
+xgboost_model_final <- xgboost_model |>
+  finalize_model(xgboost_best_params)
+xgboost_model_final
+```
+
+
+## tidymodels - Train evaluation
+
+
+```{r}
+(train_processed <- bake(nla_recipe, new_data = nla_train))
+```
+
+## tidymodels - Train data
+
+```{r}
+train_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula,
+    data = train_processed
+  ) |>
+  # predict the sale prices for the training data
+  predict(new_data = train_processed) |>
+  bind_cols(
+    nla_train |>
+      mutate(.obs = log10(cyanophyta))
+  )
+xgboost_score_train <-
+  train_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+knitr::kable(xgboost_score_train)
+```
+
+## tidymodels - train evaluation
+
+```{r}
+#| fig-width: 5
+#| fig-height: 3
+#| out-width: "80%"
+train_prediction |>
+  ggplot(aes(.pred, .obs)) +
+  geom_point() +
+  geom_smooth(method = "lm")
+```
+
+
+## tidymodels - test data
+
+
+```{r}
+test_processed <- bake(nla_recipe, new_data = nla_test)
+
+test_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula,
+    data = train_processed
+  ) |>
+  # use the training model fit to predict the test data
+  predict(new_data = test_processed) |>
+  bind_cols(
+    nla_test |>
+      mutate(.obs = log10(cyanophyta))
+  )
+
+# measure the accuracy of our model using `yardstick`
+xgboost_score <- test_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+
+knitr::kable(xgboost_score)
+```
+
+
+## tidymodels - evaluation
+
+```{r}
+#| fig-width: 5
+#| fig-height: 3
+#| out-width: "80%"
+cyanophyta_prediction_residual <- test_prediction |>
+  arrange(.pred) %>%
+  mutate(residual_pct = (.obs - .pred) / .pred) |>
+  select(.pred, residual_pct)
+
+cyanophyta_prediction_residual |>
+  ggplot(aes(x = .pred, y = residual_pct)) +
+  geom_point() +
+  xlab("Predicted Cyanophyta") +
+  ylab("Residual (%)")
+```
+
+
+
+
+## tidymodels - test evaluation
+
+```{r}
+#| fig-width: 5
+#| fig-height: 3
+#| out-width: "80%"
+test_prediction |>
+  ggplot(aes(.pred, .obs)) +
+  geom_point() +
+  geom_smooth(method = "lm", colour = "black")
+```
+
+
+
@@ -0,0 +1,285 @@
+---
+title: "RWEP L7- Data Transform"
+---
+
+# Data Source
+
+
+```{r}
+dailylanddf <- readr::read_csv("../data/chinawq/daily_land.csv")
+metadatadf <- readxl::read_xls("../data/chinawq/metadata_and_statistics.xls")
+monthlylanddf <- readr::read_csv("../data/chinawq/monthly_ocean.csv")
+weeklylanddf <- readr::read_csv("../data/chinawq/weekly_land.csv")
+```
+
+
+```{r}
+require(tidyverse)
+dailylanddf
+
+
+dailylanddf |>
+  select(.data$MonitoringLocationIdentifier, .data$LongitudeMeasure_WGS84)
+
+dailylanddf |>
+  select("MonitoringLocationIdentifier", "LongitudeMeasure_WGS84")
+
+
+dailylanddf |>
+  select(1:3)
+
+
+dailylanddf |>
+  rename(x = 1) |>
+  rename(y = 2)
+```
+
+
+
+# 内置function
+
+
+```{r}
+
+dailylanddf |>
+  select(1:3) |>
+  rename_all(\(x) c("a", "b", "c")) |>
+  rename_all(\(x) paste0(x, "_new"))
+
+dailylanddf |>
+  select(1:3) |>
+  rename_all(~ c("a", "b", "c")) |>
+  rename_all(~ paste0(., "_new"))
+
+
+dailylanddf |>
+  select(1:3) |>
+  rename_all(~ c("x", "y", "z"))
+```
+
+```{r}
+fulldatadf <- readr::read_csv("../data/chinawq/full_dataset.csv")
+
+datadf <- fulldatadf |>
+  select(lon = 2, lat = 3, date = 4, varname = 5, value = 6) |>
+  mutate(value = as.numeric(value)) |>
+  tidyr::pivot_wider(names_from = varname, values_from = value)
+
+saveRDS(datadf, "../data/chinawq/datadf.rds")
+```
+
+
+# 统计
+
+## 统计每月的DO的平均值
+
+```{r}
+datadf <- readRDS("../data/chinawq/datadf.rds")
+
+datadf |>
+  select(1:7) |>
+  mutate(month = month(date)) |>
+  select(lon, lat, month, date, everything()) |>
+  group_by(month) |>
+  summarize(DO_mean = mean(DO, na.rm = TRUE), DO_sd = sd(DO, na.rm = TRUE)) |>
+  ggplot(aes(x = month, y = DO_mean)) +
+  geom_point() +
+  geom_smooth(method = "loess") +
+  geom_errorbar(aes(ymin = DO_mean - DO_sd, ymax = DO_mean + DO_sd))
+```
+
+
+```{r}
+datadf |>
+  select(1:7) |>
+  mutate(month = month(date)) |>
+  mutate(year = year(date)) |>
+  select(lon, lat, year, month, date, everything()) |>
+  summarize(
+    DO_mean = mean(DO, na.rm = TRUE),
+    DO_sd = sd(DO, na.rm = TRUE),
+    .by = c(year, month)
+  ) |>
+  ggplot(aes(x = month, y = DO_mean)) +
+  geom_point() +
+  geom_smooth(method = "loess") +
+  geom_errorbar(aes(ymin = DO_mean - DO_sd, ymax = DO_mean + DO_sd)) +
+  facet_wrap(~year)
+```
+
+
+```{r}
+datadf |>
+  select(1:7) |>
+  mutate(month = month(date)) |>
+  mutate(year = year(date)) |>
+  select(-lon, -lat, -date) |>
+  group_by(year, month) |>
+  summarize_all(\(x) mean(x, na.rm = TRUE))
+```
+
+## nest
+
+```{r}
+
+for (isite in 1:10) {
+  m <- datadf |>
+    select(1:7) |>
+    tidyr::nest(wqdf = -c(lon, lat)) |>
+    pull(wqdf) |>
+    nth(isite) |>
+    lm(formula = DO ~ pH, data = _)
+  slope <- coef(m)[2]
+  print(slope)
+}
+```
+
+## purrr::map
+
+```{r}
+require(tidyverse)
+datadf <- readRDS("../data/chinawq/datadf.rds")
+datadf |>
+  select(1:7) |>
+  tidyr::nest(wqdf = -c(lon, lat)) |>
+  slice(1:10) |>
+  mutate(
+    m = purrr::map(
+      wqdf,
+      \(x) {
+        lm(formula = DO ~ pH, data = x)
+      }
+    )
+  ) |>
+  mutate(slope = purrr::map_dbl(m, \(x) coef(x)[2])) |>
+  mutate(
+    p = purrr::map2(wqdf, slope, \(x, y) {
+      x |>
+        ggplot(aes(x = pH, y = DO)) +
+        geom_smooth(method = "lm") +
+        geom_point() +
+        ggtitle(paste0("Slope: ", round(y, 2)))
+    })
+  ) |>
+  pull(p) |>
+  patchwork::wrap_plots() +
+  patchwork::plot_layout(ncol = 2)
+```
+
+
+```{r}
+longriverdf <- tibble(
+  fn = dir(
+    "../data/LongRiver/data/",
+    pattern = "csv",
+    full.names = TRUE,
+    recursive = TRUE
+  )
+) |>
+  mutate(data = purrr::map(fn, readr::read_csv))
+
+lang <- "cn"
+longriverdf |>
+  mutate(year = gsub("^.*([0-9)]{4})-.*$", "\\1", fn)) |>
+  mutate(month = gsub("^.*[0-9)]{4}-([0-9]{2}).*$", "\\1", fn)) |>
+  mutate(site = gsub("^.*_(.*).csv$", "\\1", fn)) |>
+  select(-fn) |>
+  slice(1:10) |>
+  tidyr::unnest(data) |>
+  ggplot(aes(x = site, y = z)) +
+  geom_boxplot(colour = "black", size = 0.8) +
+  facet_wrap(~month) +
+  dwfun::theme_sci()
+```
+
+```{r}
+longriverdf |>
+  mutate(year = gsub("^.*([0-9)]{4})-.*$", "\\1", fn)) |>
+  mutate(month = gsub("^.*[0-9)]{4}-([0-9]{2}).*$", "\\1", fn)) |>
+  mutate(site = gsub("^.*_(.*).csv$", "\\1", fn)) |>
+  select(-fn) |>
+  slice(1:12) |>
+  mutate(
+    p = purrr::map(data, \(x) {
+      x |>
+        ggplot(aes(tm, z)) +
+        geom_point()
+    })
+  ) |>
+  pull(p) |>
+  patchwork::wrap_plots() |>
+  patchwork::plot_layout(ncol = 3)
+```
+
+```{r}
+# datadf
+# 不同年份，DO 的平均值，
+# x lon
+# y lat
+# colour/ fill Do mean
+# facet: year
+datadf |>
+  dplyr::mutate(year = year(date)) |>
+  summarize(DO_mean = mean(DO, na.rm = TRUE), .by = c(lon, lat, year)) |>
+  dplyr::filter(year > 2006) |>
+  ggplot(aes(x = lon, y = lat)) +
+  geom_point(aes(fill = DO_mean), shape = 21) +
+  scale_fill_viridis_c() +
+  facet_wrap(~year) +
+  labs(x = "Longitude", y = "Latitude", fill = "DO (mg/L)") +
+  theme_bw()
+```
+
+
+```{r}
+# x  月份
+# y NH4: pH
+
+p1 <- datadf |>
+  mutate(month = month(date)) |>
+  dplyr::filter(NH4N < 100) |>
+  ggplot(aes(x = month, y = NH4N)) +
+  geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format(
+      "log10",
+      scales::math_format(10^.x, format = "%.1f")
+    )
+  ) +
+  scale_x_continuous(breaks = 1:12, labels = month.abb) +
+  theme(legend.position = "none")
+p2 <- datadf |>
+  mutate(month = month(date)) |>
+  ggplot(aes(x = month, y = CODMn)) +
+  geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format(
+      "log10",
+      scales::math_format(10^.x, format = "%.1f")
+    )
+  ) +
+  scale_x_continuous(breaks = 1:12, labels = month.abb) +
+  theme(legend.position = "none")
+p3 <- datadf |>
+  mutate(month = month(date)) |>
+  dplyr::filter(DO < 20) |>
+  ggplot(aes(x = month, y = DO)) +
+  geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) +
+  scale_x_continuous(breaks = 1:12, labels = month.abb) +
+  theme(legend.position = "none")
+p4 <- datadf |>
+  mutate(month = month(date)) |>
+  dplyr::filter(pH > 5) |>
+  ggplot(aes(x = month, y = pH)) +
+  geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) +
+  theme(legend.position = "none")
+
+(p1 | p2) / (p3 | p4) + patchwork::plot_annotation(tag_levels = "A")
+```
+
+
+
+
+
@@ -0,0 +1,202 @@
+---
+title: "L9 - Data Virtualization"
+format: html
+editor: visual
+---
+
+# Load Data
+
+```{r}
+# getwd()
+# setwd("coding")
+require(tidyverse)
+datadf <- readRDS("../data/chinawq/datadf.rds")
+```
+
+# Data view
+
+```{r}
+skimr::skim(datadf)
+
+head(datadf)
+
+tail(datadf)
+
+names(datadf)
+
+summary(datadf)
+
+str(datadf)
+```
+
+# Plot
+
+```{r}
+#| warning: false
+#| message: false
+# 每月NH4N与pH的相关性散点图
+p <- datadf |>
+  dplyr::filter(NH4N < 100) |>
+  dplyr::filter(between(year(date), 2016, 2019)) |>
+  mutate(month = month(date)) |>
+  ggplot(aes(CODMn, NH4N)) +
+  geom_point(shape = 21, size = 0.8, fill = "orange") +
+  geom_smooth(method = "gam", color = "red") +
+  scale_x_log10() +
+  scale_y_log10() +
+  labs(
+    x = "COD<sub>Mn</sub> (mg L<sup>-1</sup>)",
+    y = "Ammonia (mg L<sup>-1</sup>)"
+  ) +
+  facet_wrap(~month, scale = "free", ncol = 4) +
+  theme(
+    axis.title.x = ggtext::element_markdown(),
+    axis.title.y = ggtext::element_markdown()
+  )
+print(p)
+
+plotly::ggplotly(p)
+```
+
+```{r}
+#| warning: false
+#| message: false
+# 每月NH4N与pH的相关性散点图
+p <- datadf |>
+  dplyr::filter(NH4N < 100) |>
+  dplyr::filter(between(year(date), 2016, 2019)) |>
+  mutate(month = month(date)) |>
+  ggplot(aes(factor(month), NH4N)) +
+  geom_jitter(size = 0.1, colour = "gray95") +
+  geom_violin(fill = "orange", alpha = 0.3) +
+  scale_y_log10() +
+  labs(x = "Month", y = "Ammonia (mg L<sup>-1</sup>)") +
+  theme_classic() +
+  theme(axis.title.y = ggtext::element_markdown())
+print(p)
+
+plotly::ggplotly(p)
+```
+
+# Plotly
+
+```{r}
+
+p <- datadf |>
+  dplyr::filter(year(date) == 2018) |>
+  ggplot(aes(date, NH4N)) +
+  geom_point()
+
+ggsave("L9-1.pdf", width = 4, height = 3)
+
+# install.packages("plotly")
+plotly::ggplotly(p)
+```
+
+
+# Map - sf
+
+```{r}
+# install.packages("sf")
+# install.packages("sfext")
+
+require(sf)
+require(sfext)
+
+chinawqsf <- sf::read_sf(
+  "../data/chinawq/Monitoring_sites/Monitoring_sites.shp"
+)
+
+chinawqsf |>
+  ggplot() +
+  geom_sf(shape = 21, size = 1, fill = "orange")
+
+chinamapsf <- sf::read_sf("../data/中国省级地图GS（2019）1719号.geojson")
+ninelinesf <- sf::read_sf("../data/九段线GS（2019）1719号.geojson")
+
+chinamapsf <- sf::read_sf(
+  "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS（2019）1719号.geojson"
+)
+ninelinesf <- sf::read_sf(
+  "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS（2019）1719号.geojson"
+)
+
+chinacrs <- "+proj=laea +lat_0=40 +lon_0=104"
+
+mapeR::get_chinacrs()
+
+chinawqsf |>
+  ggplot() +
+  geom_sf(data = chinamapsf) +
+  geom_sf(data = ninelinesf) +
+  geom_sf(shape = 21, size = 1, fill = "orange") +
+  coord_sf(datum = chinacrs)
+```
+
+
+```{r}
+require(mapeR)
+map_chinese()
+```
+
+```{r}
+chinamapsf <- sf::read_sf("../data/中国省级地图GS（2019）1719号.geojson")
+ninelinesf <- sf::read_sf("../data/九段线GS（2019）1719号.geojson")
+
+chinamapsf <- sf::read_sf(
+  "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS（2019）1719号.geojson"
+)
+ninelinesf <- sf::read_sf(
+  "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS（2019）1719号.geojson"
+)
+chinacrs <- "+proj=laea +lat_0=40 +lon_0=104"
+
+# 安装
+install.packages("ggspatial")
+
+datadf |>
+  sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
+  sf::st_transform(crs = chinacrs) |>
+  dplyr::filter(!is.na(CODMn)) |>
+  ggplot() +
+  geom_sf(data = chinamapsf, aes(fill = CNAME), alpha = 0.2) +
+  geom_sf(data = ninelinesf) +
+  geom_sf(aes(colour = log1p(CODMn))) +
+  labs(x = NULL, y = NULL, fill = NULL, colour = "CODMn") +
+  scale_colour_viridis_c() +
+  ggspatial::annotation_north_arrow(location = "tl") +
+  ggspatial::annotation_scale(location = "bl") +
+  theme(legend.position = "none")
+
+
+mapeR::map_chinese
+```
+
+# 具体某个湖泊
+
+```{r}
+sf::read_sf("../data/taihu.shp") |>
+  ggplot() +
+  geom_sf()
+
+
+sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml")
+
+st_layers("~/Desktop/qingcaosha.kml")
+
+sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml")
+boundsf <- sf::read_sf("~/Desktop/qingcaosha.kml", layer = "qingcaosha")
+
+ggplot() +
+  geom_sf(data = boundsf) +
+  geom_sf(data = sitesf) +
+  geom_sf_text(data = sitesf, aes(label = Name), vjust = 1, size = 3)
+```
+
+
+
+
+
+
+
+
@@ -0,0 +1 @@
+UTF-8
@@ -0,0 +1 @@
+GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]
Author	SHA1	Message	Date
ming	028ef1c471	L10	2026-06-04 15:55:13 +08:00
ming	7aeb071fba	L8	2026-05-28 13:08:20 +08:00
ming	29daf1b240	add L7.qmd	2026-05-26 13:43:04 +08:00
ming	982f26736f	Update Ref.bib	2026-05-26 13:42:53 +08:00
				`@@ -0,0 +1 @@`
				`GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]`