diff --git a/coding/L10/main.qmd b/coding/L10/main.qmd new file mode 100644 index 0000000..7e4b5e2 --- /dev/null +++ b/coding/L10/main.qmd @@ -0,0 +1,617 @@ +# 实践部分 + + +## Kriging + +### Code + +```{r} +#| echo: true +#| eval: false +#| out-width: 50% +require(sf) +require(sp) +require(gstat) +require(raster) +require(ggplot2) +# 读取太湖边界 +lake_boundary <- st_read("../../data/taihu.shp") |> + sf::st_make_valid() +main_water <- st_difference( + lake_boundary[lake_boundary$Name == "boundary", ], + st_union(lake_boundary[lake_boundary$Name != "boundary", ]) +) + +get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) { + insf <- indf |> + sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |> + dplyr::select(-c("long", "lat")) |> + dplyr::rename(Y = tidyselect::all_of(Yvarname)) + if (is.null(grid)) { + # 1. 将sf边界转为terra格式 + v <- terra::vect(boundsf) + # 2. 创建基础网格 + base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率 + # 3. 创建高分辨率区域(例如边界附近) + buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区 + hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001) + # 4. 合并网格 + final_grid <- merge(base_grid, hi_res_grid) + # 5. 转为点并裁剪 + grid <- terra::as.points(final_grid) |> + sf::st_as_sf() |> + sf::st_filter(main_water) + } + insp <- sf::as_Spatial(insf) + fit <- automap::autofitVariogram(Y ~ 1, insp) + # 克里金插值 + m <- gstat::gstat( + formula = Y ~ 1, + data = insp, + model = fit$var_model + ) + predsf <- predict(m, grid) |> + sf::st_as_sf() + outsf <- predsf |> + st_coordinates() |> + as.data.frame() |> + cbind(pred = predsf$var1.pred) + return(outsf) +} + + +wqdf <- readxl::read_xlsx("../data/wqdata.xlsx") + +if (FALSE) { + wqsf <- wqdf |> + nest(datedf = -date) |> + dplyr::mutate( + krigingsf = purrr::map(datedf, \(x) { + get_kriging(x, "conc", boundsf = main_water) + }) + ) + saveRDS(wqsf, "./wqsf.rds") +} + +wqsf <- readRDS("./wqsf.rds") + +wqsf |> + unnest(krigingsf) |> + ggplot(aes(X, Y)) + + geom_contour_filled(aes(z = pred), bins = 20) + + geom_sf( + data = main_water, + aes(x = NULL, y = NULL), + fill = NA, + colour = "black", + linewidth = 1 + ) + + # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变 + # ggsci::scale_fill_aaas() + + coord_sf() + + theme_void() +``` + +### Output + +```{r} +#| echo: false +#| out-width: 80% +#| message: false +#| fig-width: 6 +#| fig-height: 3 +require(sf) +require(sp) +require(gstat) +require(raster) +require(ggplot2) +# 读取太湖边界 +lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |> + sf::st_make_valid() +main_water <- st_difference( + lake_boundary[lake_boundary$Name == "boundary", ], + st_union(lake_boundary[lake_boundary$Name != "boundary", ]) +) + +get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) { + insf <- indf |> + sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |> + dplyr::select(-c("long", "lat")) |> + dplyr::rename(Y = tidyselect::all_of(Yvarname)) + if (is.null(grid)) { + # 1. 将sf边界转为terra格式 + v <- terra::vect(boundsf) + # 2. 创建基础网格 + base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率 + # 3. 创建高分辨率区域(例如边界附近) + buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区 + hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001) + # 4. 合并网格 + final_grid <- merge(base_grid, hi_res_grid) + # 5. 转为点并裁剪 + grid <- terra::as.points(final_grid) |> + sf::st_as_sf() |> + sf::st_filter(main_water) + } + insp <- sf::as_Spatial(insf) + fit <- automap::autofitVariogram(Y ~ 1, insp) + # 克里金插值 + m <- gstat::gstat( + formula = Y ~ 1, + data = insp, + model = fit$var_model + ) + predsf <- predict(m, grid) |> + sf::st_as_sf() + outsf <- predsf |> + st_coordinates() |> + as.data.frame() |> + cbind(pred = predsf$var1.pred) + return(outsf) +} + + +wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx") + +if (FALSE) { + wqsf <- wqdf |> + nest(datedf = -date) |> + dplyr::mutate( + krigingsf = purrr::map(datedf, \(x) { + get_kriging(x, "conc", boundsf = main_water) + }) + ) + saveRDS(wqsf, "./wqsf.rds") +} +wqsf <- readRDS("./wqsf.rds") + +wqsf |> + unnest(krigingsf) |> + ggplot(aes(X, Y)) + + geom_contour_filled(aes(z = pred), bins = 20) + + geom_sf( + data = main_water, + aes(x = NULL, y = NULL), + fill = NA, + colour = "black", + linewidth = 1 + ) + + # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变 + # ggsci::scale_fill_aaas() + + coord_sf() + + theme_void() +``` + +::: + + +## 数据 + +```{r} +require(tidyverse) +sitedf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv" +) |> + select( + SITE_ID, + lon = LON_DD, + lat = LAT_DD, + name = LAKENAME, + area = LAKEAREA, + zmax = DEPTHMAX + ) |> + group_by(SITE_ID) |> + summarize( + lon = mean(lon, na.rm = TRUE), + lat = mean(lat, na.rm = TRUE), + name = unique(name), + area = mean(area, na.rm = TRUE), + zmax = mean(zmax, na.rm = TRUE) + ) + + +visitdf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" +) |> + select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |> + distinct() + + +waterchemdf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" +) |> + select( + SITE_ID, + date = DATE_PROFILE, + depth = DEPTH, + temp = TEMP_FIELD, + do = DO_FIELD, + ph = PH_FIELD, + cond = COND_FIELD, + ) + +sddf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv" +) |> + select( + SITE_ID, + date = DATE_SECCHI, + sd = SECMEAN, + clear_to_bottom = CLEAR_TO_BOTTOM + ) + +trophicdf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv" +) |> + select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |> + left_join(visitdf, by = c("SITE_ID", "visit")) |> + select(-year, -visit) |> + group_by(SITE_ID, date) |> + summarize( + tp = mean(tp, na.rm = TRUE), + tn = mean(tn, na.rm = TRUE), + chla = mean(chla, na.rm = TRUE) + ) + + +phytodf <- readr::read_csv( + "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv" +) |> + select( + SITE_ID, + date = DATEPHYT, + depth = SAMPLE_DEPTH, + phyta = DIVISION, + genus = GENUS, + species = SPECIES, + tax = TAXANAME, + abund = ABUND + ) |> + mutate(phyta = gsub(" .*$", "", phyta)) |> + filter(!is.na(genus)) |> + group_by(SITE_ID, date, depth, phyta, genus) |> + summarize(abund = sum(abund, na.rm = TRUE)) |> + nest(phytodf = -c(SITE_ID, date)) + +envdf <- waterchemdf |> + filter(depth < 2) |> + select(-depth) |> + group_by(SITE_ID, date) |> + summarise_all(~ mean(., na.rm = TRUE)) |> + ungroup() |> + left_join(sddf, by = c("SITE_ID", "date")) |> + left_join(trophicdf, by = c("SITE_ID", "date")) + +nla <- envdf |> + left_join(phytodf) |> + left_join(sitedf, by = "SITE_ID") |> + filter(!purrr::map_lgl(phytodf, is.null)) |> + mutate( + cyanophyta = purrr::map( + phytodf, + ~ .x |> + dplyr::filter(phyta == "Cyanophyta") |> + summarize(cyanophyta = sum(abund, na.rm = TRUE)) + ) + ) |> + unnest(cyanophyta) |> + select(-phyta) |> + mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE)) + +# library(rmdify) +# library(dwfun) +# dwfun::init() +``` + + +## 数据 + +```{r} +skimr::skim(nla) +``` + + + +## 简单模型 + +```{r} +nla |> + filter(tp > 1) |> + ggplot(aes(tn, tp)) + + geom_point() + + geom_smooth(method = "lm") + + scale_x_log10( + breaks = scales::trans_breaks("log10", function(x) 10^x), + labels = scales::trans_format("log10", scales::math_format(10^.x)) + ) + + scale_y_log10( + breaks = scales::trans_breaks("log10", function(x) 10^x), + labels = scales::trans_format("log10", scales::math_format(10^.x)) + ) + +m1 <- lm(log10(tp) ~ log10(tn), data = nla) + +summary(m1) +``` + +## 复杂指标 + +```{r} +nla |> + filter(tp > 1) |> + ggplot(aes(tp, cyanophyta)) + + geom_point() + + geom_smooth(method = "lm") + + scale_x_log10( + breaks = scales::trans_breaks("log10", function(x) 10^x), + labels = scales::trans_format("log10", scales::math_format(10^.x)) + ) + + scale_y_log10( + breaks = scales::trans_breaks("log10", function(x) 10^x), + labels = scales::trans_format("log10", scales::math_format(10^.x)) + ) + +m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla) + +summary(m2) +``` + + + + +## tidymodels - Data split + +```{r} +require(tidymodels) +(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax)) +(nla_train <- training(nla_split)) +(nla_test <- testing(nla_split)) +``` + +## tidymodels - recipe + +```{r} +nla_formula <- as.formula( + "cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom" +) +# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn") +nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |> + recipes::step_string2factor(all_nominal()) |> + recipes::step_nzv(all_nominal()) |> + recipes::step_log(chla, cyanophyta, base = 10) |> + recipes::step_normalize(all_numeric_predictors()) |> + prep() +nla_recipe +``` + +## tidymodels - cross validation + +```{r} +nla_cv <- recipes::bake( + nla_recipe, + new_data = training(nla_split) +) |> + rsample::vfold_cv(v = 10) +nla_cv +``` + +## tidymodels - Model specification + +```{r} +xgboost_model <- parsnip::boost_tree( + mode = "regression", + trees = 1000, + min_n = tune(), + tree_depth = tune(), + learn_rate = tune(), + loss_reduction = tune() +) |> + set_engine("xgboost", objective = "reg:squarederror") +xgboost_model +``` + + +## tidymodels - Grid specification + +```{r} +# grid specification +xgboost_params <- dials::parameters( + min_n(), + tree_depth(), + learn_rate(), + loss_reduction() +) +xgboost_params +``` + +## tidymodels - Grid specification + +```{r} +xgboost_grid <- dials::grid_max_entropy( + xgboost_params, + size = 60 +) +knitr::kable(head(xgboost_grid)) +``` + +## tidymodels - Workflow + +```{r} +xgboost_wf <- workflows::workflow() |> + add_model(xgboost_model) |> + add_formula(nla_formula) +xgboost_wf +``` + + +## tidymodels - Tune + +```{r} +#| cache: true +# hyperparameter tuning +if (FALSE) { + xgboost_tuned <- tune::tune_grid( + object = xgboost_wf, + resamples = nla_cv, + grid = xgboost_grid, + metrics = yardstick::metric_set(rmse, rsq, mae), + control = tune::control_grid(verbose = TRUE) + ) + saveRDS(xgboost_tuned, "./xgboost_tuned.RDS") +} +xgboost_tuned <- readRDS("./xgboost_tuned.RDS") +``` + +## tidymodels - Best model + +```{r} +xgboost_tuned |> + tune::show_best(metric = "rmse") |> + knitr::kable() +``` + + +## tidymodels - Best model + +```{r} +xgboost_tuned |> + collect_metrics() +``` + + +## tidymodels - Best model + +```{r} +#| fig-width: 9 +#| fig-height: 5 +#| out-width: "100%" +xgboost_tuned |> + autoplot() +``` + + +## tidymodels - Best model + + +```{r} +xgboost_best_params <- xgboost_tuned |> + tune::select_best(metric = "rmse") + +knitr::kable(xgboost_best_params) +``` + + +## tidymodels - Final model + +```{r} +xgboost_model_final <- xgboost_model |> + finalize_model(xgboost_best_params) +xgboost_model_final +``` + + +## tidymodels - Train evaluation + + +```{r} +(train_processed <- bake(nla_recipe, new_data = nla_train)) +``` + +## tidymodels - Train data + +```{r} +train_prediction <- xgboost_model_final |> + # fit the model on all the training data + fit( + formula = nla_formula, + data = train_processed + ) |> + # predict the sale prices for the training data + predict(new_data = train_processed) |> + bind_cols( + nla_train |> + mutate(.obs = log10(cyanophyta)) + ) +xgboost_score_train <- + train_prediction |> + yardstick::metrics(.obs, .pred) |> + mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) +knitr::kable(xgboost_score_train) +``` + +## tidymodels - train evaluation + +```{r} +#| fig-width: 5 +#| fig-height: 3 +#| out-width: "80%" +train_prediction |> + ggplot(aes(.pred, .obs)) + + geom_point() + + geom_smooth(method = "lm") +``` + + +## tidymodels - test data + + +```{r} +test_processed <- bake(nla_recipe, new_data = nla_test) + +test_prediction <- xgboost_model_final |> + # fit the model on all the training data + fit( + formula = nla_formula, + data = train_processed + ) |> + # use the training model fit to predict the test data + predict(new_data = test_processed) |> + bind_cols( + nla_test |> + mutate(.obs = log10(cyanophyta)) + ) + +# measure the accuracy of our model using `yardstick` +xgboost_score <- test_prediction |> + yardstick::metrics(.obs, .pred) |> + mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) + +knitr::kable(xgboost_score) +``` + + +## tidymodels - evaluation + +```{r} +#| fig-width: 5 +#| fig-height: 3 +#| out-width: "80%" +cyanophyta_prediction_residual <- test_prediction |> + arrange(.pred) %>% + mutate(residual_pct = (.obs - .pred) / .pred) |> + select(.pred, residual_pct) + +cyanophyta_prediction_residual |> + ggplot(aes(x = .pred, y = residual_pct)) + + geom_point() + + xlab("Predicted Cyanophyta") + + ylab("Residual (%)") +``` + + + + +## tidymodels - test evaluation + +```{r} +#| fig-width: 5 +#| fig-height: 3 +#| out-width: "80%" +test_prediction |> + ggplot(aes(.pred, .obs)) + + geom_point() + + geom_smooth(method = "lm", colour = "black") +``` + + + diff --git a/coding/L7.qmd b/coding/L7.qmd index 2163bc3..fabd6b0 100644 --- a/coding/L7.qmd +++ b/coding/L7.qmd @@ -274,7 +274,6 @@ p4 <- datadf |> dplyr::filter(pH > 5) |> ggplot(aes(x = month, y = pH)) + geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) + - scale_x_continuous(breaks = 1:12, labels = month.abb) + theme(legend.position = "none") (p1 | p2) / (p3 | p4) + patchwork::plot_annotation(tag_levels = "A") diff --git a/coding/L9-1.pdf b/coding/L9-1.pdf new file mode 100644 index 0000000..1d83a6b Binary files /dev/null and b/coding/L9-1.pdf differ diff --git a/coding/L9.html b/coding/L9.html new file mode 100644 index 0000000..ccf7f75 --- /dev/null +++ b/coding/L9.html @@ -0,0 +1,1070 @@ + + + + + + + + + +L9 - Data Virtualization + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

L9 - Data Virtualization

+
+ + + +
+ + + + +
+ + + +
+ + +
+

Load Data

+
+
# getwd()
+# setwd("coding")
+require(tidyverse)
+
+
Loading required package: tidyverse
+
+
+
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+✔ dplyr     1.2.0     ✔ readr     2.1.6
+✔ forcats   1.0.0     ✔ stringr   1.6.0
+✔ ggplot2   4.0.2     ✔ tibble    3.3.1
+✔ lubridate 1.9.5     ✔ tidyr     1.3.2
+✔ purrr     1.2.1     
+── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+✖ dplyr::filter() masks stats::filter()
+✖ dplyr::lag()    masks stats::lag()
+ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+
+
datadf <- readRDS("../data/chinawq/datadf.rds")
+
+
+
+

Data view

+
+
skimr::skim(datadf)
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Data summary
Namedatadf
Number of rows75747
Number of columns21
_______________________
Column type frequency:
Date1
numeric20
________________________
Group variablesNone
+

Variable type: Date

+ +++++++++ + + + + + + + + + + + + + + + + + + + + + + +
skim_variablen_missingcomplete_rateminmaxmediann_unique
date011980-01-072022-08-012014-06-161405
+

Variable type: numeric

+ +++++++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
skim_variablen_missingcomplete_ratemeansdp0p25p50p75p100hist
lon01.00116.147.3279.98112.31117.19120.71134.67▁▁▃▇▁
lat01.0032.907.3716.5128.6532.6638.3253.48▃▆▇▅▁
NH4N212450.720.764.780.000.150.260.491000.00▇▁▁▁▁
CODMn213680.724.516.150.002.103.305.30241.00▇▁▁▁▁
DO27400.967.892.380.006.577.749.1693.20▇▁▁▁▁
pH12890.987.790.510.007.407.908.1210.30▁▁▁▇▁
BOD753150.011.792.810.100.601.022.0041.30▇▁▁▁▁
TSSs754180.00472.393192.680.5027.00100.00260.0051800.00▇▁▁▁▁
TEMP752270.0117.617.760.1011.2017.8024.3331.80▂▇▇▇▆
DOC757340.004.874.611.001.603.207.2014.50▇▁▂▁▂
NO3N753590.011.031.000.000.370.721.306.86▇▂▁▁▁
DIP573080.240.010.020.000.000.010.020.64▇▁▁▁▁
NO2N754130.000.040.060.000.000.010.040.60▇▁▁▁▁
TP755510.000.602.340.000.010.050.1825.61▇▁▁▁▁
DOSAT757160.0095.0714.2055.7088.8096.10105.50119.40▁▃▇▇▅
COD562390.261.060.980.030.600.861.2423.00▇▁▁▁▁
TDP757310.000.010.010.000.010.010.020.03▁▇▃▁▃
TOC757460.0076.70NA76.7076.7076.7076.7076.70▁▁▇▁▁
TPH588620.220.020.020.000.010.010.030.63▇▁▁▁▁
DIN563780.260.290.400.000.050.140.368.17▇▁▁▁▁
+
+
head(datadf)
+
+
# A tibble: 6 × 21
+    lon   lat date        NH4N CODMn    DO    pH   BOD  TSSs  TEMP   DOC  NO3N
+  <dbl> <dbl> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
+1  80.7  43.8 2012-05-28  0.07   2.9  6.04   7.9    NA    NA    NA    NA    NA
+2  80.7  43.8 2012-06-04  0.04   2.7  6.62   7.8    NA    NA    NA    NA    NA
+3  80.7  43.8 2012-06-11  0.02   1    6.39   7.8    NA    NA    NA    NA    NA
+4  80.7  43.8 2012-06-18  1.33   1.3  5.6    7.7    NA    NA    NA    NA    NA
+5  80.7  43.8 2012-06-25  2.66   3    4.63   7.7    NA    NA    NA    NA    NA
+6  80.7  43.8 2012-07-02  0.05   6.9  5.04   7.9    NA    NA    NA    NA    NA
+# ℹ 9 more variables: DIP <dbl>, NO2N <dbl>, TP <dbl>, DOSAT <dbl>, COD <dbl>,
+#   TDP <dbl>, TOC <dbl>, TPH <dbl>, DIN <dbl>
+
+
tail(datadf)
+
+
# A tibble: 6 × 21
+    lon   lat date        NH4N CODMn    DO    pH   BOD  TSSs  TEMP   DOC  NO3N
+  <dbl> <dbl> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
+1  122.  30.4 2022-07-01    NA    NA  6.71  8.01    NA    NA    NA    NA    NA
+2  122.  30.4 2020-04-01    NA    NA  8.68  8.07    NA    NA    NA    NA    NA
+3  122.  30.4 2020-07-01    NA    NA  6.7   8.03    NA    NA    NA    NA    NA
+4  122.  30.4 2021-04-01    NA    NA  7.4   7.95    NA    NA    NA    NA    NA
+5  122.  30.4 2020-10-01    NA    NA  7.67  8.07    NA    NA    NA    NA    NA
+6  122.  30.4 2021-10-01    NA    NA  7.19  8.26    NA    NA    NA    NA    NA
+# ℹ 9 more variables: DIP <dbl>, NO2N <dbl>, TP <dbl>, DOSAT <dbl>, COD <dbl>,
+#   TDP <dbl>, TOC <dbl>, TPH <dbl>, DIN <dbl>
+
+
names(datadf)
+
+
 [1] "lon"   "lat"   "date"  "NH4N"  "CODMn" "DO"    "pH"    "BOD"   "TSSs" 
+[10] "TEMP"  "DOC"   "NO3N"  "DIP"   "NO2N"  "TP"    "DOSAT" "COD"   "TDP"  
+[19] "TOC"   "TPH"   "DIN"  
+
+
summary(datadf)
+
+
      lon              lat             date                 NH4N         
+ Min.   : 79.98   Min.   :16.51   Min.   :1980-01-07   Min.   :   0.000  
+ 1st Qu.:112.31   1st Qu.:28.65   1st Qu.:2011-06-13   1st Qu.:   0.150  
+ Median :117.19   Median :32.66   Median :2014-06-16   Median :   0.260  
+ Mean   :116.14   Mean   :32.90   Mean   :2014-05-23   Mean   :   0.761  
+ 3rd Qu.:120.71   3rd Qu.:38.32   3rd Qu.:2018-04-23   3rd Qu.:   0.490  
+ Max.   :134.67   Max.   :53.48   Max.   :2022-08-01   Max.   :1000.000  
+                                                       NA's   :21245     
+     CODMn               DO               pH              BOD       
+ Min.   :  0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.10  
+ 1st Qu.:  2.100   1st Qu.: 6.570   1st Qu.: 7.400   1st Qu.: 0.60  
+ Median :  3.300   Median : 7.740   Median : 7.900   Median : 1.02  
+ Mean   :  4.511   Mean   : 7.891   Mean   : 7.788   Mean   : 1.79  
+ 3rd Qu.:  5.300   3rd Qu.: 9.160   3rd Qu.: 8.120   3rd Qu.: 2.00  
+ Max.   :241.000   Max.   :93.200   Max.   :10.300   Max.   :41.30  
+ NA's   :21368     NA's   :2740     NA's   :1289     NA's   :75315  
+      TSSs              TEMP            DOC             NO3N      
+ Min.   :    0.5   Min.   : 0.10   Min.   : 1.00   Min.   :0.00   
+ 1st Qu.:   27.0   1st Qu.:11.20   1st Qu.: 1.60   1st Qu.:0.37   
+ Median :  100.0   Median :17.80   Median : 3.20   Median :0.72   
+ Mean   :  472.4   Mean   :17.61   Mean   : 4.87   Mean   :1.03   
+ 3rd Qu.:  260.0   3rd Qu.:24.32   3rd Qu.: 7.20   3rd Qu.:1.30   
+ Max.   :51800.0   Max.   :31.80   Max.   :14.50   Max.   :6.86   
+ NA's   :75418     NA's   :75227   NA's   :75734   NA's   :75359  
+      DIP             NO2N             TP            DOSAT       
+ Min.   :0.00    Min.   :0.00    Min.   : 0.00   Min.   : 55.70  
+ 1st Qu.:0.00    1st Qu.:0.00    1st Qu.: 0.01   1st Qu.: 88.80  
+ Median :0.01    Median :0.01    Median : 0.05   Median : 96.10  
+ Mean   :0.01    Mean   :0.04    Mean   : 0.60   Mean   : 95.07  
+ 3rd Qu.:0.02    3rd Qu.:0.04    3rd Qu.: 0.18   3rd Qu.:105.50  
+ Max.   :0.64    Max.   :0.60    Max.   :25.61   Max.   :119.40  
+ NA's   :57308   NA's   :75413   NA's   :75551   NA's   :75716   
+      COD             TDP             TOC             TPH       
+ Min.   : 0.03   Min.   :0.00    Min.   :76.7    Min.   :0.00   
+ 1st Qu.: 0.60   1st Qu.:0.01    1st Qu.:76.7    1st Qu.:0.01   
+ Median : 0.86   Median :0.01    Median :76.7    Median :0.01   
+ Mean   : 1.06   Mean   :0.01    Mean   :76.7    Mean   :0.02   
+ 3rd Qu.: 1.24   3rd Qu.:0.02    3rd Qu.:76.7    3rd Qu.:0.03   
+ Max.   :23.00   Max.   :0.03    Max.   :76.7    Max.   :0.63   
+ NA's   :56239   NA's   :75731   NA's   :75746   NA's   :58862  
+      DIN       
+ Min.   :0.00   
+ 1st Qu.:0.05   
+ Median :0.14   
+ Mean   :0.29   
+ 3rd Qu.:0.36   
+ Max.   :8.17   
+ NA's   :56378  
+
+
str(datadf)
+
+
tibble [75,747 × 21] (S3: tbl_df/tbl/data.frame)
+ $ lon  : num [1:75747] 80.7 80.7 80.7 80.7 80.7 ...
+ $ lat  : num [1:75747] 43.8 43.8 43.8 43.8 43.8 ...
+ $ date : Date[1:75747], format: "2012-05-28" "2012-06-04" ...
+ $ NH4N : num [1:75747] 0.07 0.04 0.02 1.33 2.66 0.05 0.06 0.07 0.01 0.07 ...
+ $ CODMn: num [1:75747] 2.9 2.7 1 1.3 3 6.9 4.6 10.2 3.7 6.6 ...
+ $ DO   : num [1:75747] 6.04 6.62 6.39 5.6 4.63 5.04 5.07 5.24 NA 6.67 ...
+ $ pH   : num [1:75747] 7.9 7.8 7.8 7.7 7.7 7.9 8 7.9 8.1 8 ...
+ $ BOD  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TSSs : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TEMP : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ DOC  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ NO3N : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ DIP  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ NO2N : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TP   : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ DOSAT: num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ COD  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TDP  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TOC  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ TPH  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+ $ DIN  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
+
+
+
+
+

Plot

+
+
# 每月NH4N与pH的相关性散点图
+p <- datadf |>
+  dplyr::filter(NH4N < 100) |>
+  dplyr::filter(between(year(date), 2016, 2019)) |>
+  mutate(month = month(date)) |>
+  ggplot(aes(CODMn, NH4N)) +
+  geom_point(shape = 21, size = 0.8, fill = "orange") +
+  geom_smooth(method = "lm") +
+  scale_x_log10() +
+  scale_y_log10() +
+  facet_wrap(~month, scale = "free", ncol = 4)
+
+plotly::ggplotly(p)
+
+
+ +
+
+
+
+

Plotly

+
+
p <- datadf |>
+  dplyr::filter(year(date) == 2018) |>
+  ggplot(aes(date, NH4N)) +
+  geom_point()
+
+ggsave("L9-1.pdf", width = 4, height = 3)
+
+
Warning: Removed 2667 rows containing missing values or values outside the scale range
+(`geom_point()`).
+
+
# install.packages("plotly")
+plotly::ggplotly(p)
+
+
+ +
+
+
+ +
+ + +
+ + + + + \ No newline at end of file diff --git a/coding/L9.qmd b/coding/L9.qmd new file mode 100644 index 0000000..f8ed383 --- /dev/null +++ b/coding/L9.qmd @@ -0,0 +1,202 @@ +--- +title: "L9 - Data Virtualization" +format: html +editor: visual +--- + +# Load Data + +```{r} +# getwd() +# setwd("coding") +require(tidyverse) +datadf <- readRDS("../data/chinawq/datadf.rds") +``` + +# Data view + +```{r} +skimr::skim(datadf) + +head(datadf) + +tail(datadf) + +names(datadf) + +summary(datadf) + +str(datadf) +``` + +# Plot + +```{r} +#| warning: false +#| message: false +# 每月NH4N与pH的相关性散点图 +p <- datadf |> + dplyr::filter(NH4N < 100) |> + dplyr::filter(between(year(date), 2016, 2019)) |> + mutate(month = month(date)) |> + ggplot(aes(CODMn, NH4N)) + + geom_point(shape = 21, size = 0.8, fill = "orange") + + geom_smooth(method = "gam", color = "red") + + scale_x_log10() + + scale_y_log10() + + labs( + x = "CODMn (mg L-1)", + y = "Ammonia (mg L-1)" + ) + + facet_wrap(~month, scale = "free", ncol = 4) + + theme( + axis.title.x = ggtext::element_markdown(), + axis.title.y = ggtext::element_markdown() + ) +print(p) + +plotly::ggplotly(p) +``` + +```{r} +#| warning: false +#| message: false +# 每月NH4N与pH的相关性散点图 +p <- datadf |> + dplyr::filter(NH4N < 100) |> + dplyr::filter(between(year(date), 2016, 2019)) |> + mutate(month = month(date)) |> + ggplot(aes(factor(month), NH4N)) + + geom_jitter(size = 0.1, colour = "gray95") + + geom_violin(fill = "orange", alpha = 0.3) + + scale_y_log10() + + labs(x = "Month", y = "Ammonia (mg L-1)") + + theme_classic() + + theme(axis.title.y = ggtext::element_markdown()) +print(p) + +plotly::ggplotly(p) +``` + +# Plotly + +```{r} + +p <- datadf |> + dplyr::filter(year(date) == 2018) |> + ggplot(aes(date, NH4N)) + + geom_point() + +ggsave("L9-1.pdf", width = 4, height = 3) + +# install.packages("plotly") +plotly::ggplotly(p) +``` + + +# Map - sf + +```{r} +# install.packages("sf") +# install.packages("sfext") + +require(sf) +require(sfext) + +chinawqsf <- sf::read_sf( + "../data/chinawq/Monitoring_sites/Monitoring_sites.shp" +) + +chinawqsf |> + ggplot() + + geom_sf(shape = 21, size = 1, fill = "orange") + +chinamapsf <- sf::read_sf("../data/中国省级地图GS(2019)1719号.geojson") +ninelinesf <- sf::read_sf("../data/九段线GS(2019)1719号.geojson") + +chinamapsf <- sf::read_sf( + "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS(2019)1719号.geojson" +) +ninelinesf <- sf::read_sf( + "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS(2019)1719号.geojson" +) + +chinacrs <- "+proj=laea +lat_0=40 +lon_0=104" + +mapeR::get_chinacrs() + +chinawqsf |> + ggplot() + + geom_sf(data = chinamapsf) + + geom_sf(data = ninelinesf) + + geom_sf(shape = 21, size = 1, fill = "orange") + + coord_sf(datum = chinacrs) +``` + + +```{r} +require(mapeR) +map_chinese() +``` + +```{r} +chinamapsf <- sf::read_sf("../data/中国省级地图GS(2019)1719号.geojson") +ninelinesf <- sf::read_sf("../data/九段线GS(2019)1719号.geojson") + +chinamapsf <- sf::read_sf( + "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS(2019)1719号.geojson" +) +ninelinesf <- sf::read_sf( + "https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS(2019)1719号.geojson" +) +chinacrs <- "+proj=laea +lat_0=40 +lon_0=104" + +# 安装 +install.packages("ggspatial") + +datadf |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> + sf::st_transform(crs = chinacrs) |> + dplyr::filter(!is.na(CODMn)) |> + ggplot() + + geom_sf(data = chinamapsf, aes(fill = CNAME), alpha = 0.2) + + geom_sf(data = ninelinesf) + + geom_sf(aes(colour = log1p(CODMn))) + + labs(x = NULL, y = NULL, fill = NULL, colour = "CODMn") + + scale_colour_viridis_c() + + ggspatial::annotation_north_arrow(location = "tl") + + ggspatial::annotation_scale(location = "bl") + + theme(legend.position = "none") + + +mapeR::map_chinese +``` + +# 具体某个湖泊 + +```{r} +sf::read_sf("../data/taihu.shp") |> + ggplot() + + geom_sf() + + +sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml") + +st_layers("~/Desktop/qingcaosha.kml") + +sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml") +boundsf <- sf::read_sf("~/Desktop/qingcaosha.kml", layer = "qingcaosha") + +ggplot() + + geom_sf(data = boundsf) + + geom_sf(data = sitesf) + + geom_sf_text(data = sitesf, aes(label = Name), vjust = 1, size = 3) +``` + + + + + + + + diff --git a/coding/xgboost_tuned.RDS b/coding/xgboost_tuned.RDS new file mode 100644 index 0000000..5d0fe6a Binary files /dev/null and b/coding/xgboost_tuned.RDS differ