# 实践部分 ## Kriging ### Code ```{r} #| echo: true #| eval: false #| out-width: 50% require(sf) require(sp) require(gstat) require(raster) require(ggplot2) # 读取太湖边界 lake_boundary <- st_read("../../data/taihu.shp") |> sf::st_make_valid() main_water <- st_difference( lake_boundary[lake_boundary$Name == "boundary", ], st_union(lake_boundary[lake_boundary$Name != "boundary", ]) ) get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) { insf <- indf |> sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |> dplyr::select(-c("long", "lat")) |> dplyr::rename(Y = tidyselect::all_of(Yvarname)) if (is.null(grid)) { # 1. 将sf边界转为terra格式 v <- terra::vect(boundsf) # 2. 创建基础网格 base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率 # 3. 创建高分辨率区域(例如边界附近) buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区 hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001) # 4. 合并网格 final_grid <- merge(base_grid, hi_res_grid) # 5. 转为点并裁剪 grid <- terra::as.points(final_grid) |> sf::st_as_sf() |> sf::st_filter(main_water) } insp <- sf::as_Spatial(insf) fit <- automap::autofitVariogram(Y ~ 1, insp) # 克里金插值 m <- gstat::gstat( formula = Y ~ 1, data = insp, model = fit$var_model ) predsf <- predict(m, grid) |> sf::st_as_sf() outsf <- predsf |> st_coordinates() |> as.data.frame() |> cbind(pred = predsf$var1.pred) return(outsf) } wqdf <- readxl::read_xlsx("../data/wqdata.xlsx") if (FALSE) { wqsf <- wqdf |> nest(datedf = -date) |> dplyr::mutate( krigingsf = purrr::map(datedf, \(x) { get_kriging(x, "conc", boundsf = main_water) }) ) saveRDS(wqsf, "./wqsf.rds") } wqsf <- readRDS("./wqsf.rds") wqsf |> unnest(krigingsf) |> ggplot(aes(X, Y)) + geom_contour_filled(aes(z = pred), bins = 20) + geom_sf( data = main_water, aes(x = NULL, y = NULL), fill = NA, colour = "black", linewidth = 1 ) + # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变 # ggsci::scale_fill_aaas() + coord_sf() + theme_void() ``` ### Output ```{r} #| echo: false #| out-width: 80% #| message: false #| fig-width: 6 #| fig-height: 3 require(sf) require(sp) require(gstat) require(raster) require(ggplot2) # 读取太湖边界 lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |> sf::st_make_valid() main_water <- st_difference( lake_boundary[lake_boundary$Name == "boundary", ], st_union(lake_boundary[lake_boundary$Name != "boundary", ]) ) get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) { insf <- indf |> sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |> dplyr::select(-c("long", "lat")) |> dplyr::rename(Y = tidyselect::all_of(Yvarname)) if (is.null(grid)) { # 1. 将sf边界转为terra格式 v <- terra::vect(boundsf) # 2. 创建基础网格 base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率 # 3. 创建高分辨率区域(例如边界附近) buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区 hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001) # 4. 合并网格 final_grid <- merge(base_grid, hi_res_grid) # 5. 转为点并裁剪 grid <- terra::as.points(final_grid) |> sf::st_as_sf() |> sf::st_filter(main_water) } insp <- sf::as_Spatial(insf) fit <- automap::autofitVariogram(Y ~ 1, insp) # 克里金插值 m <- gstat::gstat( formula = Y ~ 1, data = insp, model = fit$var_model ) predsf <- predict(m, grid) |> sf::st_as_sf() outsf <- predsf |> st_coordinates() |> as.data.frame() |> cbind(pred = predsf$var1.pred) return(outsf) } wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx") if (FALSE) { wqsf <- wqdf |> nest(datedf = -date) |> dplyr::mutate( krigingsf = purrr::map(datedf, \(x) { get_kriging(x, "conc", boundsf = main_water) }) ) saveRDS(wqsf, "./wqsf.rds") } wqsf <- readRDS("./wqsf.rds") wqsf |> unnest(krigingsf) |> ggplot(aes(X, Y)) + geom_contour_filled(aes(z = pred), bins = 20) + geom_sf( data = main_water, aes(x = NULL, y = NULL), fill = NA, colour = "black", linewidth = 1 ) + # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变 # ggsci::scale_fill_aaas() + coord_sf() + theme_void() ``` ::: ## 数据 ```{r} require(tidyverse) sitedf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv" ) |> select( SITE_ID, lon = LON_DD, lat = LAT_DD, name = LAKENAME, area = LAKEAREA, zmax = DEPTHMAX ) |> group_by(SITE_ID) |> summarize( lon = mean(lon, na.rm = TRUE), lat = mean(lat, na.rm = TRUE), name = unique(name), area = mean(area, na.rm = TRUE), zmax = mean(zmax, na.rm = TRUE) ) visitdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" ) |> select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |> distinct() waterchemdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" ) |> select( SITE_ID, date = DATE_PROFILE, depth = DEPTH, temp = TEMP_FIELD, do = DO_FIELD, ph = PH_FIELD, cond = COND_FIELD, ) sddf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv" ) |> select( SITE_ID, date = DATE_SECCHI, sd = SECMEAN, clear_to_bottom = CLEAR_TO_BOTTOM ) trophicdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv" ) |> select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |> left_join(visitdf, by = c("SITE_ID", "visit")) |> select(-year, -visit) |> group_by(SITE_ID, date) |> summarize( tp = mean(tp, na.rm = TRUE), tn = mean(tn, na.rm = TRUE), chla = mean(chla, na.rm = TRUE) ) phytodf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv" ) |> select( SITE_ID, date = DATEPHYT, depth = SAMPLE_DEPTH, phyta = DIVISION, genus = GENUS, species = SPECIES, tax = TAXANAME, abund = ABUND ) |> mutate(phyta = gsub(" .*$", "", phyta)) |> filter(!is.na(genus)) |> group_by(SITE_ID, date, depth, phyta, genus) |> summarize(abund = sum(abund, na.rm = TRUE)) |> nest(phytodf = -c(SITE_ID, date)) envdf <- waterchemdf |> filter(depth < 2) |> select(-depth) |> group_by(SITE_ID, date) |> summarise_all(~ mean(., na.rm = TRUE)) |> ungroup() |> left_join(sddf, by = c("SITE_ID", "date")) |> left_join(trophicdf, by = c("SITE_ID", "date")) nla <- envdf |> left_join(phytodf) |> left_join(sitedf, by = "SITE_ID") |> filter(!purrr::map_lgl(phytodf, is.null)) |> mutate( cyanophyta = purrr::map( phytodf, ~ .x |> dplyr::filter(phyta == "Cyanophyta") |> summarize(cyanophyta = sum(abund, na.rm = TRUE)) ) ) |> unnest(cyanophyta) |> select(-phyta) |> mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE)) # library(rmdify) # library(dwfun) # dwfun::init() ``` ## 数据 ```{r} skimr::skim(nla) ``` ## 简单模型 ```{r} nla |> filter(tp > 1) |> ggplot(aes(tn, tp)) + geom_point() + geom_smooth(method = "lm") + scale_x_log10( breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x)) ) + scale_y_log10( breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x)) ) m1 <- lm(log10(tp) ~ log10(tn), data = nla) summary(m1) ``` ## 复杂指标 ```{r} nla |> filter(tp > 1) |> ggplot(aes(tp, cyanophyta)) + geom_point() + geom_smooth(method = "lm") + scale_x_log10( breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x)) ) + scale_y_log10( breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x)) ) m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla) summary(m2) ``` ## tidymodels - Data split ```{r} require(tidymodels) (nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax)) (nla_train <- training(nla_split)) (nla_test <- testing(nla_split)) ``` ## tidymodels - recipe ```{r} nla_formula <- as.formula( "cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom" ) # nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn") nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |> recipes::step_string2factor(all_nominal()) |> recipes::step_nzv(all_nominal()) |> recipes::step_log(chla, cyanophyta, base = 10) |> recipes::step_normalize(all_numeric_predictors()) |> prep() nla_recipe ``` ## tidymodels - cross validation ```{r} nla_cv <- recipes::bake( nla_recipe, new_data = training(nla_split) ) |> rsample::vfold_cv(v = 10) nla_cv ``` ## tidymodels - Model specification ```{r} xgboost_model <- parsnip::boost_tree( mode = "regression", trees = 1000, min_n = tune(), tree_depth = tune(), learn_rate = tune(), loss_reduction = tune() ) |> set_engine("xgboost", objective = "reg:squarederror") xgboost_model ``` ## tidymodels - Grid specification ```{r} # grid specification xgboost_params <- dials::parameters( min_n(), tree_depth(), learn_rate(), loss_reduction() ) xgboost_params ``` ## tidymodels - Grid specification ```{r} xgboost_grid <- dials::grid_max_entropy( xgboost_params, size = 60 ) knitr::kable(head(xgboost_grid)) ``` ## tidymodels - Workflow ```{r} xgboost_wf <- workflows::workflow() |> add_model(xgboost_model) |> add_formula(nla_formula) xgboost_wf ``` ## tidymodels - Tune ```{r} #| cache: true # hyperparameter tuning if (FALSE) { xgboost_tuned <- tune::tune_grid( object = xgboost_wf, resamples = nla_cv, grid = xgboost_grid, metrics = yardstick::metric_set(rmse, rsq, mae), control = tune::control_grid(verbose = TRUE) ) saveRDS(xgboost_tuned, "./xgboost_tuned.RDS") } xgboost_tuned <- readRDS("./xgboost_tuned.RDS") ``` ## tidymodels - Best model ```{r} xgboost_tuned |> tune::show_best(metric = "rmse") |> knitr::kable() ``` ## tidymodels - Best model ```{r} xgboost_tuned |> collect_metrics() ``` ## tidymodels - Best model ```{r} #| fig-width: 9 #| fig-height: 5 #| out-width: "100%" xgboost_tuned |> autoplot() ``` ## tidymodels - Best model ```{r} xgboost_best_params <- xgboost_tuned |> tune::select_best(metric = "rmse") knitr::kable(xgboost_best_params) ``` ## tidymodels - Final model ```{r} xgboost_model_final <- xgboost_model |> finalize_model(xgboost_best_params) xgboost_model_final ``` ## tidymodels - Train evaluation ```{r} (train_processed <- bake(nla_recipe, new_data = nla_train)) ``` ## tidymodels - Train data ```{r} train_prediction <- xgboost_model_final |> # fit the model on all the training data fit( formula = nla_formula, data = train_processed ) |> # predict the sale prices for the training data predict(new_data = train_processed) |> bind_cols( nla_train |> mutate(.obs = log10(cyanophyta)) ) xgboost_score_train <- train_prediction |> yardstick::metrics(.obs, .pred) |> mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) knitr::kable(xgboost_score_train) ``` ## tidymodels - train evaluation ```{r} #| fig-width: 5 #| fig-height: 3 #| out-width: "80%" train_prediction |> ggplot(aes(.pred, .obs)) + geom_point() + geom_smooth(method = "lm") ``` ## tidymodels - test data ```{r} test_processed <- bake(nla_recipe, new_data = nla_test) test_prediction <- xgboost_model_final |> # fit the model on all the training data fit( formula = nla_formula, data = train_processed ) |> # use the training model fit to predict the test data predict(new_data = test_processed) |> bind_cols( nla_test |> mutate(.obs = log10(cyanophyta)) ) # measure the accuracy of our model using `yardstick` xgboost_score <- test_prediction |> yardstick::metrics(.obs, .pred) |> mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) knitr::kable(xgboost_score) ``` ## tidymodels - evaluation ```{r} #| fig-width: 5 #| fig-height: 3 #| out-width: "80%" cyanophyta_prediction_residual <- test_prediction |> arrange(.pred) %>% mutate(residual_pct = (.obs - .pred) / .pred) |> select(.pred, residual_pct) cyanophyta_prediction_residual |> ggplot(aes(x = .pred, y = residual_pct)) + geom_point() + xlab("Predicted Cyanophyta") + ylab("Residual (%)") ``` ## tidymodels - test evaluation ```{r} #| fig-width: 5 #| fig-height: 3 #| out-width: "80%" test_prediction |> ggplot(aes(.pred, .obs)) + geom_point() + geom_smooth(method = "lm", colour = "black") ```