su2026rwep/coding/L10/main.qmd

# 实践部分


## Kriging

### Code

```{r}
#| echo: true
#| eval: false
#| out-width: 50%
require(sf)
require(sp)
require(gstat)
require(raster)
require(ggplot2)
# 读取太湖边界
lake_boundary <- st_read("../../data/taihu.shp") |>
  sf::st_make_valid()
main_water <- st_difference(
  lake_boundary[lake_boundary$Name == "boundary", ],
  st_union(lake_boundary[lake_boundary$Name != "boundary", ])
)

get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
  insf <- indf |>
    sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
    dplyr::select(-c("long", "lat")) |>
    dplyr::rename(Y = tidyselect::all_of(Yvarname))
  if (is.null(grid)) {
    # 1. 将sf边界转为terra格式
    v <- terra::vect(boundsf)
    # 2. 创建基础网格
    base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
    # 3. 创建高分辨率区域（例如边界附近）
    buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
    hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
    # 4. 合并网格
    final_grid <- merge(base_grid, hi_res_grid)
    # 5. 转为点并裁剪
    grid <- terra::as.points(final_grid) |>
      sf::st_as_sf() |>
      sf::st_filter(main_water)
  }
  insp <- sf::as_Spatial(insf)
  fit <- automap::autofitVariogram(Y ~ 1, insp)
  # 克里金插值
  m <- gstat::gstat(
    formula = Y ~ 1,
    data = insp,
    model = fit$var_model
  )
  predsf <- predict(m, grid) |>
    sf::st_as_sf()
  outsf <- predsf |>
    st_coordinates() |>
    as.data.frame() |>
    cbind(pred = predsf$var1.pred)
  return(outsf)
}


wqdf <- readxl::read_xlsx("../data/wqdata.xlsx")

if (FALSE) {
  wqsf <- wqdf |>
    nest(datedf = -date) |>
    dplyr::mutate(
      krigingsf = purrr::map(datedf, \(x) {
        get_kriging(x, "conc", boundsf = main_water)
      })
    )
  saveRDS(wqsf, "./wqsf.rds")
}

wqsf <- readRDS("./wqsf.rds")

wqsf |>
  unnest(krigingsf) |>
  ggplot(aes(X, Y)) +
  geom_contour_filled(aes(z = pred), bins = 20) +
  geom_sf(
    data = main_water,
    aes(x = NULL, y = NULL),
    fill = NA,
    colour = "black",
    linewidth = 1
  ) +
  # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
  # ggsci::scale_fill_aaas() +
  coord_sf() +
  theme_void()
```

### Output

```{r}
#| echo: false
#| out-width: 80%
#| message: false
#| fig-width: 6
#| fig-height: 3
require(sf)
require(sp)
require(gstat)
require(raster)
require(ggplot2)
# 读取太湖边界
lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |>
  sf::st_make_valid()
main_water <- st_difference(
  lake_boundary[lake_boundary$Name == "boundary", ],
  st_union(lake_boundary[lake_boundary$Name != "boundary", ])
)

get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
  insf <- indf |>
    sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
    dplyr::select(-c("long", "lat")) |>
    dplyr::rename(Y = tidyselect::all_of(Yvarname))
  if (is.null(grid)) {
    # 1. 将sf边界转为terra格式
    v <- terra::vect(boundsf)
    # 2. 创建基础网格
    base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
    # 3. 创建高分辨率区域（例如边界附近）
    buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
    hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
    # 4. 合并网格
    final_grid <- merge(base_grid, hi_res_grid)
    # 5. 转为点并裁剪
    grid <- terra::as.points(final_grid) |>
      sf::st_as_sf() |>
      sf::st_filter(main_water)
  }
  insp <- sf::as_Spatial(insf)
  fit <- automap::autofitVariogram(Y ~ 1, insp)
  # 克里金插值
  m <- gstat::gstat(
    formula = Y ~ 1,
    data = insp,
    model = fit$var_model
  )
  predsf <- predict(m, grid) |>
    sf::st_as_sf()
  outsf <- predsf |>
    st_coordinates() |>
    as.data.frame() |>
    cbind(pred = predsf$var1.pred)
  return(outsf)
}


wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx")

if (FALSE) {
  wqsf <- wqdf |>
    nest(datedf = -date) |>
    dplyr::mutate(
      krigingsf = purrr::map(datedf, \(x) {
        get_kriging(x, "conc", boundsf = main_water)
      })
    )
  saveRDS(wqsf, "./wqsf.rds")
}
wqsf <- readRDS("./wqsf.rds")

wqsf |>
  unnest(krigingsf) |>
  ggplot(aes(X, Y)) +
  geom_contour_filled(aes(z = pred), bins = 20) +
  geom_sf(
    data = main_water,
    aes(x = NULL, y = NULL),
    fill = NA,
    colour = "black",
    linewidth = 1
  ) +
  # scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
  # ggsci::scale_fill_aaas() +
  coord_sf() +
  theme_void()
```

:::


## 数据

```{r}
require(tidyverse)
sitedf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
) |>
  select(
    SITE_ID,
    lon = LON_DD,
    lat = LAT_DD,
    name = LAKENAME,
    area = LAKEAREA,
    zmax = DEPTHMAX
  ) |>
  group_by(SITE_ID) |>
  summarize(
    lon = mean(lon, na.rm = TRUE),
    lat = mean(lat, na.rm = TRUE),
    name = unique(name),
    area = mean(area, na.rm = TRUE),
    zmax = mean(zmax, na.rm = TRUE)
  )


visitdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
  select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
  distinct()


waterchemdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
  select(
    SITE_ID,
    date = DATE_PROFILE,
    depth = DEPTH,
    temp = TEMP_FIELD,
    do = DO_FIELD,
    ph = PH_FIELD,
    cond = COND_FIELD,
  )

sddf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
) |>
  select(
    SITE_ID,
    date = DATE_SECCHI,
    sd = SECMEAN,
    clear_to_bottom = CLEAR_TO_BOTTOM
  )

trophicdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
) |>
  select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
  left_join(visitdf, by = c("SITE_ID", "visit")) |>
  select(-year, -visit) |>
  group_by(SITE_ID, date) |>
  summarize(
    tp = mean(tp, na.rm = TRUE),
    tn = mean(tn, na.rm = TRUE),
    chla = mean(chla, na.rm = TRUE)
  )


phytodf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
) |>
  select(
    SITE_ID,
    date = DATEPHYT,
    depth = SAMPLE_DEPTH,
    phyta = DIVISION,
    genus = GENUS,
    species = SPECIES,
    tax = TAXANAME,
    abund = ABUND
  ) |>
  mutate(phyta = gsub(" .*$", "", phyta)) |>
  filter(!is.na(genus)) |>
  group_by(SITE_ID, date, depth, phyta, genus) |>
  summarize(abund = sum(abund, na.rm = TRUE)) |>
  nest(phytodf = -c(SITE_ID, date))

envdf <- waterchemdf |>
  filter(depth < 2) |>
  select(-depth) |>
  group_by(SITE_ID, date) |>
  summarise_all(~ mean(., na.rm = TRUE)) |>
  ungroup() |>
  left_join(sddf, by = c("SITE_ID", "date")) |>
  left_join(trophicdf, by = c("SITE_ID", "date"))

nla <- envdf |>
  left_join(phytodf) |>
  left_join(sitedf, by = "SITE_ID") |>
  filter(!purrr::map_lgl(phytodf, is.null)) |>
  mutate(
    cyanophyta = purrr::map(
      phytodf,
      ~ .x |>
        dplyr::filter(phyta == "Cyanophyta") |>
        summarize(cyanophyta = sum(abund, na.rm = TRUE))
    )
  ) |>
  unnest(cyanophyta) |>
  select(-phyta) |>
  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))

# library(rmdify)
# library(dwfun)
# dwfun::init()
```


## 数据

```{r}
skimr::skim(nla)
```


## 简单模型

```{r}
nla |>
  filter(tp > 1) |>
  ggplot(aes(tn, tp)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  )

m1 <- lm(log10(tp) ~ log10(tn), data = nla)

summary(m1)
```

## 复杂指标

```{r}
nla |>
  filter(tp > 1) |>
  ggplot(aes(tp, cyanophyta)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ) +
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  )

m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)

summary(m2)
```


## tidymodels - Data split

```{r}
require(tidymodels)
(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax))
(nla_train <- training(nla_split))
(nla_test <- testing(nla_split))
```

## tidymodels - recipe

```{r}
nla_formula <- as.formula(
  "cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom"
)
# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
  recipes::step_string2factor(all_nominal()) |>
  recipes::step_nzv(all_nominal()) |>
  recipes::step_log(chla, cyanophyta, base = 10) |>
  recipes::step_normalize(all_numeric_predictors()) |>
  prep()
nla_recipe
```

## tidymodels - cross validation

```{r}
nla_cv <- recipes::bake(
  nla_recipe,
  new_data = training(nla_split)
) |>
  rsample::vfold_cv(v = 10)
nla_cv
```

## tidymodels - Model specification

```{r}
xgboost_model <- parsnip::boost_tree(
  mode = "regression",
  trees = 1000,
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune()
) |>
  set_engine("xgboost", objective = "reg:squarederror")
xgboost_model
```


## tidymodels - Grid specification

```{r}
# grid specification
xgboost_params <- dials::parameters(
  min_n(),
  tree_depth(),
  learn_rate(),
  loss_reduction()
)
xgboost_params
```

## tidymodels - Grid specification

```{r}
xgboost_grid <- dials::grid_max_entropy(
  xgboost_params,
  size = 60
)
knitr::kable(head(xgboost_grid))
```

## tidymodels - Workflow

```{r}
xgboost_wf <- workflows::workflow() |>
  add_model(xgboost_model) |>
  add_formula(nla_formula)
xgboost_wf
```


## tidymodels - Tune

```{r}
#| cache: true
# hyperparameter tuning
if (FALSE) {
  xgboost_tuned <- tune::tune_grid(
    object = xgboost_wf,
    resamples = nla_cv,
    grid = xgboost_grid,
    metrics = yardstick::metric_set(rmse, rsq, mae),
    control = tune::control_grid(verbose = TRUE)
  )
  saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
}
xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
```

## tidymodels - Best model

```{r}
xgboost_tuned |>
  tune::show_best(metric = "rmse") |>
  knitr::kable()
```


## tidymodels - Best model

```{r}
xgboost_tuned |>
  collect_metrics()
```


## tidymodels - Best model

```{r}
#| fig-width: 9
#| fig-height: 5
#| out-width: "100%"
xgboost_tuned |>
  autoplot()
```


## tidymodels - Best model


```{r}
xgboost_best_params <- xgboost_tuned |>
  tune::select_best(metric = "rmse")

knitr::kable(xgboost_best_params)
```


## tidymodels - Final model

```{r}
xgboost_model_final <- xgboost_model |>
  finalize_model(xgboost_best_params)
xgboost_model_final
```


## tidymodels - Train evaluation


```{r}
(train_processed <- bake(nla_recipe, new_data = nla_train))
```

## tidymodels - Train data

```{r}
train_prediction <- xgboost_model_final |>
  # fit the model on all the training data
  fit(
    formula = nla_formula,
    data = train_processed
  ) |>
  # predict the sale prices for the training data
  predict(new_data = train_processed) |>
  bind_cols(
    nla_train |>
      mutate(.obs = log10(cyanophyta))
  )
xgboost_score_train <-
  train_prediction |>
  yardstick::metrics(.obs, .pred) |>
  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
knitr::kable(xgboost_score_train)
```

## tidymodels - train evaluation

```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
train_prediction |>
  ggplot(aes(.pred, .obs)) +
  geom_point() +
  geom_smooth(method = "lm")
```


## tidymodels - test data


```{r}
test_processed <- bake(nla_recipe, new_data = nla_test)

test_prediction <- xgboost_model_final |>
  # fit the model on all the training data
  fit(
    formula = nla_formula,
    data = train_processed
  ) |>
  # use the training model fit to predict the test data
  predict(new_data = test_processed) |>
  bind_cols(
    nla_test |>
      mutate(.obs = log10(cyanophyta))
  )

# measure the accuracy of our model using `yardstick`
xgboost_score <- test_prediction |>
  yardstick::metrics(.obs, .pred) |>
  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))

knitr::kable(xgboost_score)
```


## tidymodels - evaluation

```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
cyanophyta_prediction_residual <- test_prediction |>
  arrange(.pred) %>%
  mutate(residual_pct = (.obs - .pred) / .pred) |>
  select(.pred, residual_pct)

cyanophyta_prediction_residual |>
  ggplot(aes(x = .pred, y = residual_pct)) +
  geom_point() +
  xlab("Predicted Cyanophyta") +
  ylab("Residual (%)")
```


## tidymodels - test evaluation

```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
test_prediction |>
  ggplot(aes(.pred, .obs)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "black")
```