update

2025-03-17 19:20:50 +08:00 · 2025-03-17 19:20:50 +08:00 · 2db87d5e89
parent 1532a8982d
commit 2db87d5e89
13 changed files with 803 additions and 1 deletions
--- a/SD/20240328_9_课后作业/.RData
+++ b/SD/20240328_9_课后作业/.RData
--- a/SD/20240328_9_课后作业/airqualitydf.RDS
+++ b/SD/20240328_9_课后作业/airqualitydf.RDS
--- a/SD/20240328_9_课后作业/metadf.RDS
+++ b/SD/20240328_9_课后作业/metadf.RDS
--- a/SD/20240328_9_课后作业/testdf.RDS
+++ b/SD/20240328_9_课后作业/testdf.RDS
--- a/SD/20240409_1_model/xgboost_tuned.RDS
+++ b/SD/20240409_1_model/xgboost_tuned.RDS
--- a/coding/Untitled.qmd
+++ b/coding/Untitled.qmd
@ -0,0 +1,398 @@
+---
+title: "第10次课"
+editor: visual
+---
+
+```{r}
+require(tidymodels)
+taxi
+set.seed(123)
+taxi_split <- initial_split(taxi, prop = 0.8)
+taxi_train <- training(taxi_split)
+taxi_test <- testing(taxi_split)
+
+nrow(taxi_train)
+```
+```{r}
+tree_spec <-
+  decision_tree(cost_complexity = 0.002) %>% 
+  set_mode("classification")
+
+tree_fit <-
+  workflow(tip ~ ., tree_spec) %>% 
+  fit(data = taxi_train) 
+taxi_test |> 
+  bind_cols(
+predict(tree_fit, new_data = taxi_test))
+  
+```
+
+```{r}
+require(magrittr)
+# %>%: magrittr
+# |>: base
+taxi_folds <- vfold_cv(taxi_train, v = 5)
+tree_spec <-
+  decision_tree(cost_complexity = 0.002) |> 
+  set_mode("classification")
+taxi_wflow <- workflow() |>
+  add_formula(tip ~ .) |>
+  add_model(tree_spec)
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
+taxi_res
+taxi_metrics <- taxi_res |>
+  collect_metrics()
+
+
+# Save the assessment set results
+ctrl_taxi <- control_resamples(save_pred = TRUE)
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
+
+taxi_res |>
+  collect_predictions()
+
+taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
+
+taxi_preds |>
+  group_by(id) |>
+  taxi_metrics(truth = tip, estimate = .pred_class)
+
+```
+
+
+## 数据
+
+```{r}
+require(tidyverse)
+sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |>
+  select(SITE_ID,
+    lon = LON_DD, 
+    lat = LAT_DD, 
+    name = LAKENAME, 
+    area = LAKEAREA, 
+    zmax = DEPTHMAX
+    ) |>
+  group_by(SITE_ID) |>
+  summarize(lon = mean(lon, na.rm = TRUE),
+    lat = mean(lat, na.rm = TRUE),
+    name = unique(name),
+    area = mean(area, na.rm = TRUE),
+    zmax = mean(zmax, na.rm = TRUE))
+
+
+visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
+  select(SITE_ID,
+    date = DATE_PROFILE,
+    year = YEAR,
+    visit = VISIT_NO
+  ) |>
+  distinct()
+
+
+
+waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
+  select(SITE_ID,
+    date = DATE_PROFILE,
+    depth = DEPTH,
+    temp = TEMP_FIELD,
+    do = DO_FIELD,
+    ph = PH_FIELD,
+    cond = COND_FIELD,
+  )
+
+sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |>
+  select(SITE_ID, 
+    date = DATE_SECCHI, 
+    sd = SECMEAN,
+    clear_to_bottom = CLEAR_TO_BOTTOM
+  )
+
+trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |>
+  select(SITE_ID,
+    visit = VISIT_NO,
+    tp = PTL,
+    tn = NTL,
+    chla = CHLA) |>
+  left_join(visitdf, by = c("SITE_ID", "visit")) |>
+  select(-year, -visit) |>
+  group_by(SITE_ID, date) |>
+  summarize(tp = mean(tp, na.rm = TRUE),
+    tn = mean(tn, na.rm = TRUE),
+    chla = mean(chla, na.rm = TRUE)
+  )
+
+
+
+phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |>
+  select(SITE_ID,
+    date = DATEPHYT,
+    depth = SAMPLE_DEPTH,
+    phyta = DIVISION,
+    genus = GENUS,
+    species = SPECIES,
+    tax = TAXANAME,
+    abund = ABUND) |>
+  mutate(phyta = gsub(" .*$", "", phyta)) |>
+  filter(!is.na(genus)) |>
+  group_by(SITE_ID, date, depth, phyta, genus) |>
+  summarize(abund = sum(abund, na.rm = TRUE)) |>
+  nest(phytodf = -c(SITE_ID, date))
+
+envdf <- waterchemdf |>
+  filter(depth < 2) |>
+  select(-depth) |>
+  group_by(SITE_ID, date) |>
+  summarise_all(~mean(., na.rm = TRUE)) |>
+  ungroup() |>
+  left_join(sddf, by = c("SITE_ID", "date")) |>
+  left_join(trophicdf, by = c("SITE_ID", "date"))
+
+nla <- envdf |>
+  left_join(phytodf) |>
+  left_join(sitedf, by = "SITE_ID") |>
+  filter(!purrr::map_lgl(phytodf, is.null)) |>
+  mutate(cyanophyta = purrr::map(phytodf, ~ .x |>
+    dplyr::filter(phyta == "Cyanophyta") |>
+    summarize(cyanophyta = sum(abund, na.rm = TRUE))
+  )) |>
+  unnest(cyanophyta) |>
+  select(-phyta) |>
+  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
+
+
+# library(rmdify)
+# library(dwfun)
+# dwfun::init()
+
+```
+
+
+```{r}
+skimr::skim(nla)
+```
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tn, tp)) +
+geom_point() +
+geom_smooth(method = "lm") +
+scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x))) +
+scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x)))
+
+m1 <- lm(log10(tp) ~ log10(tn), data = nla)
+
+summary(m1)
+```
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tp, cyanophyta)) +
+geom_point() +
+geom_smooth(method = "lm") +
+scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x))) +
+scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x)))
+
+
+m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
+
+summary(m2)
+
+m3 <- glm(log10(cyanophyta) ~ log10(tp), data = nla)
+summary(m3)
+
+require(mgcv)
+# 广义加性模型 General additive model
+m4 <- gam(log10(cyanophyta) ~ s(log10(tp)) + s(log10(tn)) + s(log10(chla)), data = nla)
+plot(m4)
+summary(m4)
+```
+
+
+```{r}
+nla_split <- initial_split(nla, prop = 0.7, strata = zmax)
+nla_train <- training(nla_split)
+nla_test <- testing(nla_split)
+```
+
+```{r}
+# xgboost, cyanophyta
+
+nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom")
+# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
+nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
+  recipes::step_string2factor(all_nominal()) |>
+  recipes::step_nzv(all_nominal()) |>
+  recipes::step_log(chla, cyanophyta, tn, tp, base = 10) |>
+  recipes::step_normalize(all_numeric_predictors()) |>
+  prep()
+nla_recipe
+```
+
+```{r}
+nla_cv <- recipes::bake(
+    nla_recipe, 
+    new_data = training(nla_split)
+  ) |>
+  rsample::vfold_cv(v = 10)
+nla_cv
+```
+
+```{r}
+xgboost_model <- parsnip::boost_tree(
+  mode = "regression",
+  trees = 1000,
+  min_n = tune(),
+  tree_depth = tune(),
+  learn_rate = tune(),
+  loss_reduction = tune()
+) |>
+  set_engine("xgboost", objective = "reg:squarederror")
+xgboost_model
+```
+
+```{r}
+# grid specification
+xgboost_params <- dials::parameters(
+  min_n(),
+  tree_depth(),
+  learn_rate(),
+  loss_reduction()
+)
+xgboost_params
+```
+
+```{r}
+xgboost_grid <- dials::grid_max_entropy(
+  xgboost_params, 
+  size = 60
+)
+knitr::kable(head(xgboost_grid))
+```
+
+```{r}
+xgboost_wf <- workflows::workflow() |>
+  add_model(xgboost_model) |>
+  add_formula(nla_formula)
+xgboost_wf
+```
+
+```{r}
+# hyperparameter tuning
+if (FALSE) {
+  xgboost_tuned <- tune::tune_grid(
+    object = xgboost_wf,
+    resamples = nla_cv,
+    grid = xgboost_grid,
+    metrics = yardstick::metric_set(rmse, rsq, mae),
+    control = tune::control_grid(verbose = TRUE)
+  )
+saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
+}
+xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
+```
+
+```{r}
+xgboost_tuned |>
+  tune::show_best(metric = "rmse") |>
+  knitr::kable()
+```
+
+```{r}
+xgboost_tuned |>
+  collect_metrics()
+```
+
+```{r}
+xgboost_tuned |>
+  autoplot()
+```
+
+```{r}
+xgboost_best_params <- xgboost_tuned |>
+  tune::select_best("rmse")
+
+knitr::kable(xgboost_best_params)
+```
+
+```{r}
+xgboost_model_final <- xgboost_model |>
+  finalize_model(xgboost_best_params)
+xgboost_model_final
+```
+
+```{r}
+(train_processed <- bake(nla_recipe, new_data = nla_train))
+```
+
+```{r}
+train_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula, 
+    data    = train_processed
+  ) |>
+  # predict the sale prices for the training data
+  predict(new_data = train_processed) |>
+  bind_cols(nla_train |>
+  mutate(.obs = log10(cyanophyta)))
+xgboost_score_train <- 
+  train_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+knitr::kable(xgboost_score_train)
+```
+
+
+```{r}
+train_prediction |>
+  ggplot(aes(.pred, .obs)) +
+geom_point() +
+geom_smooth(method = "lm")
+```
+
+```{r}
+test_processed  <- bake(nla_recipe, new_data = nla_test)
+
+test_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula,
+    data    = train_processed
+  ) |>
+  # use the training model fit to predict the test data
+  predict(new_data = test_processed) |>
+  bind_cols(nla_test |>
+  mutate(.obs = log10(cyanophyta)))
+
+# measure the accuracy of our model using `yardstick`
+xgboost_score <- test_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+
+knitr::kable(xgboost_score)
+```
+
+```{r}
+cyanophyta_prediction_residual <- test_prediction |>
+  arrange(.pred) %>%
+  mutate(residual_pct = (.obs - .pred) / .pred) |>
+  select(.pred, residual_pct)
+
+cyanophyta_prediction_residual |>
+ggplot(aes(x = .pred, y = residual_pct)) +
+  geom_point() +
+  xlab("Predicted Cyanophyta") +
+  ylab("Residual (%)")
+```
+
+```{r}
+test_prediction |>
+  ggplot(aes(.pred, .obs)) +
+geom_point() +
+geom_smooth(method = "lm", colour = "black")
+```
+
--- a/coding/lesson10.qmd
+++ b/coding/lesson10.qmd
@ -0,0 +1,401 @@
+---
+title: "第10次课"
+editor: visual
+---
+
+```{r}
+require(tidymodels)
+taxi
+set.seed(123)
+taxi_split <- initial_split(taxi, prop = 0.8)
+taxi_train <- training(taxi_split)
+taxi_test <- testing(taxi_split)
+
+nrow(taxi_train)
+```
+```{r}
+tree_spec <-
+  decision_tree(cost_complexity = 0.002) %>% 
+  set_mode("classification")
+
+tree_fit <-
+  workflow(tip ~ ., tree_spec) %>% 
+  fit(data = taxi_train) 
+taxi_test |> 
+  bind_cols(
+predict(tree_fit, new_data = taxi_test))
+  
+```
+
+```{r}
+require(magrittr)
+# %>%: magrittr
+# |>: base
+taxi_folds <- vfold_cv(taxi_train, v = 5)
+tree_spec <-
+  decision_tree(cost_complexity = 0.002) |> 
+  set_mode("classification")
+taxi_wflow <- workflow() |>
+  add_formula(tip ~ .) |>
+  add_model(tree_spec)
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
+taxi_res
+taxi_metrics <- taxi_res |>
+  collect_metrics()
+
+
+# Save the assessment set results
+ctrl_taxi <- control_resamples(save_pred = TRUE)
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
+
+taxi_res |>
+  collect_predictions()
+
+taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
+
+taxi_preds |>
+  group_by(id) |>
+  taxi_metrics(truth = tip, estimate = .pred_class)
+
+```
+
+
+## 数据
+
+```{r}
+require(tidyverse)
+sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |>
+  select(SITE_ID,
+    lon = LON_DD, 
+    lat = LAT_DD, 
+    name = LAKENAME, 
+    area = LAKEAREA, 
+    zmax = DEPTHMAX
+    ) |>
+  group_by(SITE_ID) |>
+  summarize(lon = mean(lon, na.rm = TRUE),
+    lat = mean(lat, na.rm = TRUE),
+    name = unique(name),
+    area = mean(area, na.rm = TRUE),
+    zmax = mean(zmax, na.rm = TRUE))
+
+
+visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
+  select(SITE_ID,
+    date = DATE_PROFILE,
+    year = YEAR,
+    visit = VISIT_NO
+  ) |>
+  distinct()
+
+
+
+waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
+  select(SITE_ID,
+    date = DATE_PROFILE,
+    depth = DEPTH,
+    temp = TEMP_FIELD,
+    do = DO_FIELD,
+    ph = PH_FIELD,
+    cond = COND_FIELD,
+  )
+
+sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |>
+  select(SITE_ID, 
+    date = DATE_SECCHI, 
+    sd = SECMEAN,
+    clear_to_bottom = CLEAR_TO_BOTTOM
+  )
+
+trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |>
+  select(SITE_ID,
+    visit = VISIT_NO,
+    tp = PTL,
+    tn = NTL,
+    chla = CHLA) |>
+  left_join(visitdf, by = c("SITE_ID", "visit")) |>
+  select(-year, -visit) |>
+  group_by(SITE_ID, date) |>
+  summarize(tp = mean(tp, na.rm = TRUE),
+    tn = mean(tn, na.rm = TRUE),
+    chla = mean(chla, na.rm = TRUE)
+  )
+
+
+
+phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |>
+  select(SITE_ID,
+    date = DATEPHYT,
+    depth = SAMPLE_DEPTH,
+    phyta = DIVISION,
+    genus = GENUS,
+    species = SPECIES,
+    tax = TAXANAME,
+    abund = ABUND) |>
+  mutate(phyta = gsub(" .*$", "", phyta)) |>
+  filter(!is.na(genus)) |>
+  group_by(SITE_ID, date, depth, phyta, genus) |>
+  summarize(abund = sum(abund, na.rm = TRUE)) |>
+  nest(phytodf = -c(SITE_ID, date))
+
+envdf <- waterchemdf |>
+  filter(depth < 2) |>
+  select(-depth) |>
+  group_by(SITE_ID, date) |>
+  summarise_all(~mean(., na.rm = TRUE)) |>
+  ungroup() |>
+  left_join(sddf, by = c("SITE_ID", "date")) |>
+  left_join(trophicdf, by = c("SITE_ID", "date"))
+
+nla <- envdf |>
+  left_join(phytodf) |>
+  left_join(sitedf, by = "SITE_ID") |>
+  filter(!purrr::map_lgl(phytodf, is.null)) |>
+  mutate(cyanophyta = purrr::map(phytodf, ~ .x |>
+    dplyr::filter(phyta == "Cyanophyta") |>
+    summarize(cyanophyta = sum(abund, na.rm = TRUE))
+  )) |>
+  unnest(cyanophyta) |>
+  select(-phyta) |>
+  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
+
+
+# library(rmdify)
+# library(dwfun)
+# dwfun::init()
+
+```
+
+
+```{r}
+skimr::skim(nla)
+```
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tn, tp)) +
+geom_point() +
+geom_smooth(method = "lm") +
+scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x))) +
+scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x)))
+
+m1 <- lm(log10(tp) ~ log10(tn), data = nla)
+
+summary(m1)
+```
+```{r}
+nla |>
+  filter(tp > 1) |>
+  ggplot(aes(tp, cyanophyta)) +
+geom_point() +
+geom_smooth(method = "lm") +
+scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x))) +
+scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
+               labels = scales::trans_format("log10", scales::math_format(10^.x)))
+
+
+m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
+
+summary(m2)
+
+m3 <- glm(log10(cyanophyta) ~ log10(tp), data = nla)
+summary(m3)
+
+require(mgcv)
+# 广义加性模型 General additive model
+m4 <- gam(log10(cyanophyta) ~ s(log10(tp)) + s(log10(tn)) + s(log10(chla)), data = nla)
+plot(m4)
+summary(m4)
+```
+
+
+```{r}
+nla_split <- initial_split(nla, prop = 0.7, strata = zmax)
+nla_train <- training(nla_split)
+nla_test <- testing(nla_split)
+```
+
+```{r}
+# xgboost, cyanophyta
+
+nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom")
+# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
+nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
+  recipes::step_string2factor(all_nominal()) |>
+  recipes::step_nzv(all_nominal()) |>
+  recipes::step_log(chla, cyanophyta, tn, tp, base = 10) |>
+  recipes::step_normalize(all_numeric_predictors()) |>
+  prep()
+nla_recipe
+```
+
+```{r}
+nla_cv <- recipes::bake(
+    nla_recipe, 
+    new_data = nla_train
+  ) |>
+  rsample::vfold_cv(v = 10)
+nla_cv
+```
+
+```{r}
+xgboost_model <- parsnip::boost_tree(
+  mode = "regression",
+  trees = 1000,
+  min_n = tune(),
+  tree_depth = tune(),
+  learn_rate = tune(),
+  loss_reduction = tune()
+) |>
+  set_engine("xgboost", objective = "reg:squarederror")
+xgboost_model
+```
+
+```{r}
+# grid specification
+xgboost_params <- dials::parameters(
+  min_n(),
+  tree_depth(),
+  learn_rate(),
+  loss_reduction()
+)
+xgboost_params
+```
+
+```{r}
+xgboost_grid <- dials::grid_max_entropy(
+  xgboost_params, 
+  size = 60
+)
+knitr::kable(head(xgboost_grid))
+```
+
+```{r}
+xgboost_wf <- workflows::workflow() |>
+  add_model(xgboost_model) |>
+  add_formula(nla_formula)
+xgboost_wf
+```
+
+```{r}
+
+install.packages("xgboost")
+
+# hyperparameter tuning
+if (FALSE) {
+  xgboost_tuned <- tune::tune_grid(
+    object = xgboost_wf,
+    resamples = nla_cv,
+    grid = xgboost_grid,
+    metrics = yardstick::metric_set(rmse, rsq, mae),
+    control = tune::control_grid(verbose = TRUE)
+  )
+saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
+}
+xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
+```
+
+```{r}
+xgboost_tuned |>
+  tune::show_best(metric = "rmse") |>
+  knitr::kable()
+```
+
+```{r}
+xgboost_tuned |>
+  collect_metrics()
+```
+
+```{r}
+xgboost_tuned |>
+  autoplot()
+```
+
+```{r}
+xgboost_best_params <- xgboost_tuned |>
+  tune::select_best("rmse")
+
+knitr::kable(xgboost_best_params)
+```
+
+```{r}
+xgboost_model_final <- xgboost_model |>
+  finalize_model(xgboost_best_params)
+xgboost_model_final
+```
+
+```{r}
+(train_processed <- bake(nla_recipe, new_data = nla_train))
+```
+
+```{r}
+train_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula, 
+    data    = train_processed
+  ) |>
+  # predict the sale prices for the training data
+  predict(new_data = train_processed) |>
+  bind_cols(nla_train |>
+  mutate(.obs = log10(cyanophyta)))
+xgboost_score_train <- 
+  train_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+knitr::kable(xgboost_score_train)
+```
+
+
+```{r}
+train_prediction |>
+  ggplot(aes(.pred, .obs)) +
+geom_point() +
+geom_smooth(method = "lm")
+```
+
+```{r}
+test_processed  <- bake(nla_recipe, new_data = nla_test)
+
+test_prediction <- xgboost_model_final |>
+  # fit the model on all the training data
+  fit(
+    formula = nla_formula,
+    data    = train_processed
+  ) |>
+  # use the training model fit to predict the test data
+  predict(new_data = test_processed) |>
+  bind_cols(nla_test |>
+  mutate(.obs = log10(cyanophyta)))
+
+# measure the accuracy of our model using `yardstick`
+xgboost_score <- test_prediction |>
+  yardstick::metrics(.obs, .pred) |>
+  mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
+
+knitr::kable(xgboost_score)
+```
+
+```{r}
+cyanophyta_prediction_residual <- test_prediction |>
+  arrange(.pred) %>%
+  mutate(residual_pct = (.obs - .pred) / .pred) |>
+  select(.pred, residual_pct)
+
+cyanophyta_prediction_residual |>
+ggplot(aes(x = .pred, y = residual_pct)) +
+  geom_point() +
+  xlab("Predicted Cyanophyta") +
+  ylab("Residual (%)")
+```
+
+```{r}
+test_prediction |>
+  ggplot(aes(.pred, .obs)) +
+geom_point() +
+geom_smooth(method = "lm", colour = "black")
+```
+
--- a/coding/xgboost_tuned.RDS
+++ b/coding/xgboost_tuned.RDS
--- a/data/heights.RDS
+++ b/data/heights.RDS
--- a/data/licordata.RDS
+++ b/data/licordata.RDS
--- a/data/metadfdemo.RDS
+++ b/data/metadfdemo.RDS
--- a/data/writexldemo.xlsx
+++ b/data/writexldemo.xlsx
--- a/index.qmd
+++ b/index.qmd
@ -19,4 +19,7 @@ subtitle: "课程简介"

 - 网页公开：[https://drwater.rcees.ac.cn/course/public/RWEP/\@PUB/index.html](https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/index.html)
 - 课件代码：[https://drwater.rcees.ac.cn/git/course/RWEP.git](https://drwater.rcees.ac.cn/git/course/RWEP.git)
- 代码web界面：[https://on.tty-share.com/s/2YElKFjzniilfg3vH-d1PUqqmz_MQzNki5dyxWWL7QONCX_77GzsdP9EEAgBQu3ONwM/](https://on.tty-share.com/s/2YElKFjzniilfg3vH-d1PUqqmz_MQzNki5dyxWWL7QONCX_77GzsdP9EEAgBQu3ONwM/)
+- 代码web界面：[https://on.tty-share.com/s/4hC-HxrUnzWmL4JnbopI4W-qrYU1oQgeYuvyJNaGLREcxl0_wbNAJ5UYRf8oEa0ayp4/](https://on.tty-share.com/s/4hC-HxrUnzWmL4JnbopI4W-qrYU1oQgeYuvyJNaGLREcxl0_wbNAJ5UYRf8oEa0ayp4/) 
+
+ <!-- `tty-share --public -readonly -command tmux` -->
+