update

2025-04-09 22:28:17 +08:00
parent 6412cc5560
commit 9ecb11b788
28 changed files with 5215 additions and 243 deletions
--- a/SD/5.1_model/_demo.qmd
+++ b/SD/5.1_model/_demo.qmd
@@ -0,0 +1,254 @@
+---
+title: "Lesson 9"
+format: html
+---
+
+
+```{r}
+# install.packages("tidymodels")
+
+require(tidymodels)
+taxi
+
+taxisplit <- initial_split(taxi, prop = 0.8)
+taxi_train <- training(taxisplit)
+taxi_test <- testing(taxisplit)
+
+tree_spec <-
+  decision_tree(cost_complexity = 0.002) %>%
+  set_mode("classification")
+
+taxi_fit <- workflow() %>%
+  add_formula(tip ~ .) %>%
+  add_model(tree_spec) %>%
+  fit(data = taxi_train)
+```
+
+
+```{r}
+augment(taxi_fit, new_data = taxi_train) %>%
+  relocate(tip, .pred_class, .pred_yes, .pred_no)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  conf_mat(truth = tip, estimate = .pred_class)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  accuracy(truth = tip, estimate = .pred_class)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  sensitivity(truth = tip, estimate = .pred_class)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  specificity(truth = tip, estimate = .pred_class)
+
+taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  taxi_metrics(truth = tip, estimate = .pred_class)
+
+taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  group_by(local) %>%
+  taxi_metrics(truth = tip, estimate = .pred_class)
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  roc_curve(truth = tip, .pred_yes) %>%
+  autoplot()
+
+
+augment(taxi_fit, new_data = taxi_train)
+
+
+augment(taxi_fit, new_data = taxi_train) %>%
+  roc_curve(truth = tip, .pred_yes) |>
+  ggplot(aes(1 - sensitivity, specificity)) +
+  geom_point() +
+  geom_line() +
+  geom_abline(slope = 1)
+```
+
+
+# Cross Validation
+
+```{r}
+vfold_cv(taxi_train, v = 10) |>
+  pull(splits) |>
+  nth(1)
+taxi_folds <- vfold_cv(taxi_train)
+taxi_folds$splits[1:3]
+
+vfold_cv(taxi_train, strata = tip)
+
+set.seed(123)
+taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
+taxi_folds
+
+taxi_wflow <- workflow() %>%
+  add_formula(tip ~ .) %>%
+  add_model(tree_spec)
+
+
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
+taxi_res
+
+
+taxi_res$.metrics[[1]]
+
+
+taxi_res$splits[[1]]
+
+analysis(taxi_res$splits[[1]])
+
+analysis(taxi_res$splits[[1]])
+assessment(taxi_res$splits[[1]])
+
+
+taxi_res %>%
+  collect_metrics()
+
+taxi_res %>%
+  collect_metrics() %>%
+  select(.metric, mean, n)
+
+# Save the assessment set results
+ctrl_taxi <- control_resamples(save_pred = TRUE)
+taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
+
+taxi_res
+```
+
+
+
+
+
+
+
+# NLA2007 cyanophyta model
+
+
+```{r}
+require(tidyverse)
+sitedf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
+) |>
+  select(
+    SITE_ID,
+    lon = LON_DD,
+    lat = LAT_DD,
+    name = LAKENAME,
+    area = LAKEAREA,
+    zmax = DEPTHMAX
+  ) |>
+  group_by(SITE_ID) |>
+  summarize(
+    lon = mean(lon, na.rm = TRUE),
+    lat = mean(lat, na.rm = TRUE),
+    name = unique(name),
+    area = mean(area, na.rm = TRUE),
+    zmax = mean(zmax, na.rm = TRUE)
+  )
+
+
+visitdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
+) |>
+  select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
+  distinct()
+
+
+waterchemdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATE_PROFILE,
+    depth = DEPTH,
+    temp = TEMP_FIELD,
+    do = DO_FIELD,
+    ph = PH_FIELD,
+    cond = COND_FIELD,
+  )
+
+sddf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATE_SECCHI,
+    sd = SECMEAN,
+    clear_to_bottom = CLEAR_TO_BOTTOM
+  )
+
+trophicdf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
+) |>
+  select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
+  left_join(visitdf, by = c("SITE_ID", "visit")) |>
+  select(-year, -visit) |>
+  group_by(SITE_ID, date) |>
+  summarize(
+    tp = mean(tp, na.rm = TRUE),
+    tn = mean(tn, na.rm = TRUE),
+    chla = mean(chla, na.rm = TRUE)
+  )
+
+
+phytodf <- readr::read_csv(
+  "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
+) |>
+  select(
+    SITE_ID,
+    date = DATEPHYT,
+    depth = SAMPLE_DEPTH,
+    phyta = DIVISION,
+    genus = GENUS,
+    species = SPECIES,
+    tax = TAXANAME,
+    abund = ABUND
+  ) |>
+  mutate(phyta = gsub(" .*$", "", phyta)) |>
+  filter(!is.na(genus)) |>
+  group_by(SITE_ID, date, depth, phyta, genus) |>
+  summarize(abund = sum(abund, na.rm = TRUE)) |>
+  nest(phytodf = -c(SITE_ID, date))
+
+phytodf$phytodf[[1]]
+
+
+envdf <- waterchemdf |>
+  filter(depth < 2) |>
+  select(-depth) |>
+  group_by(SITE_ID, date) |>
+  summarise_all(~ mean(., na.rm = TRUE)) |>
+  ungroup() |>
+  left_join(sddf, by = c("SITE_ID", "date")) |>
+  left_join(trophicdf, by = c("SITE_ID", "date"))
+
+nla <- envdf |>
+  left_join(phytodf, by = c("SITE_ID", "date")) |>
+  left_join(sitedf, by = "SITE_ID") |>
+  filter(!purrr::map_lgl(phytodf, is.null)) |>
+  mutate(
+    cyanophyta = purrr::map(
+      phytodf,
+      \(x) {
+        x |>
+          dplyr::filter(phyta == "Cyanophyta") |>
+          summarize(cyanophyta = sum(abund, na.rm = TRUE))
+      }
+    )
+  ) |>
+  unnest(cyanophyta) |>
+  select(-phyta) |>
+  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
+
+# library(rmdify)
+# library(dwfun)
+# dwfun::init()
+```
+
+
+
+
+