RWEP/SD/5.1_model/_demo.qmd

255 lines
5.3 KiB
Plaintext

---
title: "Lesson 9"
format: html
---
```{r}
# install.packages("tidymodels")
require(tidymodels)
taxi
taxisplit <- initial_split(taxi, prop = 0.8)
taxi_train <- training(taxisplit)
taxi_test <- testing(taxisplit)
tree_spec <-
decision_tree(cost_complexity = 0.002) %>%
set_mode("classification")
taxi_fit <- workflow() %>%
add_formula(tip ~ .) %>%
add_model(tree_spec) %>%
fit(data = taxi_train)
```
```{r}
augment(taxi_fit, new_data = taxi_train) %>%
relocate(tip, .pred_class, .pred_yes, .pred_no)
augment(taxi_fit, new_data = taxi_train) %>%
conf_mat(truth = tip, estimate = .pred_class)
augment(taxi_fit, new_data = taxi_train) %>%
accuracy(truth = tip, estimate = .pred_class)
augment(taxi_fit, new_data = taxi_train) %>%
sensitivity(truth = tip, estimate = .pred_class)
augment(taxi_fit, new_data = taxi_train) %>%
specificity(truth = tip, estimate = .pred_class)
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
augment(taxi_fit, new_data = taxi_train) %>%
taxi_metrics(truth = tip, estimate = .pred_class)
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
augment(taxi_fit, new_data = taxi_train) %>%
group_by(local) %>%
taxi_metrics(truth = tip, estimate = .pred_class)
augment(taxi_fit, new_data = taxi_train) %>%
roc_curve(truth = tip, .pred_yes) %>%
autoplot()
augment(taxi_fit, new_data = taxi_train)
augment(taxi_fit, new_data = taxi_train) %>%
roc_curve(truth = tip, .pred_yes) |>
ggplot(aes(1 - sensitivity, specificity)) +
geom_point() +
geom_line() +
geom_abline(slope = 1)
```
# Cross Validation
```{r}
vfold_cv(taxi_train, v = 10) |>
pull(splits) |>
nth(1)
taxi_folds <- vfold_cv(taxi_train)
taxi_folds$splits[1:3]
vfold_cv(taxi_train, strata = tip)
set.seed(123)
taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
taxi_folds
taxi_wflow <- workflow() %>%
add_formula(tip ~ .) %>%
add_model(tree_spec)
taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
taxi_res
taxi_res$.metrics[[1]]
taxi_res$splits[[1]]
analysis(taxi_res$splits[[1]])
analysis(taxi_res$splits[[1]])
assessment(taxi_res$splits[[1]])
taxi_res %>%
collect_metrics()
taxi_res %>%
collect_metrics() %>%
select(.metric, mean, n)
# Save the assessment set results
ctrl_taxi <- control_resamples(save_pred = TRUE)
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
taxi_res
```
# NLA2007 cyanophyta model
```{r}
require(tidyverse)
sitedf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
) |>
select(
SITE_ID,
lon = LON_DD,
lat = LAT_DD,
name = LAKENAME,
area = LAKEAREA,
zmax = DEPTHMAX
) |>
group_by(SITE_ID) |>
summarize(
lon = mean(lon, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
name = unique(name),
area = mean(area, na.rm = TRUE),
zmax = mean(zmax, na.rm = TRUE)
)
visitdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
distinct()
waterchemdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
select(
SITE_ID,
date = DATE_PROFILE,
depth = DEPTH,
temp = TEMP_FIELD,
do = DO_FIELD,
ph = PH_FIELD,
cond = COND_FIELD,
)
sddf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
) |>
select(
SITE_ID,
date = DATE_SECCHI,
sd = SECMEAN,
clear_to_bottom = CLEAR_TO_BOTTOM
)
trophicdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
) |>
select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
left_join(visitdf, by = c("SITE_ID", "visit")) |>
select(-year, -visit) |>
group_by(SITE_ID, date) |>
summarize(
tp = mean(tp, na.rm = TRUE),
tn = mean(tn, na.rm = TRUE),
chla = mean(chla, na.rm = TRUE)
)
phytodf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
) |>
select(
SITE_ID,
date = DATEPHYT,
depth = SAMPLE_DEPTH,
phyta = DIVISION,
genus = GENUS,
species = SPECIES,
tax = TAXANAME,
abund = ABUND
) |>
mutate(phyta = gsub(" .*$", "", phyta)) |>
filter(!is.na(genus)) |>
group_by(SITE_ID, date, depth, phyta, genus) |>
summarize(abund = sum(abund, na.rm = TRUE)) |>
nest(phytodf = -c(SITE_ID, date))
phytodf$phytodf[[1]]
envdf <- waterchemdf |>
filter(depth < 2) |>
select(-depth) |>
group_by(SITE_ID, date) |>
summarise_all(~ mean(., na.rm = TRUE)) |>
ungroup() |>
left_join(sddf, by = c("SITE_ID", "date")) |>
left_join(trophicdf, by = c("SITE_ID", "date"))
nla <- envdf |>
left_join(phytodf, by = c("SITE_ID", "date")) |>
left_join(sitedf, by = "SITE_ID") |>
filter(!purrr::map_lgl(phytodf, is.null)) |>
mutate(
cyanophyta = purrr::map(
phytodf,
\(x) {
x |>
dplyr::filter(phyta == "Cyanophyta") |>
summarize(cyanophyta = sum(abund, na.rm = TRUE))
}
)
) |>
unnest(cyanophyta) |>
select(-phyta) |>
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
# library(rmdify)
# library(dwfun)
# dwfun::init()
```