update
This commit is contained in:
254
SD/5.1_model/_demo.qmd
Normal file
254
SD/5.1_model/_demo.qmd
Normal file
@@ -0,0 +1,254 @@
|
||||
---
|
||||
title: "Lesson 9"
|
||||
format: html
|
||||
---
|
||||
|
||||
|
||||
```{r}
|
||||
# install.packages("tidymodels")
|
||||
|
||||
require(tidymodels)
|
||||
taxi
|
||||
|
||||
taxisplit <- initial_split(taxi, prop = 0.8)
|
||||
taxi_train <- training(taxisplit)
|
||||
taxi_test <- testing(taxisplit)
|
||||
|
||||
tree_spec <-
|
||||
decision_tree(cost_complexity = 0.002) %>%
|
||||
set_mode("classification")
|
||||
|
||||
taxi_fit <- workflow() %>%
|
||||
add_formula(tip ~ .) %>%
|
||||
add_model(tree_spec) %>%
|
||||
fit(data = taxi_train)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
relocate(tip, .pred_class, .pred_yes, .pred_no)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
conf_mat(truth = tip, estimate = .pred_class)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
accuracy(truth = tip, estimate = .pred_class)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
sensitivity(truth = tip, estimate = .pred_class)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
specificity(truth = tip, estimate = .pred_class)
|
||||
|
||||
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
taxi_metrics(truth = tip, estimate = .pred_class)
|
||||
|
||||
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
group_by(local) %>%
|
||||
taxi_metrics(truth = tip, estimate = .pred_class)
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
roc_curve(truth = tip, .pred_yes) %>%
|
||||
autoplot()
|
||||
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train)
|
||||
|
||||
|
||||
augment(taxi_fit, new_data = taxi_train) %>%
|
||||
roc_curve(truth = tip, .pred_yes) |>
|
||||
ggplot(aes(1 - sensitivity, specificity)) +
|
||||
geom_point() +
|
||||
geom_line() +
|
||||
geom_abline(slope = 1)
|
||||
```
|
||||
|
||||
|
||||
# Cross Validation
|
||||
|
||||
```{r}
|
||||
vfold_cv(taxi_train, v = 10) |>
|
||||
pull(splits) |>
|
||||
nth(1)
|
||||
taxi_folds <- vfold_cv(taxi_train)
|
||||
taxi_folds$splits[1:3]
|
||||
|
||||
vfold_cv(taxi_train, strata = tip)
|
||||
|
||||
set.seed(123)
|
||||
taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
|
||||
taxi_folds
|
||||
|
||||
taxi_wflow <- workflow() %>%
|
||||
add_formula(tip ~ .) %>%
|
||||
add_model(tree_spec)
|
||||
|
||||
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
|
||||
taxi_res
|
||||
|
||||
|
||||
taxi_res$.metrics[[1]]
|
||||
|
||||
|
||||
taxi_res$splits[[1]]
|
||||
|
||||
analysis(taxi_res$splits[[1]])
|
||||
|
||||
analysis(taxi_res$splits[[1]])
|
||||
assessment(taxi_res$splits[[1]])
|
||||
|
||||
|
||||
taxi_res %>%
|
||||
collect_metrics()
|
||||
|
||||
taxi_res %>%
|
||||
collect_metrics() %>%
|
||||
select(.metric, mean, n)
|
||||
|
||||
# Save the assessment set results
|
||||
ctrl_taxi <- control_resamples(save_pred = TRUE)
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
|
||||
|
||||
taxi_res
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# NLA2007 cyanophyta model
|
||||
|
||||
|
||||
```{r}
|
||||
require(tidyverse)
|
||||
sitedf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
|
||||
) |>
|
||||
select(
|
||||
SITE_ID,
|
||||
lon = LON_DD,
|
||||
lat = LAT_DD,
|
||||
name = LAKENAME,
|
||||
area = LAKEAREA,
|
||||
zmax = DEPTHMAX
|
||||
) |>
|
||||
group_by(SITE_ID) |>
|
||||
summarize(
|
||||
lon = mean(lon, na.rm = TRUE),
|
||||
lat = mean(lat, na.rm = TRUE),
|
||||
name = unique(name),
|
||||
area = mean(area, na.rm = TRUE),
|
||||
zmax = mean(zmax, na.rm = TRUE)
|
||||
)
|
||||
|
||||
|
||||
visitdf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
|
||||
) |>
|
||||
select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
|
||||
distinct()
|
||||
|
||||
|
||||
waterchemdf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
|
||||
) |>
|
||||
select(
|
||||
SITE_ID,
|
||||
date = DATE_PROFILE,
|
||||
depth = DEPTH,
|
||||
temp = TEMP_FIELD,
|
||||
do = DO_FIELD,
|
||||
ph = PH_FIELD,
|
||||
cond = COND_FIELD,
|
||||
)
|
||||
|
||||
sddf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
|
||||
) |>
|
||||
select(
|
||||
SITE_ID,
|
||||
date = DATE_SECCHI,
|
||||
sd = SECMEAN,
|
||||
clear_to_bottom = CLEAR_TO_BOTTOM
|
||||
)
|
||||
|
||||
trophicdf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
|
||||
) |>
|
||||
select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
|
||||
left_join(visitdf, by = c("SITE_ID", "visit")) |>
|
||||
select(-year, -visit) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarize(
|
||||
tp = mean(tp, na.rm = TRUE),
|
||||
tn = mean(tn, na.rm = TRUE),
|
||||
chla = mean(chla, na.rm = TRUE)
|
||||
)
|
||||
|
||||
|
||||
phytodf <- readr::read_csv(
|
||||
"https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
|
||||
) |>
|
||||
select(
|
||||
SITE_ID,
|
||||
date = DATEPHYT,
|
||||
depth = SAMPLE_DEPTH,
|
||||
phyta = DIVISION,
|
||||
genus = GENUS,
|
||||
species = SPECIES,
|
||||
tax = TAXANAME,
|
||||
abund = ABUND
|
||||
) |>
|
||||
mutate(phyta = gsub(" .*$", "", phyta)) |>
|
||||
filter(!is.na(genus)) |>
|
||||
group_by(SITE_ID, date, depth, phyta, genus) |>
|
||||
summarize(abund = sum(abund, na.rm = TRUE)) |>
|
||||
nest(phytodf = -c(SITE_ID, date))
|
||||
|
||||
phytodf$phytodf[[1]]
|
||||
|
||||
|
||||
envdf <- waterchemdf |>
|
||||
filter(depth < 2) |>
|
||||
select(-depth) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarise_all(~ mean(., na.rm = TRUE)) |>
|
||||
ungroup() |>
|
||||
left_join(sddf, by = c("SITE_ID", "date")) |>
|
||||
left_join(trophicdf, by = c("SITE_ID", "date"))
|
||||
|
||||
nla <- envdf |>
|
||||
left_join(phytodf, by = c("SITE_ID", "date")) |>
|
||||
left_join(sitedf, by = "SITE_ID") |>
|
||||
filter(!purrr::map_lgl(phytodf, is.null)) |>
|
||||
mutate(
|
||||
cyanophyta = purrr::map(
|
||||
phytodf,
|
||||
\(x) {
|
||||
x |>
|
||||
dplyr::filter(phyta == "Cyanophyta") |>
|
||||
summarize(cyanophyta = sum(abund, na.rm = TRUE))
|
||||
}
|
||||
)
|
||||
) |>
|
||||
unnest(cyanophyta) |>
|
||||
select(-phyta) |>
|
||||
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
|
||||
|
||||
# library(rmdify)
|
||||
# library(dwfun)
|
||||
# dwfun::init()
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user