---
title: "Lesson 9"
format: html
---


```{r}
# install.packages("tidymodels")

require(tidymodels)
taxi

taxisplit <- initial_split(taxi, prop = 0.8)
taxi_train <- training(taxisplit)
taxi_test <- testing(taxisplit)

tree_spec <-
  decision_tree(cost_complexity = 0.002) %>%
  set_mode("classification")

taxi_fit <- workflow() %>%
  add_formula(tip ~ .) %>%
  add_model(tree_spec) %>%
  fit(data = taxi_train)
```


```{r}
augment(taxi_fit, new_data = taxi_train) %>%
  relocate(tip, .pred_class, .pred_yes, .pred_no)

augment(taxi_fit, new_data = taxi_train) %>%
  conf_mat(truth = tip, estimate = .pred_class)

augment(taxi_fit, new_data = taxi_train) %>%
  accuracy(truth = tip, estimate = .pred_class)

augment(taxi_fit, new_data = taxi_train) %>%
  sensitivity(truth = tip, estimate = .pred_class)

augment(taxi_fit, new_data = taxi_train) %>%
  specificity(truth = tip, estimate = .pred_class)

taxi_metrics <- metric_set(accuracy, specificity, sensitivity)

augment(taxi_fit, new_data = taxi_train) %>%
  taxi_metrics(truth = tip, estimate = .pred_class)

taxi_metrics <- metric_set(accuracy, specificity, sensitivity)

augment(taxi_fit, new_data = taxi_train) %>%
  group_by(local) %>%
  taxi_metrics(truth = tip, estimate = .pred_class)

augment(taxi_fit, new_data = taxi_train) %>%
  roc_curve(truth = tip, .pred_yes) %>%
  autoplot()


augment(taxi_fit, new_data = taxi_train)


augment(taxi_fit, new_data = taxi_train) %>%
  roc_curve(truth = tip, .pred_yes) |>
  ggplot(aes(1 - sensitivity, specificity)) +
  geom_point() +
  geom_line() +
  geom_abline(slope = 1)
```


# Cross Validation

```{r}
vfold_cv(taxi_train, v = 10) |>
  pull(splits) |>
  nth(1)
taxi_folds <- vfold_cv(taxi_train)
taxi_folds$splits[1:3]

vfold_cv(taxi_train, strata = tip)

set.seed(123)
taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip)
taxi_folds

taxi_wflow <- workflow() %>%
  add_formula(tip ~ .) %>%
  add_model(tree_spec)


taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
taxi_res


taxi_res$.metrics[[1]]


taxi_res$splits[[1]]

analysis(taxi_res$splits[[1]])

analysis(taxi_res$splits[[1]])
assessment(taxi_res$splits[[1]])


taxi_res %>%
  collect_metrics()

taxi_res %>%
  collect_metrics() %>%
  select(.metric, mean, n)

# Save the assessment set results
ctrl_taxi <- control_resamples(save_pred = TRUE)
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)

taxi_res
```


# NLA2007 cyanophyta model


```{r}
require(tidyverse)
sitedf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
) |>
  select(
    SITE_ID,
    lon = LON_DD,
    lat = LAT_DD,
    name = LAKENAME,
    area = LAKEAREA,
    zmax = DEPTHMAX
  ) |>
  group_by(SITE_ID) |>
  summarize(
    lon = mean(lon, na.rm = TRUE),
    lat = mean(lat, na.rm = TRUE),
    name = unique(name),
    area = mean(area, na.rm = TRUE),
    zmax = mean(zmax, na.rm = TRUE)
  )


visitdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
  select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
  distinct()


waterchemdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
  select(
    SITE_ID,
    date = DATE_PROFILE,
    depth = DEPTH,
    temp = TEMP_FIELD,
    do = DO_FIELD,
    ph = PH_FIELD,
    cond = COND_FIELD,
  )

sddf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
) |>
  select(
    SITE_ID,
    date = DATE_SECCHI,
    sd = SECMEAN,
    clear_to_bottom = CLEAR_TO_BOTTOM
  )

trophicdf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
) |>
  select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
  left_join(visitdf, by = c("SITE_ID", "visit")) |>
  select(-year, -visit) |>
  group_by(SITE_ID, date) |>
  summarize(
    tp = mean(tp, na.rm = TRUE),
    tn = mean(tn, na.rm = TRUE),
    chla = mean(chla, na.rm = TRUE)
  )


phytodf <- readr::read_csv(
  "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
) |>
  select(
    SITE_ID,
    date = DATEPHYT,
    depth = SAMPLE_DEPTH,
    phyta = DIVISION,
    genus = GENUS,
    species = SPECIES,
    tax = TAXANAME,
    abund = ABUND
  ) |>
  mutate(phyta = gsub(" .*$", "", phyta)) |>
  filter(!is.na(genus)) |>
  group_by(SITE_ID, date, depth, phyta, genus) |>
  summarize(abund = sum(abund, na.rm = TRUE)) |>
  nest(phytodf = -c(SITE_ID, date))

phytodf$phytodf[[1]]


envdf <- waterchemdf |>
  filter(depth < 2) |>
  select(-depth) |>
  group_by(SITE_ID, date) |>
  summarise_all(~ mean(., na.rm = TRUE)) |>
  ungroup() |>
  left_join(sddf, by = c("SITE_ID", "date")) |>
  left_join(trophicdf, by = c("SITE_ID", "date"))

nla <- envdf |>
  left_join(phytodf, by = c("SITE_ID", "date")) |>
  left_join(sitedf, by = "SITE_ID") |>
  filter(!purrr::map_lgl(phytodf, is.null)) |>
  mutate(
    cyanophyta = purrr::map(
      phytodf,
      \(x) {
        x |>
          dplyr::filter(phyta == "Cyanophyta") |>
          summarize(cyanophyta = sum(abund, na.rm = TRUE))
      }
    )
  ) |>
  unnest(cyanophyta) |>
  select(-phyta) |>
  mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))

# library(rmdify)
# library(dwfun)
# dwfun::init()
```