--- title: "Lesson 9" format: html --- ```{r} # install.packages("tidymodels") require(tidymodels) taxi taxisplit <- initial_split(taxi, prop = 0.8) taxi_train <- training(taxisplit) taxi_test <- testing(taxisplit) tree_spec <- decision_tree(cost_complexity = 0.002) %>% set_mode("classification") taxi_fit <- workflow() %>% add_formula(tip ~ .) %>% add_model(tree_spec) %>% fit(data = taxi_train) ``` ```{r} augment(taxi_fit, new_data = taxi_train) %>% relocate(tip, .pred_class, .pred_yes, .pred_no) augment(taxi_fit, new_data = taxi_train) %>% conf_mat(truth = tip, estimate = .pred_class) augment(taxi_fit, new_data = taxi_train) %>% accuracy(truth = tip, estimate = .pred_class) augment(taxi_fit, new_data = taxi_train) %>% sensitivity(truth = tip, estimate = .pred_class) augment(taxi_fit, new_data = taxi_train) %>% specificity(truth = tip, estimate = .pred_class) taxi_metrics <- metric_set(accuracy, specificity, sensitivity) augment(taxi_fit, new_data = taxi_train) %>% taxi_metrics(truth = tip, estimate = .pred_class) taxi_metrics <- metric_set(accuracy, specificity, sensitivity) augment(taxi_fit, new_data = taxi_train) %>% group_by(local) %>% taxi_metrics(truth = tip, estimate = .pred_class) augment(taxi_fit, new_data = taxi_train) %>% roc_curve(truth = tip, .pred_yes) %>% autoplot() augment(taxi_fit, new_data = taxi_train) augment(taxi_fit, new_data = taxi_train) %>% roc_curve(truth = tip, .pred_yes) |> ggplot(aes(1 - sensitivity, specificity)) + geom_point() + geom_line() + geom_abline(slope = 1) ``` # Cross Validation ```{r} vfold_cv(taxi_train, v = 10) |> pull(splits) |> nth(1) taxi_folds <- vfold_cv(taxi_train) taxi_folds$splits[1:3] vfold_cv(taxi_train, strata = tip) set.seed(123) taxi_folds <- vfold_cv(taxi_train, v = 10, strata = tip) taxi_folds taxi_wflow <- workflow() %>% add_formula(tip ~ .) %>% add_model(tree_spec) taxi_res <- fit_resamples(taxi_wflow, taxi_folds) taxi_res taxi_res$.metrics[[1]] taxi_res$splits[[1]] analysis(taxi_res$splits[[1]]) analysis(taxi_res$splits[[1]]) assessment(taxi_res$splits[[1]]) taxi_res %>% collect_metrics() taxi_res %>% collect_metrics() %>% select(.metric, mean, n) # Save the assessment set results ctrl_taxi <- control_resamples(save_pred = TRUE) taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi) taxi_res ``` # NLA2007 cyanophyta model ```{r} require(tidyverse) sitedf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv" ) |> select( SITE_ID, lon = LON_DD, lat = LAT_DD, name = LAKENAME, area = LAKEAREA, zmax = DEPTHMAX ) |> group_by(SITE_ID) |> summarize( lon = mean(lon, na.rm = TRUE), lat = mean(lat, na.rm = TRUE), name = unique(name), area = mean(area, na.rm = TRUE), zmax = mean(zmax, na.rm = TRUE) ) visitdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" ) |> select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |> distinct() waterchemdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv" ) |> select( SITE_ID, date = DATE_PROFILE, depth = DEPTH, temp = TEMP_FIELD, do = DO_FIELD, ph = PH_FIELD, cond = COND_FIELD, ) sddf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv" ) |> select( SITE_ID, date = DATE_SECCHI, sd = SECMEAN, clear_to_bottom = CLEAR_TO_BOTTOM ) trophicdf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv" ) |> select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |> left_join(visitdf, by = c("SITE_ID", "visit")) |> select(-year, -visit) |> group_by(SITE_ID, date) |> summarize( tp = mean(tp, na.rm = TRUE), tn = mean(tn, na.rm = TRUE), chla = mean(chla, na.rm = TRUE) ) phytodf <- readr::read_csv( "https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv" ) |> select( SITE_ID, date = DATEPHYT, depth = SAMPLE_DEPTH, phyta = DIVISION, genus = GENUS, species = SPECIES, tax = TAXANAME, abund = ABUND ) |> mutate(phyta = gsub(" .*$", "", phyta)) |> filter(!is.na(genus)) |> group_by(SITE_ID, date, depth, phyta, genus) |> summarize(abund = sum(abund, na.rm = TRUE)) |> nest(phytodf = -c(SITE_ID, date)) phytodf$phytodf[[1]] envdf <- waterchemdf |> filter(depth < 2) |> select(-depth) |> group_by(SITE_ID, date) |> summarise_all(~ mean(., na.rm = TRUE)) |> ungroup() |> left_join(sddf, by = c("SITE_ID", "date")) |> left_join(trophicdf, by = c("SITE_ID", "date")) nla <- envdf |> left_join(phytodf, by = c("SITE_ID", "date")) |> left_join(sitedf, by = "SITE_ID") |> filter(!purrr::map_lgl(phytodf, is.null)) |> mutate( cyanophyta = purrr::map( phytodf, \(x) { x |> dplyr::filter(phyta == "Cyanophyta") |> summarize(cyanophyta = sum(abund, na.rm = TRUE)) } ) ) |> unnest(cyanophyta) |> select(-phyta) |> mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE)) # library(rmdify) # library(dwfun) # dwfun::init() ```