--- title: "第10次课" editor: visual --- ```{r} require(tidymodels) taxi set.seed(123) taxi_split <- initial_split(taxi, prop = 0.8) taxi_train <- training(taxi_split) taxi_test <- testing(taxi_split) nrow(taxi_train) ``` ```{r} tree_spec <- decision_tree(cost_complexity = 0.002) %>% set_mode("classification") tree_fit <- workflow(tip ~ ., tree_spec) %>% fit(data = taxi_train) taxi_test |> bind_cols( predict(tree_fit, new_data = taxi_test)) ``` ```{r} require(magrittr) # %>%: magrittr # |>: base taxi_folds <- vfold_cv(taxi_train, v = 5) tree_spec <- decision_tree(cost_complexity = 0.002) |> set_mode("classification") taxi_wflow <- workflow() |> add_formula(tip ~ .) |> add_model(tree_spec) taxi_res <- fit_resamples(taxi_wflow, taxi_folds) taxi_res taxi_metrics <- taxi_res |> collect_metrics() # Save the assessment set results ctrl_taxi <- control_resamples(save_pred = TRUE) taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi) taxi_res |> collect_predictions() taxi_metrics <- metric_set(accuracy, specificity, sensitivity) taxi_preds |> group_by(id) |> taxi_metrics(truth = tip, estimate = .pred_class) ``` ## 数据 ```{r} require(tidyverse) sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |> select(SITE_ID, lon = LON_DD, lat = LAT_DD, name = LAKENAME, area = LAKEAREA, zmax = DEPTHMAX ) |> group_by(SITE_ID) |> summarize(lon = mean(lon, na.rm = TRUE), lat = mean(lat, na.rm = TRUE), name = unique(name), area = mean(area, na.rm = TRUE), zmax = mean(zmax, na.rm = TRUE)) visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |> select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO ) |> distinct() waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |> select(SITE_ID, date = DATE_PROFILE, depth = DEPTH, temp = TEMP_FIELD, do = DO_FIELD, ph = PH_FIELD, cond = COND_FIELD, ) sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |> select(SITE_ID, date = DATE_SECCHI, sd = SECMEAN, clear_to_bottom = CLEAR_TO_BOTTOM ) trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |> select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |> left_join(visitdf, by = c("SITE_ID", "visit")) |> select(-year, -visit) |> group_by(SITE_ID, date) |> summarize(tp = mean(tp, na.rm = TRUE), tn = mean(tn, na.rm = TRUE), chla = mean(chla, na.rm = TRUE) ) phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |> select(SITE_ID, date = DATEPHYT, depth = SAMPLE_DEPTH, phyta = DIVISION, genus = GENUS, species = SPECIES, tax = TAXANAME, abund = ABUND) |> mutate(phyta = gsub(" .*$", "", phyta)) |> filter(!is.na(genus)) |> group_by(SITE_ID, date, depth, phyta, genus) |> summarize(abund = sum(abund, na.rm = TRUE)) |> nest(phytodf = -c(SITE_ID, date)) envdf <- waterchemdf |> filter(depth < 2) |> select(-depth) |> group_by(SITE_ID, date) |> summarise_all(~mean(., na.rm = TRUE)) |> ungroup() |> left_join(sddf, by = c("SITE_ID", "date")) |> left_join(trophicdf, by = c("SITE_ID", "date")) nla <- envdf |> left_join(phytodf) |> left_join(sitedf, by = "SITE_ID") |> filter(!purrr::map_lgl(phytodf, is.null)) |> mutate(cyanophyta = purrr::map(phytodf, ~ .x |> dplyr::filter(phyta == "Cyanophyta") |> summarize(cyanophyta = sum(abund, na.rm = TRUE)) )) |> unnest(cyanophyta) |> select(-phyta) |> mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE)) # library(rmdify) # library(dwfun) # dwfun::init() ``` ```{r} skimr::skim(nla) ``` ```{r} nla |> filter(tp > 1) |> ggplot(aes(tn, tp)) + geom_point() + geom_smooth(method = "lm") + scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) + scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) m1 <- lm(log10(tp) ~ log10(tn), data = nla) summary(m1) ``` ```{r} nla |> filter(tp > 1) |> ggplot(aes(tp, cyanophyta)) + geom_point() + geom_smooth(method = "lm") + scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) + scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x), labels = scales::trans_format("log10", scales::math_format(10^.x))) m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla) summary(m2) m3 <- glm(log10(cyanophyta) ~ log10(tp), data = nla) summary(m3) require(mgcv) # 广义加性模型 General additive model m4 <- gam(log10(cyanophyta) ~ s(log10(tp)) + s(log10(tn)) + s(log10(chla)), data = nla) plot(m4) summary(m4) ``` ```{r} nla_split <- initial_split(nla, prop = 0.7, strata = zmax) nla_train <- training(nla_split) nla_test <- testing(nla_split) ``` ```{r} # xgboost, cyanophyta nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom") # nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn") nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |> recipes::step_string2factor(all_nominal()) |> recipes::step_nzv(all_nominal()) |> recipes::step_log(chla, cyanophyta, tn, tp, base = 10) |> recipes::step_normalize(all_numeric_predictors()) |> prep() nla_recipe ``` ```{r} nla_cv <- recipes::bake( nla_recipe, new_data = nla_train ) |> rsample::vfold_cv(v = 10) nla_cv ``` ```{r} xgboost_model <- parsnip::boost_tree( mode = "regression", trees = 1000, min_n = tune(), tree_depth = tune(), learn_rate = tune(), loss_reduction = tune() ) |> set_engine("xgboost", objective = "reg:squarederror") xgboost_model ``` ```{r} # grid specification xgboost_params <- dials::parameters( min_n(), tree_depth(), learn_rate(), loss_reduction() ) xgboost_params ``` ```{r} xgboost_grid <- dials::grid_max_entropy( xgboost_params, size = 60 ) knitr::kable(head(xgboost_grid)) ``` ```{r} xgboost_wf <- workflows::workflow() |> add_model(xgboost_model) |> add_formula(nla_formula) xgboost_wf ``` ```{r} install.packages("xgboost") # hyperparameter tuning if (FALSE) { xgboost_tuned <- tune::tune_grid( object = xgboost_wf, resamples = nla_cv, grid = xgboost_grid, metrics = yardstick::metric_set(rmse, rsq, mae), control = tune::control_grid(verbose = TRUE) ) saveRDS(xgboost_tuned, "./xgboost_tuned.RDS") } xgboost_tuned <- readRDS("./xgboost_tuned.RDS") ``` ```{r} xgboost_tuned |> tune::show_best(metric = "rmse") |> knitr::kable() ``` ```{r} xgboost_tuned |> collect_metrics() ``` ```{r} xgboost_tuned |> autoplot() ``` ```{r} xgboost_best_params <- xgboost_tuned |> tune::select_best("rmse") knitr::kable(xgboost_best_params) ``` ```{r} xgboost_model_final <- xgboost_model |> finalize_model(xgboost_best_params) xgboost_model_final ``` ```{r} (train_processed <- bake(nla_recipe, new_data = nla_train)) ``` ```{r} train_prediction <- xgboost_model_final |> # fit the model on all the training data fit( formula = nla_formula, data = train_processed ) |> # predict the sale prices for the training data predict(new_data = train_processed) |> bind_cols(nla_train |> mutate(.obs = log10(cyanophyta))) xgboost_score_train <- train_prediction |> yardstick::metrics(.obs, .pred) |> mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) knitr::kable(xgboost_score_train) ``` ```{r} train_prediction |> ggplot(aes(.pred, .obs)) + geom_point() + geom_smooth(method = "lm") ``` ```{r} test_processed <- bake(nla_recipe, new_data = nla_test) test_prediction <- xgboost_model_final |> # fit the model on all the training data fit( formula = nla_formula, data = train_processed ) |> # use the training model fit to predict the test data predict(new_data = test_processed) |> bind_cols(nla_test |> mutate(.obs = log10(cyanophyta))) # measure the accuracy of our model using `yardstick` xgboost_score <- test_prediction |> yardstick::metrics(.obs, .pred) |> mutate(.estimate = format(round(.estimate, 2), big.mark = ",")) knitr::kable(xgboost_score) ``` ```{r} cyanophyta_prediction_residual <- test_prediction |> arrange(.pred) %>% mutate(residual_pct = (.obs - .pred) / .pred) |> select(.pred, residual_pct) cyanophyta_prediction_residual |> ggplot(aes(x = .pred, y = residual_pct)) + geom_point() + xlab("Predicted Cyanophyta") + ylab("Residual (%)") ``` ```{r} test_prediction |> ggplot(aes(.pred, .obs)) + geom_point() + geom_smooth(method = "lm", colour = "black") ```