update
This commit is contained in:
parent
1532a8982d
commit
2db87d5e89
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,398 @@
|
|||
---
|
||||
title: "第10次课"
|
||||
editor: visual
|
||||
---
|
||||
|
||||
```{r}
|
||||
require(tidymodels)
|
||||
taxi
|
||||
set.seed(123)
|
||||
taxi_split <- initial_split(taxi, prop = 0.8)
|
||||
taxi_train <- training(taxi_split)
|
||||
taxi_test <- testing(taxi_split)
|
||||
|
||||
nrow(taxi_train)
|
||||
```
|
||||
```{r}
|
||||
tree_spec <-
|
||||
decision_tree(cost_complexity = 0.002) %>%
|
||||
set_mode("classification")
|
||||
|
||||
tree_fit <-
|
||||
workflow(tip ~ ., tree_spec) %>%
|
||||
fit(data = taxi_train)
|
||||
taxi_test |>
|
||||
bind_cols(
|
||||
predict(tree_fit, new_data = taxi_test))
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
require(magrittr)
|
||||
# %>%: magrittr
|
||||
# |>: base
|
||||
taxi_folds <- vfold_cv(taxi_train, v = 5)
|
||||
tree_spec <-
|
||||
decision_tree(cost_complexity = 0.002) |>
|
||||
set_mode("classification")
|
||||
taxi_wflow <- workflow() |>
|
||||
add_formula(tip ~ .) |>
|
||||
add_model(tree_spec)
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
|
||||
taxi_res
|
||||
taxi_metrics <- taxi_res |>
|
||||
collect_metrics()
|
||||
|
||||
|
||||
# Save the assessment set results
|
||||
ctrl_taxi <- control_resamples(save_pred = TRUE)
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
|
||||
|
||||
taxi_res |>
|
||||
collect_predictions()
|
||||
|
||||
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
|
||||
|
||||
taxi_preds |>
|
||||
group_by(id) |>
|
||||
taxi_metrics(truth = tip, estimate = .pred_class)
|
||||
|
||||
```
|
||||
|
||||
|
||||
## 数据
|
||||
|
||||
```{r}
|
||||
require(tidyverse)
|
||||
sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |>
|
||||
select(SITE_ID,
|
||||
lon = LON_DD,
|
||||
lat = LAT_DD,
|
||||
name = LAKENAME,
|
||||
area = LAKEAREA,
|
||||
zmax = DEPTHMAX
|
||||
) |>
|
||||
group_by(SITE_ID) |>
|
||||
summarize(lon = mean(lon, na.rm = TRUE),
|
||||
lat = mean(lat, na.rm = TRUE),
|
||||
name = unique(name),
|
||||
area = mean(area, na.rm = TRUE),
|
||||
zmax = mean(zmax, na.rm = TRUE))
|
||||
|
||||
|
||||
visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_PROFILE,
|
||||
year = YEAR,
|
||||
visit = VISIT_NO
|
||||
) |>
|
||||
distinct()
|
||||
|
||||
|
||||
|
||||
waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_PROFILE,
|
||||
depth = DEPTH,
|
||||
temp = TEMP_FIELD,
|
||||
do = DO_FIELD,
|
||||
ph = PH_FIELD,
|
||||
cond = COND_FIELD,
|
||||
)
|
||||
|
||||
sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_SECCHI,
|
||||
sd = SECMEAN,
|
||||
clear_to_bottom = CLEAR_TO_BOTTOM
|
||||
)
|
||||
|
||||
trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |>
|
||||
select(SITE_ID,
|
||||
visit = VISIT_NO,
|
||||
tp = PTL,
|
||||
tn = NTL,
|
||||
chla = CHLA) |>
|
||||
left_join(visitdf, by = c("SITE_ID", "visit")) |>
|
||||
select(-year, -visit) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarize(tp = mean(tp, na.rm = TRUE),
|
||||
tn = mean(tn, na.rm = TRUE),
|
||||
chla = mean(chla, na.rm = TRUE)
|
||||
)
|
||||
|
||||
|
||||
|
||||
phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATEPHYT,
|
||||
depth = SAMPLE_DEPTH,
|
||||
phyta = DIVISION,
|
||||
genus = GENUS,
|
||||
species = SPECIES,
|
||||
tax = TAXANAME,
|
||||
abund = ABUND) |>
|
||||
mutate(phyta = gsub(" .*$", "", phyta)) |>
|
||||
filter(!is.na(genus)) |>
|
||||
group_by(SITE_ID, date, depth, phyta, genus) |>
|
||||
summarize(abund = sum(abund, na.rm = TRUE)) |>
|
||||
nest(phytodf = -c(SITE_ID, date))
|
||||
|
||||
envdf <- waterchemdf |>
|
||||
filter(depth < 2) |>
|
||||
select(-depth) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarise_all(~mean(., na.rm = TRUE)) |>
|
||||
ungroup() |>
|
||||
left_join(sddf, by = c("SITE_ID", "date")) |>
|
||||
left_join(trophicdf, by = c("SITE_ID", "date"))
|
||||
|
||||
nla <- envdf |>
|
||||
left_join(phytodf) |>
|
||||
left_join(sitedf, by = "SITE_ID") |>
|
||||
filter(!purrr::map_lgl(phytodf, is.null)) |>
|
||||
mutate(cyanophyta = purrr::map(phytodf, ~ .x |>
|
||||
dplyr::filter(phyta == "Cyanophyta") |>
|
||||
summarize(cyanophyta = sum(abund, na.rm = TRUE))
|
||||
)) |>
|
||||
unnest(cyanophyta) |>
|
||||
select(-phyta) |>
|
||||
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
|
||||
|
||||
|
||||
# library(rmdify)
|
||||
# library(dwfun)
|
||||
# dwfun::init()
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
skimr::skim(nla)
|
||||
```
|
||||
```{r}
|
||||
nla |>
|
||||
filter(tp > 1) |>
|
||||
ggplot(aes(tn, tp)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm") +
|
||||
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
|
||||
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x)))
|
||||
|
||||
m1 <- lm(log10(tp) ~ log10(tn), data = nla)
|
||||
|
||||
summary(m1)
|
||||
```
|
||||
```{r}
|
||||
nla |>
|
||||
filter(tp > 1) |>
|
||||
ggplot(aes(tp, cyanophyta)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm") +
|
||||
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
|
||||
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x)))
|
||||
|
||||
|
||||
m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
|
||||
|
||||
summary(m2)
|
||||
|
||||
m3 <- glm(log10(cyanophyta) ~ log10(tp), data = nla)
|
||||
summary(m3)
|
||||
|
||||
require(mgcv)
|
||||
# 广义加性模型 General additive model
|
||||
m4 <- gam(log10(cyanophyta) ~ s(log10(tp)) + s(log10(tn)) + s(log10(chla)), data = nla)
|
||||
plot(m4)
|
||||
summary(m4)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
nla_split <- initial_split(nla, prop = 0.7, strata = zmax)
|
||||
nla_train <- training(nla_split)
|
||||
nla_test <- testing(nla_split)
|
||||
```
|
||||
|
||||
```{r}
|
||||
# xgboost, cyanophyta
|
||||
|
||||
nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom")
|
||||
# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
|
||||
nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
|
||||
recipes::step_string2factor(all_nominal()) |>
|
||||
recipes::step_nzv(all_nominal()) |>
|
||||
recipes::step_log(chla, cyanophyta, tn, tp, base = 10) |>
|
||||
recipes::step_normalize(all_numeric_predictors()) |>
|
||||
prep()
|
||||
nla_recipe
|
||||
```
|
||||
|
||||
```{r}
|
||||
nla_cv <- recipes::bake(
|
||||
nla_recipe,
|
||||
new_data = training(nla_split)
|
||||
) |>
|
||||
rsample::vfold_cv(v = 10)
|
||||
nla_cv
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_model <- parsnip::boost_tree(
|
||||
mode = "regression",
|
||||
trees = 1000,
|
||||
min_n = tune(),
|
||||
tree_depth = tune(),
|
||||
learn_rate = tune(),
|
||||
loss_reduction = tune()
|
||||
) |>
|
||||
set_engine("xgboost", objective = "reg:squarederror")
|
||||
xgboost_model
|
||||
```
|
||||
|
||||
```{r}
|
||||
# grid specification
|
||||
xgboost_params <- dials::parameters(
|
||||
min_n(),
|
||||
tree_depth(),
|
||||
learn_rate(),
|
||||
loss_reduction()
|
||||
)
|
||||
xgboost_params
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_grid <- dials::grid_max_entropy(
|
||||
xgboost_params,
|
||||
size = 60
|
||||
)
|
||||
knitr::kable(head(xgboost_grid))
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_wf <- workflows::workflow() |>
|
||||
add_model(xgboost_model) |>
|
||||
add_formula(nla_formula)
|
||||
xgboost_wf
|
||||
```
|
||||
|
||||
```{r}
|
||||
# hyperparameter tuning
|
||||
if (FALSE) {
|
||||
xgboost_tuned <- tune::tune_grid(
|
||||
object = xgboost_wf,
|
||||
resamples = nla_cv,
|
||||
grid = xgboost_grid,
|
||||
metrics = yardstick::metric_set(rmse, rsq, mae),
|
||||
control = tune::control_grid(verbose = TRUE)
|
||||
)
|
||||
saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
|
||||
}
|
||||
xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
tune::show_best(metric = "rmse") |>
|
||||
knitr::kable()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
collect_metrics()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
autoplot()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_best_params <- xgboost_tuned |>
|
||||
tune::select_best("rmse")
|
||||
|
||||
knitr::kable(xgboost_best_params)
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_model_final <- xgboost_model |>
|
||||
finalize_model(xgboost_best_params)
|
||||
xgboost_model_final
|
||||
```
|
||||
|
||||
```{r}
|
||||
(train_processed <- bake(nla_recipe, new_data = nla_train))
|
||||
```
|
||||
|
||||
```{r}
|
||||
train_prediction <- xgboost_model_final |>
|
||||
# fit the model on all the training data
|
||||
fit(
|
||||
formula = nla_formula,
|
||||
data = train_processed
|
||||
) |>
|
||||
# predict the sale prices for the training data
|
||||
predict(new_data = train_processed) |>
|
||||
bind_cols(nla_train |>
|
||||
mutate(.obs = log10(cyanophyta)))
|
||||
xgboost_score_train <-
|
||||
train_prediction |>
|
||||
yardstick::metrics(.obs, .pred) |>
|
||||
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
||||
knitr::kable(xgboost_score_train)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
train_prediction |>
|
||||
ggplot(aes(.pred, .obs)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
```
|
||||
|
||||
```{r}
|
||||
test_processed <- bake(nla_recipe, new_data = nla_test)
|
||||
|
||||
test_prediction <- xgboost_model_final |>
|
||||
# fit the model on all the training data
|
||||
fit(
|
||||
formula = nla_formula,
|
||||
data = train_processed
|
||||
) |>
|
||||
# use the training model fit to predict the test data
|
||||
predict(new_data = test_processed) |>
|
||||
bind_cols(nla_test |>
|
||||
mutate(.obs = log10(cyanophyta)))
|
||||
|
||||
# measure the accuracy of our model using `yardstick`
|
||||
xgboost_score <- test_prediction |>
|
||||
yardstick::metrics(.obs, .pred) |>
|
||||
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
||||
|
||||
knitr::kable(xgboost_score)
|
||||
```
|
||||
|
||||
```{r}
|
||||
cyanophyta_prediction_residual <- test_prediction |>
|
||||
arrange(.pred) %>%
|
||||
mutate(residual_pct = (.obs - .pred) / .pred) |>
|
||||
select(.pred, residual_pct)
|
||||
|
||||
cyanophyta_prediction_residual |>
|
||||
ggplot(aes(x = .pred, y = residual_pct)) +
|
||||
geom_point() +
|
||||
xlab("Predicted Cyanophyta") +
|
||||
ylab("Residual (%)")
|
||||
```
|
||||
|
||||
```{r}
|
||||
test_prediction |>
|
||||
ggplot(aes(.pred, .obs)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm", colour = "black")
|
||||
```
|
||||
|
|
@ -0,0 +1,401 @@
|
|||
---
|
||||
title: "第10次课"
|
||||
editor: visual
|
||||
---
|
||||
|
||||
```{r}
|
||||
require(tidymodels)
|
||||
taxi
|
||||
set.seed(123)
|
||||
taxi_split <- initial_split(taxi, prop = 0.8)
|
||||
taxi_train <- training(taxi_split)
|
||||
taxi_test <- testing(taxi_split)
|
||||
|
||||
nrow(taxi_train)
|
||||
```
|
||||
```{r}
|
||||
tree_spec <-
|
||||
decision_tree(cost_complexity = 0.002) %>%
|
||||
set_mode("classification")
|
||||
|
||||
tree_fit <-
|
||||
workflow(tip ~ ., tree_spec) %>%
|
||||
fit(data = taxi_train)
|
||||
taxi_test |>
|
||||
bind_cols(
|
||||
predict(tree_fit, new_data = taxi_test))
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
require(magrittr)
|
||||
# %>%: magrittr
|
||||
# |>: base
|
||||
taxi_folds <- vfold_cv(taxi_train, v = 5)
|
||||
tree_spec <-
|
||||
decision_tree(cost_complexity = 0.002) |>
|
||||
set_mode("classification")
|
||||
taxi_wflow <- workflow() |>
|
||||
add_formula(tip ~ .) |>
|
||||
add_model(tree_spec)
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds)
|
||||
taxi_res
|
||||
taxi_metrics <- taxi_res |>
|
||||
collect_metrics()
|
||||
|
||||
|
||||
# Save the assessment set results
|
||||
ctrl_taxi <- control_resamples(save_pred = TRUE)
|
||||
taxi_res <- fit_resamples(taxi_wflow, taxi_folds, control = ctrl_taxi)
|
||||
|
||||
taxi_res |>
|
||||
collect_predictions()
|
||||
|
||||
taxi_metrics <- metric_set(accuracy, specificity, sensitivity)
|
||||
|
||||
taxi_preds |>
|
||||
group_by(id) |>
|
||||
taxi_metrics(truth = tip, estimate = .pred_class)
|
||||
|
||||
```
|
||||
|
||||
|
||||
## 数据
|
||||
|
||||
```{r}
|
||||
require(tidyverse)
|
||||
sitedf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv") |>
|
||||
select(SITE_ID,
|
||||
lon = LON_DD,
|
||||
lat = LAT_DD,
|
||||
name = LAKENAME,
|
||||
area = LAKEAREA,
|
||||
zmax = DEPTHMAX
|
||||
) |>
|
||||
group_by(SITE_ID) |>
|
||||
summarize(lon = mean(lon, na.rm = TRUE),
|
||||
lat = mean(lat, na.rm = TRUE),
|
||||
name = unique(name),
|
||||
area = mean(area, na.rm = TRUE),
|
||||
zmax = mean(zmax, na.rm = TRUE))
|
||||
|
||||
|
||||
visitdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_PROFILE,
|
||||
year = YEAR,
|
||||
visit = VISIT_NO
|
||||
) |>
|
||||
distinct()
|
||||
|
||||
|
||||
|
||||
waterchemdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_PROFILE,
|
||||
depth = DEPTH,
|
||||
temp = TEMP_FIELD,
|
||||
do = DO_FIELD,
|
||||
ph = PH_FIELD,
|
||||
cond = COND_FIELD,
|
||||
)
|
||||
|
||||
sddf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATE_SECCHI,
|
||||
sd = SECMEAN,
|
||||
clear_to_bottom = CLEAR_TO_BOTTOM
|
||||
)
|
||||
|
||||
trophicdf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv") |>
|
||||
select(SITE_ID,
|
||||
visit = VISIT_NO,
|
||||
tp = PTL,
|
||||
tn = NTL,
|
||||
chla = CHLA) |>
|
||||
left_join(visitdf, by = c("SITE_ID", "visit")) |>
|
||||
select(-year, -visit) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarize(tp = mean(tp, na.rm = TRUE),
|
||||
tn = mean(tn, na.rm = TRUE),
|
||||
chla = mean(chla, na.rm = TRUE)
|
||||
)
|
||||
|
||||
|
||||
|
||||
phytodf <- readr::read_csv("https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv") |>
|
||||
select(SITE_ID,
|
||||
date = DATEPHYT,
|
||||
depth = SAMPLE_DEPTH,
|
||||
phyta = DIVISION,
|
||||
genus = GENUS,
|
||||
species = SPECIES,
|
||||
tax = TAXANAME,
|
||||
abund = ABUND) |>
|
||||
mutate(phyta = gsub(" .*$", "", phyta)) |>
|
||||
filter(!is.na(genus)) |>
|
||||
group_by(SITE_ID, date, depth, phyta, genus) |>
|
||||
summarize(abund = sum(abund, na.rm = TRUE)) |>
|
||||
nest(phytodf = -c(SITE_ID, date))
|
||||
|
||||
envdf <- waterchemdf |>
|
||||
filter(depth < 2) |>
|
||||
select(-depth) |>
|
||||
group_by(SITE_ID, date) |>
|
||||
summarise_all(~mean(., na.rm = TRUE)) |>
|
||||
ungroup() |>
|
||||
left_join(sddf, by = c("SITE_ID", "date")) |>
|
||||
left_join(trophicdf, by = c("SITE_ID", "date"))
|
||||
|
||||
nla <- envdf |>
|
||||
left_join(phytodf) |>
|
||||
left_join(sitedf, by = "SITE_ID") |>
|
||||
filter(!purrr::map_lgl(phytodf, is.null)) |>
|
||||
mutate(cyanophyta = purrr::map(phytodf, ~ .x |>
|
||||
dplyr::filter(phyta == "Cyanophyta") |>
|
||||
summarize(cyanophyta = sum(abund, na.rm = TRUE))
|
||||
)) |>
|
||||
unnest(cyanophyta) |>
|
||||
select(-phyta) |>
|
||||
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
|
||||
|
||||
|
||||
# library(rmdify)
|
||||
# library(dwfun)
|
||||
# dwfun::init()
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
skimr::skim(nla)
|
||||
```
|
||||
```{r}
|
||||
nla |>
|
||||
filter(tp > 1) |>
|
||||
ggplot(aes(tn, tp)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm") +
|
||||
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
|
||||
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x)))
|
||||
|
||||
m1 <- lm(log10(tp) ~ log10(tn), data = nla)
|
||||
|
||||
summary(m1)
|
||||
```
|
||||
```{r}
|
||||
nla |>
|
||||
filter(tp > 1) |>
|
||||
ggplot(aes(tp, cyanophyta)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm") +
|
||||
scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x))) +
|
||||
scale_y_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),
|
||||
labels = scales::trans_format("log10", scales::math_format(10^.x)))
|
||||
|
||||
|
||||
m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
|
||||
|
||||
summary(m2)
|
||||
|
||||
m3 <- glm(log10(cyanophyta) ~ log10(tp), data = nla)
|
||||
summary(m3)
|
||||
|
||||
require(mgcv)
|
||||
# 广义加性模型 General additive model
|
||||
m4 <- gam(log10(cyanophyta) ~ s(log10(tp)) + s(log10(tn)) + s(log10(chla)), data = nla)
|
||||
plot(m4)
|
||||
summary(m4)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
nla_split <- initial_split(nla, prop = 0.7, strata = zmax)
|
||||
nla_train <- training(nla_split)
|
||||
nla_test <- testing(nla_split)
|
||||
```
|
||||
|
||||
```{r}
|
||||
# xgboost, cyanophyta
|
||||
|
||||
nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom")
|
||||
# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
|
||||
nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
|
||||
recipes::step_string2factor(all_nominal()) |>
|
||||
recipes::step_nzv(all_nominal()) |>
|
||||
recipes::step_log(chla, cyanophyta, tn, tp, base = 10) |>
|
||||
recipes::step_normalize(all_numeric_predictors()) |>
|
||||
prep()
|
||||
nla_recipe
|
||||
```
|
||||
|
||||
```{r}
|
||||
nla_cv <- recipes::bake(
|
||||
nla_recipe,
|
||||
new_data = nla_train
|
||||
) |>
|
||||
rsample::vfold_cv(v = 10)
|
||||
nla_cv
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_model <- parsnip::boost_tree(
|
||||
mode = "regression",
|
||||
trees = 1000,
|
||||
min_n = tune(),
|
||||
tree_depth = tune(),
|
||||
learn_rate = tune(),
|
||||
loss_reduction = tune()
|
||||
) |>
|
||||
set_engine("xgboost", objective = "reg:squarederror")
|
||||
xgboost_model
|
||||
```
|
||||
|
||||
```{r}
|
||||
# grid specification
|
||||
xgboost_params <- dials::parameters(
|
||||
min_n(),
|
||||
tree_depth(),
|
||||
learn_rate(),
|
||||
loss_reduction()
|
||||
)
|
||||
xgboost_params
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_grid <- dials::grid_max_entropy(
|
||||
xgboost_params,
|
||||
size = 60
|
||||
)
|
||||
knitr::kable(head(xgboost_grid))
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_wf <- workflows::workflow() |>
|
||||
add_model(xgboost_model) |>
|
||||
add_formula(nla_formula)
|
||||
xgboost_wf
|
||||
```
|
||||
|
||||
```{r}
|
||||
|
||||
install.packages("xgboost")
|
||||
|
||||
# hyperparameter tuning
|
||||
if (FALSE) {
|
||||
xgboost_tuned <- tune::tune_grid(
|
||||
object = xgboost_wf,
|
||||
resamples = nla_cv,
|
||||
grid = xgboost_grid,
|
||||
metrics = yardstick::metric_set(rmse, rsq, mae),
|
||||
control = tune::control_grid(verbose = TRUE)
|
||||
)
|
||||
saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
|
||||
}
|
||||
xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
tune::show_best(metric = "rmse") |>
|
||||
knitr::kable()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
collect_metrics()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_tuned |>
|
||||
autoplot()
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_best_params <- xgboost_tuned |>
|
||||
tune::select_best("rmse")
|
||||
|
||||
knitr::kable(xgboost_best_params)
|
||||
```
|
||||
|
||||
```{r}
|
||||
xgboost_model_final <- xgboost_model |>
|
||||
finalize_model(xgboost_best_params)
|
||||
xgboost_model_final
|
||||
```
|
||||
|
||||
```{r}
|
||||
(train_processed <- bake(nla_recipe, new_data = nla_train))
|
||||
```
|
||||
|
||||
```{r}
|
||||
train_prediction <- xgboost_model_final |>
|
||||
# fit the model on all the training data
|
||||
fit(
|
||||
formula = nla_formula,
|
||||
data = train_processed
|
||||
) |>
|
||||
# predict the sale prices for the training data
|
||||
predict(new_data = train_processed) |>
|
||||
bind_cols(nla_train |>
|
||||
mutate(.obs = log10(cyanophyta)))
|
||||
xgboost_score_train <-
|
||||
train_prediction |>
|
||||
yardstick::metrics(.obs, .pred) |>
|
||||
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
||||
knitr::kable(xgboost_score_train)
|
||||
```
|
||||
|
||||
|
||||
```{r}
|
||||
train_prediction |>
|
||||
ggplot(aes(.pred, .obs)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm")
|
||||
```
|
||||
|
||||
```{r}
|
||||
test_processed <- bake(nla_recipe, new_data = nla_test)
|
||||
|
||||
test_prediction <- xgboost_model_final |>
|
||||
# fit the model on all the training data
|
||||
fit(
|
||||
formula = nla_formula,
|
||||
data = train_processed
|
||||
) |>
|
||||
# use the training model fit to predict the test data
|
||||
predict(new_data = test_processed) |>
|
||||
bind_cols(nla_test |>
|
||||
mutate(.obs = log10(cyanophyta)))
|
||||
|
||||
# measure the accuracy of our model using `yardstick`
|
||||
xgboost_score <- test_prediction |>
|
||||
yardstick::metrics(.obs, .pred) |>
|
||||
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
||||
|
||||
knitr::kable(xgboost_score)
|
||||
```
|
||||
|
||||
```{r}
|
||||
cyanophyta_prediction_residual <- test_prediction |>
|
||||
arrange(.pred) %>%
|
||||
mutate(residual_pct = (.obs - .pred) / .pred) |>
|
||||
select(.pred, residual_pct)
|
||||
|
||||
cyanophyta_prediction_residual |>
|
||||
ggplot(aes(x = .pred, y = residual_pct)) +
|
||||
geom_point() +
|
||||
xlab("Predicted Cyanophyta") +
|
||||
ylab("Residual (%)")
|
||||
```
|
||||
|
||||
```{r}
|
||||
test_prediction |>
|
||||
ggplot(aes(.pred, .obs)) +
|
||||
geom_point() +
|
||||
geom_smooth(method = "lm", colour = "black")
|
||||
```
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -19,4 +19,7 @@ subtitle: "课程简介"
|
|||
|
||||
- 网页公开:[https://drwater.rcees.ac.cn/course/public/RWEP/\@PUB/index.html](https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/index.html)
|
||||
- 课件代码:[https://drwater.rcees.ac.cn/git/course/RWEP.git](https://drwater.rcees.ac.cn/git/course/RWEP.git)
|
||||
- 代码web界面:[https://on.tty-share.com/s/2YElKFjzniilfg3vH-d1PUqqmz_MQzNki5dyxWWL7QONCX_77GzsdP9EEAgBQu3ONwM/](https://on.tty-share.com/s/2YElKFjzniilfg3vH-d1PUqqmz_MQzNki5dyxWWL7QONCX_77GzsdP9EEAgBQu3ONwM/)
|
||||
- 代码web界面:[https://on.tty-share.com/s/4hC-HxrUnzWmL4JnbopI4W-qrYU1oQgeYuvyJNaGLREcxl0_wbNAJ5UYRf8oEa0ayp4/](https://on.tty-share.com/s/4hC-HxrUnzWmL4JnbopI4W-qrYU1oQgeYuvyJNaGLREcxl0_wbNAJ5UYRf8oEa0ayp4/)
|
||||
|
||||
<!-- `tty-share --public -readonly -command tmux` -->
|
||||
|
||||
|
|
Loading…
Reference in New Issue