618 lines
13 KiB
Plaintext
618 lines
13 KiB
Plaintext
# 实践部分
|
|
|
|
|
|
## Kriging
|
|
|
|
### Code
|
|
|
|
```{r}
|
|
#| echo: true
|
|
#| eval: false
|
|
#| out-width: 50%
|
|
require(sf)
|
|
require(sp)
|
|
require(gstat)
|
|
require(raster)
|
|
require(ggplot2)
|
|
# 读取太湖边界
|
|
lake_boundary <- st_read("../../data/taihu.shp") |>
|
|
sf::st_make_valid()
|
|
main_water <- st_difference(
|
|
lake_boundary[lake_boundary$Name == "boundary", ],
|
|
st_union(lake_boundary[lake_boundary$Name != "boundary", ])
|
|
)
|
|
|
|
get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
|
|
insf <- indf |>
|
|
sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
|
|
dplyr::select(-c("long", "lat")) |>
|
|
dplyr::rename(Y = tidyselect::all_of(Yvarname))
|
|
if (is.null(grid)) {
|
|
# 1. 将sf边界转为terra格式
|
|
v <- terra::vect(boundsf)
|
|
# 2. 创建基础网格
|
|
base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
|
|
# 3. 创建高分辨率区域(例如边界附近)
|
|
buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
|
|
hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
|
|
# 4. 合并网格
|
|
final_grid <- merge(base_grid, hi_res_grid)
|
|
# 5. 转为点并裁剪
|
|
grid <- terra::as.points(final_grid) |>
|
|
sf::st_as_sf() |>
|
|
sf::st_filter(main_water)
|
|
}
|
|
insp <- sf::as_Spatial(insf)
|
|
fit <- automap::autofitVariogram(Y ~ 1, insp)
|
|
# 克里金插值
|
|
m <- gstat::gstat(
|
|
formula = Y ~ 1,
|
|
data = insp,
|
|
model = fit$var_model
|
|
)
|
|
predsf <- predict(m, grid) |>
|
|
sf::st_as_sf()
|
|
outsf <- predsf |>
|
|
st_coordinates() |>
|
|
as.data.frame() |>
|
|
cbind(pred = predsf$var1.pred)
|
|
return(outsf)
|
|
}
|
|
|
|
|
|
wqdf <- readxl::read_xlsx("../data/wqdata.xlsx")
|
|
|
|
if (FALSE) {
|
|
wqsf <- wqdf |>
|
|
nest(datedf = -date) |>
|
|
dplyr::mutate(
|
|
krigingsf = purrr::map(datedf, \(x) {
|
|
get_kriging(x, "conc", boundsf = main_water)
|
|
})
|
|
)
|
|
saveRDS(wqsf, "./wqsf.rds")
|
|
}
|
|
|
|
wqsf <- readRDS("./wqsf.rds")
|
|
|
|
wqsf |>
|
|
unnest(krigingsf) |>
|
|
ggplot(aes(X, Y)) +
|
|
geom_contour_filled(aes(z = pred), bins = 20) +
|
|
geom_sf(
|
|
data = main_water,
|
|
aes(x = NULL, y = NULL),
|
|
fill = NA,
|
|
colour = "black",
|
|
linewidth = 1
|
|
) +
|
|
# scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
|
|
# ggsci::scale_fill_aaas() +
|
|
coord_sf() +
|
|
theme_void()
|
|
```
|
|
|
|
### Output
|
|
|
|
```{r}
|
|
#| echo: false
|
|
#| out-width: 80%
|
|
#| message: false
|
|
#| fig-width: 6
|
|
#| fig-height: 3
|
|
require(sf)
|
|
require(sp)
|
|
require(gstat)
|
|
require(raster)
|
|
require(ggplot2)
|
|
# 读取太湖边界
|
|
lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |>
|
|
sf::st_make_valid()
|
|
main_water <- st_difference(
|
|
lake_boundary[lake_boundary$Name == "boundary", ],
|
|
st_union(lake_boundary[lake_boundary$Name != "boundary", ])
|
|
)
|
|
|
|
get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
|
|
insf <- indf |>
|
|
sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
|
|
dplyr::select(-c("long", "lat")) |>
|
|
dplyr::rename(Y = tidyselect::all_of(Yvarname))
|
|
if (is.null(grid)) {
|
|
# 1. 将sf边界转为terra格式
|
|
v <- terra::vect(boundsf)
|
|
# 2. 创建基础网格
|
|
base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
|
|
# 3. 创建高分辨率区域(例如边界附近)
|
|
buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
|
|
hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
|
|
# 4. 合并网格
|
|
final_grid <- merge(base_grid, hi_res_grid)
|
|
# 5. 转为点并裁剪
|
|
grid <- terra::as.points(final_grid) |>
|
|
sf::st_as_sf() |>
|
|
sf::st_filter(main_water)
|
|
}
|
|
insp <- sf::as_Spatial(insf)
|
|
fit <- automap::autofitVariogram(Y ~ 1, insp)
|
|
# 克里金插值
|
|
m <- gstat::gstat(
|
|
formula = Y ~ 1,
|
|
data = insp,
|
|
model = fit$var_model
|
|
)
|
|
predsf <- predict(m, grid) |>
|
|
sf::st_as_sf()
|
|
outsf <- predsf |>
|
|
st_coordinates() |>
|
|
as.data.frame() |>
|
|
cbind(pred = predsf$var1.pred)
|
|
return(outsf)
|
|
}
|
|
|
|
|
|
wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx")
|
|
|
|
if (FALSE) {
|
|
wqsf <- wqdf |>
|
|
nest(datedf = -date) |>
|
|
dplyr::mutate(
|
|
krigingsf = purrr::map(datedf, \(x) {
|
|
get_kriging(x, "conc", boundsf = main_water)
|
|
})
|
|
)
|
|
saveRDS(wqsf, "./wqsf.rds")
|
|
}
|
|
wqsf <- readRDS("./wqsf.rds")
|
|
|
|
wqsf |>
|
|
unnest(krigingsf) |>
|
|
ggplot(aes(X, Y)) +
|
|
geom_contour_filled(aes(z = pred), bins = 20) +
|
|
geom_sf(
|
|
data = main_water,
|
|
aes(x = NULL, y = NULL),
|
|
fill = NA,
|
|
colour = "black",
|
|
linewidth = 1
|
|
) +
|
|
# scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
|
|
# ggsci::scale_fill_aaas() +
|
|
coord_sf() +
|
|
theme_void()
|
|
```
|
|
|
|
:::
|
|
|
|
|
|
## 数据
|
|
|
|
```{r}
|
|
require(tidyverse)
|
|
sitedf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
|
|
) |>
|
|
select(
|
|
SITE_ID,
|
|
lon = LON_DD,
|
|
lat = LAT_DD,
|
|
name = LAKENAME,
|
|
area = LAKEAREA,
|
|
zmax = DEPTHMAX
|
|
) |>
|
|
group_by(SITE_ID) |>
|
|
summarize(
|
|
lon = mean(lon, na.rm = TRUE),
|
|
lat = mean(lat, na.rm = TRUE),
|
|
name = unique(name),
|
|
area = mean(area, na.rm = TRUE),
|
|
zmax = mean(zmax, na.rm = TRUE)
|
|
)
|
|
|
|
|
|
visitdf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
|
|
) |>
|
|
select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
|
|
distinct()
|
|
|
|
|
|
waterchemdf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
|
|
) |>
|
|
select(
|
|
SITE_ID,
|
|
date = DATE_PROFILE,
|
|
depth = DEPTH,
|
|
temp = TEMP_FIELD,
|
|
do = DO_FIELD,
|
|
ph = PH_FIELD,
|
|
cond = COND_FIELD,
|
|
)
|
|
|
|
sddf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
|
|
) |>
|
|
select(
|
|
SITE_ID,
|
|
date = DATE_SECCHI,
|
|
sd = SECMEAN,
|
|
clear_to_bottom = CLEAR_TO_BOTTOM
|
|
)
|
|
|
|
trophicdf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
|
|
) |>
|
|
select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
|
|
left_join(visitdf, by = c("SITE_ID", "visit")) |>
|
|
select(-year, -visit) |>
|
|
group_by(SITE_ID, date) |>
|
|
summarize(
|
|
tp = mean(tp, na.rm = TRUE),
|
|
tn = mean(tn, na.rm = TRUE),
|
|
chla = mean(chla, na.rm = TRUE)
|
|
)
|
|
|
|
|
|
phytodf <- readr::read_csv(
|
|
"https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
|
|
) |>
|
|
select(
|
|
SITE_ID,
|
|
date = DATEPHYT,
|
|
depth = SAMPLE_DEPTH,
|
|
phyta = DIVISION,
|
|
genus = GENUS,
|
|
species = SPECIES,
|
|
tax = TAXANAME,
|
|
abund = ABUND
|
|
) |>
|
|
mutate(phyta = gsub(" .*$", "", phyta)) |>
|
|
filter(!is.na(genus)) |>
|
|
group_by(SITE_ID, date, depth, phyta, genus) |>
|
|
summarize(abund = sum(abund, na.rm = TRUE)) |>
|
|
nest(phytodf = -c(SITE_ID, date))
|
|
|
|
envdf <- waterchemdf |>
|
|
filter(depth < 2) |>
|
|
select(-depth) |>
|
|
group_by(SITE_ID, date) |>
|
|
summarise_all(~ mean(., na.rm = TRUE)) |>
|
|
ungroup() |>
|
|
left_join(sddf, by = c("SITE_ID", "date")) |>
|
|
left_join(trophicdf, by = c("SITE_ID", "date"))
|
|
|
|
nla <- envdf |>
|
|
left_join(phytodf) |>
|
|
left_join(sitedf, by = "SITE_ID") |>
|
|
filter(!purrr::map_lgl(phytodf, is.null)) |>
|
|
mutate(
|
|
cyanophyta = purrr::map(
|
|
phytodf,
|
|
~ .x |>
|
|
dplyr::filter(phyta == "Cyanophyta") |>
|
|
summarize(cyanophyta = sum(abund, na.rm = TRUE))
|
|
)
|
|
) |>
|
|
unnest(cyanophyta) |>
|
|
select(-phyta) |>
|
|
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
|
|
|
|
# library(rmdify)
|
|
# library(dwfun)
|
|
# dwfun::init()
|
|
```
|
|
|
|
|
|
## 数据
|
|
|
|
```{r}
|
|
skimr::skim(nla)
|
|
```
|
|
|
|
|
|
|
|
## 简单模型
|
|
|
|
```{r}
|
|
nla |>
|
|
filter(tp > 1) |>
|
|
ggplot(aes(tn, tp)) +
|
|
geom_point() +
|
|
geom_smooth(method = "lm") +
|
|
scale_x_log10(
|
|
breaks = scales::trans_breaks("log10", function(x) 10^x),
|
|
labels = scales::trans_format("log10", scales::math_format(10^.x))
|
|
) +
|
|
scale_y_log10(
|
|
breaks = scales::trans_breaks("log10", function(x) 10^x),
|
|
labels = scales::trans_format("log10", scales::math_format(10^.x))
|
|
)
|
|
|
|
m1 <- lm(log10(tp) ~ log10(tn), data = nla)
|
|
|
|
summary(m1)
|
|
```
|
|
|
|
## 复杂指标
|
|
|
|
```{r}
|
|
nla |>
|
|
filter(tp > 1) |>
|
|
ggplot(aes(tp, cyanophyta)) +
|
|
geom_point() +
|
|
geom_smooth(method = "lm") +
|
|
scale_x_log10(
|
|
breaks = scales::trans_breaks("log10", function(x) 10^x),
|
|
labels = scales::trans_format("log10", scales::math_format(10^.x))
|
|
) +
|
|
scale_y_log10(
|
|
breaks = scales::trans_breaks("log10", function(x) 10^x),
|
|
labels = scales::trans_format("log10", scales::math_format(10^.x))
|
|
)
|
|
|
|
m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
|
|
|
|
summary(m2)
|
|
```
|
|
|
|
|
|
|
|
|
|
## tidymodels - Data split
|
|
|
|
```{r}
|
|
require(tidymodels)
|
|
(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax))
|
|
(nla_train <- training(nla_split))
|
|
(nla_test <- testing(nla_split))
|
|
```
|
|
|
|
## tidymodels - recipe
|
|
|
|
```{r}
|
|
nla_formula <- as.formula(
|
|
"cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom"
|
|
)
|
|
# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
|
|
nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
|
|
recipes::step_string2factor(all_nominal()) |>
|
|
recipes::step_nzv(all_nominal()) |>
|
|
recipes::step_log(chla, cyanophyta, base = 10) |>
|
|
recipes::step_normalize(all_numeric_predictors()) |>
|
|
prep()
|
|
nla_recipe
|
|
```
|
|
|
|
## tidymodels - cross validation
|
|
|
|
```{r}
|
|
nla_cv <- recipes::bake(
|
|
nla_recipe,
|
|
new_data = training(nla_split)
|
|
) |>
|
|
rsample::vfold_cv(v = 10)
|
|
nla_cv
|
|
```
|
|
|
|
## tidymodels - Model specification
|
|
|
|
```{r}
|
|
xgboost_model <- parsnip::boost_tree(
|
|
mode = "regression",
|
|
trees = 1000,
|
|
min_n = tune(),
|
|
tree_depth = tune(),
|
|
learn_rate = tune(),
|
|
loss_reduction = tune()
|
|
) |>
|
|
set_engine("xgboost", objective = "reg:squarederror")
|
|
xgboost_model
|
|
```
|
|
|
|
|
|
## tidymodels - Grid specification
|
|
|
|
```{r}
|
|
# grid specification
|
|
xgboost_params <- dials::parameters(
|
|
min_n(),
|
|
tree_depth(),
|
|
learn_rate(),
|
|
loss_reduction()
|
|
)
|
|
xgboost_params
|
|
```
|
|
|
|
## tidymodels - Grid specification
|
|
|
|
```{r}
|
|
xgboost_grid <- dials::grid_max_entropy(
|
|
xgboost_params,
|
|
size = 60
|
|
)
|
|
knitr::kable(head(xgboost_grid))
|
|
```
|
|
|
|
## tidymodels - Workflow
|
|
|
|
```{r}
|
|
xgboost_wf <- workflows::workflow() |>
|
|
add_model(xgboost_model) |>
|
|
add_formula(nla_formula)
|
|
xgboost_wf
|
|
```
|
|
|
|
|
|
## tidymodels - Tune
|
|
|
|
```{r}
|
|
#| cache: true
|
|
# hyperparameter tuning
|
|
if (FALSE) {
|
|
xgboost_tuned <- tune::tune_grid(
|
|
object = xgboost_wf,
|
|
resamples = nla_cv,
|
|
grid = xgboost_grid,
|
|
metrics = yardstick::metric_set(rmse, rsq, mae),
|
|
control = tune::control_grid(verbose = TRUE)
|
|
)
|
|
saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
|
|
}
|
|
xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
|
|
```
|
|
|
|
## tidymodels - Best model
|
|
|
|
```{r}
|
|
xgboost_tuned |>
|
|
tune::show_best(metric = "rmse") |>
|
|
knitr::kable()
|
|
```
|
|
|
|
|
|
## tidymodels - Best model
|
|
|
|
```{r}
|
|
xgboost_tuned |>
|
|
collect_metrics()
|
|
```
|
|
|
|
|
|
## tidymodels - Best model
|
|
|
|
```{r}
|
|
#| fig-width: 9
|
|
#| fig-height: 5
|
|
#| out-width: "100%"
|
|
xgboost_tuned |>
|
|
autoplot()
|
|
```
|
|
|
|
|
|
## tidymodels - Best model
|
|
|
|
|
|
```{r}
|
|
xgboost_best_params <- xgboost_tuned |>
|
|
tune::select_best(metric = "rmse")
|
|
|
|
knitr::kable(xgboost_best_params)
|
|
```
|
|
|
|
|
|
## tidymodels - Final model
|
|
|
|
```{r}
|
|
xgboost_model_final <- xgboost_model |>
|
|
finalize_model(xgboost_best_params)
|
|
xgboost_model_final
|
|
```
|
|
|
|
|
|
## tidymodels - Train evaluation
|
|
|
|
|
|
```{r}
|
|
(train_processed <- bake(nla_recipe, new_data = nla_train))
|
|
```
|
|
|
|
## tidymodels - Train data
|
|
|
|
```{r}
|
|
train_prediction <- xgboost_model_final |>
|
|
# fit the model on all the training data
|
|
fit(
|
|
formula = nla_formula,
|
|
data = train_processed
|
|
) |>
|
|
# predict the sale prices for the training data
|
|
predict(new_data = train_processed) |>
|
|
bind_cols(
|
|
nla_train |>
|
|
mutate(.obs = log10(cyanophyta))
|
|
)
|
|
xgboost_score_train <-
|
|
train_prediction |>
|
|
yardstick::metrics(.obs, .pred) |>
|
|
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
|
knitr::kable(xgboost_score_train)
|
|
```
|
|
|
|
## tidymodels - train evaluation
|
|
|
|
```{r}
|
|
#| fig-width: 5
|
|
#| fig-height: 3
|
|
#| out-width: "80%"
|
|
train_prediction |>
|
|
ggplot(aes(.pred, .obs)) +
|
|
geom_point() +
|
|
geom_smooth(method = "lm")
|
|
```
|
|
|
|
|
|
## tidymodels - test data
|
|
|
|
|
|
```{r}
|
|
test_processed <- bake(nla_recipe, new_data = nla_test)
|
|
|
|
test_prediction <- xgboost_model_final |>
|
|
# fit the model on all the training data
|
|
fit(
|
|
formula = nla_formula,
|
|
data = train_processed
|
|
) |>
|
|
# use the training model fit to predict the test data
|
|
predict(new_data = test_processed) |>
|
|
bind_cols(
|
|
nla_test |>
|
|
mutate(.obs = log10(cyanophyta))
|
|
)
|
|
|
|
# measure the accuracy of our model using `yardstick`
|
|
xgboost_score <- test_prediction |>
|
|
yardstick::metrics(.obs, .pred) |>
|
|
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
|
|
|
|
knitr::kable(xgboost_score)
|
|
```
|
|
|
|
|
|
## tidymodels - evaluation
|
|
|
|
```{r}
|
|
#| fig-width: 5
|
|
#| fig-height: 3
|
|
#| out-width: "80%"
|
|
cyanophyta_prediction_residual <- test_prediction |>
|
|
arrange(.pred) %>%
|
|
mutate(residual_pct = (.obs - .pred) / .pred) |>
|
|
select(.pred, residual_pct)
|
|
|
|
cyanophyta_prediction_residual |>
|
|
ggplot(aes(x = .pred, y = residual_pct)) +
|
|
geom_point() +
|
|
xlab("Predicted Cyanophyta") +
|
|
ylab("Residual (%)")
|
|
```
|
|
|
|
|
|
|
|
|
|
## tidymodels - test evaluation
|
|
|
|
```{r}
|
|
#| fig-width: 5
|
|
#| fig-height: 3
|
|
#| out-width: "80%"
|
|
test_prediction |>
|
|
ggplot(aes(.pred, .obs)) +
|
|
geom_point() +
|
|
geom_smooth(method = "lm", colour = "black")
|
|
```
|
|
|
|
|
|
|