This commit is contained in:
2026-06-04 15:55:13 +08:00
parent 7aeb071fba
commit 028ef1c471
6 changed files with 1889 additions and 1 deletions
+617
View File
@@ -0,0 +1,617 @@
# 实践部分
## Kriging
### Code
```{r}
#| echo: true
#| eval: false
#| out-width: 50%
require(sf)
require(sp)
require(gstat)
require(raster)
require(ggplot2)
# 读取太湖边界
lake_boundary <- st_read("../../data/taihu.shp") |>
sf::st_make_valid()
main_water <- st_difference(
lake_boundary[lake_boundary$Name == "boundary", ],
st_union(lake_boundary[lake_boundary$Name != "boundary", ])
)
get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
insf <- indf |>
sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
dplyr::select(-c("long", "lat")) |>
dplyr::rename(Y = tidyselect::all_of(Yvarname))
if (is.null(grid)) {
# 1. 将sf边界转为terra格式
v <- terra::vect(boundsf)
# 2. 创建基础网格
base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
# 3. 创建高分辨率区域(例如边界附近)
buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
# 4. 合并网格
final_grid <- merge(base_grid, hi_res_grid)
# 5. 转为点并裁剪
grid <- terra::as.points(final_grid) |>
sf::st_as_sf() |>
sf::st_filter(main_water)
}
insp <- sf::as_Spatial(insf)
fit <- automap::autofitVariogram(Y ~ 1, insp)
# 克里金插值
m <- gstat::gstat(
formula = Y ~ 1,
data = insp,
model = fit$var_model
)
predsf <- predict(m, grid) |>
sf::st_as_sf()
outsf <- predsf |>
st_coordinates() |>
as.data.frame() |>
cbind(pred = predsf$var1.pred)
return(outsf)
}
wqdf <- readxl::read_xlsx("../data/wqdata.xlsx")
if (FALSE) {
wqsf <- wqdf |>
nest(datedf = -date) |>
dplyr::mutate(
krigingsf = purrr::map(datedf, \(x) {
get_kriging(x, "conc", boundsf = main_water)
})
)
saveRDS(wqsf, "./wqsf.rds")
}
wqsf <- readRDS("./wqsf.rds")
wqsf |>
unnest(krigingsf) |>
ggplot(aes(X, Y)) +
geom_contour_filled(aes(z = pred), bins = 20) +
geom_sf(
data = main_water,
aes(x = NULL, y = NULL),
fill = NA,
colour = "black",
linewidth = 1
) +
# scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
# ggsci::scale_fill_aaas() +
coord_sf() +
theme_void()
```
### Output
```{r}
#| echo: false
#| out-width: 80%
#| message: false
#| fig-width: 6
#| fig-height: 3
require(sf)
require(sp)
require(gstat)
require(raster)
require(ggplot2)
# 读取太湖边界
lake_boundary <- st_read("../../data/taihu.shp", quiet = TRUE) |>
sf::st_make_valid()
main_water <- st_difference(
lake_boundary[lake_boundary$Name == "boundary", ],
st_union(lake_boundary[lake_boundary$Name != "boundary", ])
)
get_kriging <- function(indf, Yvarname, grid = NULL, boundsf = main_water) {
insf <- indf |>
sfext::df_to_sf(crs = 4326, coords = c("long", "lat")) |>
dplyr::select(-c("long", "lat")) |>
dplyr::rename(Y = tidyselect::all_of(Yvarname))
if (is.null(grid)) {
# 1. 将sf边界转为terra格式
v <- terra::vect(boundsf)
# 2. 创建基础网格
base_grid <- terra::rast(v, resolution = 0.002) # 基础低分辨率
# 3. 创建高分辨率区域(例如边界附近)
buffer_zone <- buffer(v, width = 0.005) # 边界附近创建缓冲区
hi_res_grid <- terra::rast(buffer_zone, resolution = 0.001)
# 4. 合并网格
final_grid <- merge(base_grid, hi_res_grid)
# 5. 转为点并裁剪
grid <- terra::as.points(final_grid) |>
sf::st_as_sf() |>
sf::st_filter(main_water)
}
insp <- sf::as_Spatial(insf)
fit <- automap::autofitVariogram(Y ~ 1, insp)
# 克里金插值
m <- gstat::gstat(
formula = Y ~ 1,
data = insp,
model = fit$var_model
)
predsf <- predict(m, grid) |>
sf::st_as_sf()
outsf <- predsf |>
st_coordinates() |>
as.data.frame() |>
cbind(pred = predsf$var1.pred)
return(outsf)
}
wqdf <- readxl::read_xlsx("../../data/wqdata.xlsx")
if (FALSE) {
wqsf <- wqdf |>
nest(datedf = -date) |>
dplyr::mutate(
krigingsf = purrr::map(datedf, \(x) {
get_kriging(x, "conc", boundsf = main_water)
})
)
saveRDS(wqsf, "./wqsf.rds")
}
wqsf <- readRDS("./wqsf.rds")
wqsf |>
unnest(krigingsf) |>
ggplot(aes(X, Y)) +
geom_contour_filled(aes(z = pred), bins = 20) +
geom_sf(
data = main_water,
aes(x = NULL, y = NULL),
fill = NA,
colour = "black",
linewidth = 1
) +
# scale_fill_gradientn(colors = hcl.colors(100, "RdYlBu")) + # 设置颜色渐变
# ggsci::scale_fill_aaas() +
coord_sf() +
theme_void()
```
:::
## 数据
```{r}
require(tidyverse)
sitedf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-01/nla2007_sampledlakeinformation_20091113.csv"
) |>
select(
SITE_ID,
lon = LON_DD,
lat = LAT_DD,
name = LAKENAME,
area = LAKEAREA,
zmax = DEPTHMAX
) |>
group_by(SITE_ID) |>
summarize(
lon = mean(lon, na.rm = TRUE),
lat = mean(lat, na.rm = TRUE),
name = unique(name),
area = mean(area, na.rm = TRUE),
zmax = mean(zmax, na.rm = TRUE)
)
visitdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
select(SITE_ID, date = DATE_PROFILE, year = YEAR, visit = VISIT_NO) |>
distinct()
waterchemdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2013-09/nla2007_profile_20091008.csv"
) |>
select(
SITE_ID,
date = DATE_PROFILE,
depth = DEPTH,
temp = TEMP_FIELD,
do = DO_FIELD,
ph = PH_FIELD,
cond = COND_FIELD,
)
sddf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_secchi_20091008.csv"
) |>
select(
SITE_ID,
date = DATE_SECCHI,
sd = SECMEAN,
clear_to_bottom = CLEAR_TO_BOTTOM
)
trophicdf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_trophic_conditionestimate_20091123.csv"
) |>
select(SITE_ID, visit = VISIT_NO, tp = PTL, tn = NTL, chla = CHLA) |>
left_join(visitdf, by = c("SITE_ID", "visit")) |>
select(-year, -visit) |>
group_by(SITE_ID, date) |>
summarize(
tp = mean(tp, na.rm = TRUE),
tn = mean(tn, na.rm = TRUE),
chla = mean(chla, na.rm = TRUE)
)
phytodf <- readr::read_csv(
"https://www.epa.gov/sites/default/files/2014-10/nla2007_phytoplankton_softalgaecount_20091023.csv"
) |>
select(
SITE_ID,
date = DATEPHYT,
depth = SAMPLE_DEPTH,
phyta = DIVISION,
genus = GENUS,
species = SPECIES,
tax = TAXANAME,
abund = ABUND
) |>
mutate(phyta = gsub(" .*$", "", phyta)) |>
filter(!is.na(genus)) |>
group_by(SITE_ID, date, depth, phyta, genus) |>
summarize(abund = sum(abund, na.rm = TRUE)) |>
nest(phytodf = -c(SITE_ID, date))
envdf <- waterchemdf |>
filter(depth < 2) |>
select(-depth) |>
group_by(SITE_ID, date) |>
summarise_all(~ mean(., na.rm = TRUE)) |>
ungroup() |>
left_join(sddf, by = c("SITE_ID", "date")) |>
left_join(trophicdf, by = c("SITE_ID", "date"))
nla <- envdf |>
left_join(phytodf) |>
left_join(sitedf, by = "SITE_ID") |>
filter(!purrr::map_lgl(phytodf, is.null)) |>
mutate(
cyanophyta = purrr::map(
phytodf,
~ .x |>
dplyr::filter(phyta == "Cyanophyta") |>
summarize(cyanophyta = sum(abund, na.rm = TRUE))
)
) |>
unnest(cyanophyta) |>
select(-phyta) |>
mutate(clear_to_bottom = ifelse(is.na(clear_to_bottom), TRUE, FALSE))
# library(rmdify)
# library(dwfun)
# dwfun::init()
```
## 数据
```{r}
skimr::skim(nla)
```
## 简单模型
```{r}
nla |>
filter(tp > 1) |>
ggplot(aes(tn, tp)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
scale_y_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
)
m1 <- lm(log10(tp) ~ log10(tn), data = nla)
summary(m1)
```
## 复杂指标
```{r}
nla |>
filter(tp > 1) |>
ggplot(aes(tp, cyanophyta)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
) +
scale_y_log10(
breaks = scales::trans_breaks("log10", function(x) 10^x),
labels = scales::trans_format("log10", scales::math_format(10^.x))
)
m2 <- lm(log10(cyanophyta) ~ log10(tp), data = nla)
summary(m2)
```
## tidymodels - Data split
```{r}
require(tidymodels)
(nla_split <- rsample::initial_split(nla, prop = 0.7, strata = zmax))
(nla_train <- training(nla_split))
(nla_test <- testing(nla_split))
```
## tidymodels - recipe
```{r}
nla_formula <- as.formula(
"cyanophyta ~ temp + do + ph + cond + sd + tp + tn + chla + clear_to_bottom"
)
# nla_formula <- as.formula("cyanophyta ~ temp + do + ph + cond + sd + tp + tn")
nla_recipe <- recipes::recipe(nla_formula, data = nla_train) |>
recipes::step_string2factor(all_nominal()) |>
recipes::step_nzv(all_nominal()) |>
recipes::step_log(chla, cyanophyta, base = 10) |>
recipes::step_normalize(all_numeric_predictors()) |>
prep()
nla_recipe
```
## tidymodels - cross validation
```{r}
nla_cv <- recipes::bake(
nla_recipe,
new_data = training(nla_split)
) |>
rsample::vfold_cv(v = 10)
nla_cv
```
## tidymodels - Model specification
```{r}
xgboost_model <- parsnip::boost_tree(
mode = "regression",
trees = 1000,
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune()
) |>
set_engine("xgboost", objective = "reg:squarederror")
xgboost_model
```
## tidymodels - Grid specification
```{r}
# grid specification
xgboost_params <- dials::parameters(
min_n(),
tree_depth(),
learn_rate(),
loss_reduction()
)
xgboost_params
```
## tidymodels - Grid specification
```{r}
xgboost_grid <- dials::grid_max_entropy(
xgboost_params,
size = 60
)
knitr::kable(head(xgboost_grid))
```
## tidymodels - Workflow
```{r}
xgboost_wf <- workflows::workflow() |>
add_model(xgboost_model) |>
add_formula(nla_formula)
xgboost_wf
```
## tidymodels - Tune
```{r}
#| cache: true
# hyperparameter tuning
if (FALSE) {
xgboost_tuned <- tune::tune_grid(
object = xgboost_wf,
resamples = nla_cv,
grid = xgboost_grid,
metrics = yardstick::metric_set(rmse, rsq, mae),
control = tune::control_grid(verbose = TRUE)
)
saveRDS(xgboost_tuned, "./xgboost_tuned.RDS")
}
xgboost_tuned <- readRDS("./xgboost_tuned.RDS")
```
## tidymodels - Best model
```{r}
xgboost_tuned |>
tune::show_best(metric = "rmse") |>
knitr::kable()
```
## tidymodels - Best model
```{r}
xgboost_tuned |>
collect_metrics()
```
## tidymodels - Best model
```{r}
#| fig-width: 9
#| fig-height: 5
#| out-width: "100%"
xgboost_tuned |>
autoplot()
```
## tidymodels - Best model
```{r}
xgboost_best_params <- xgboost_tuned |>
tune::select_best(metric = "rmse")
knitr::kable(xgboost_best_params)
```
## tidymodels - Final model
```{r}
xgboost_model_final <- xgboost_model |>
finalize_model(xgboost_best_params)
xgboost_model_final
```
## tidymodels - Train evaluation
```{r}
(train_processed <- bake(nla_recipe, new_data = nla_train))
```
## tidymodels - Train data
```{r}
train_prediction <- xgboost_model_final |>
# fit the model on all the training data
fit(
formula = nla_formula,
data = train_processed
) |>
# predict the sale prices for the training data
predict(new_data = train_processed) |>
bind_cols(
nla_train |>
mutate(.obs = log10(cyanophyta))
)
xgboost_score_train <-
train_prediction |>
yardstick::metrics(.obs, .pred) |>
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
knitr::kable(xgboost_score_train)
```
## tidymodels - train evaluation
```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
train_prediction |>
ggplot(aes(.pred, .obs)) +
geom_point() +
geom_smooth(method = "lm")
```
## tidymodels - test data
```{r}
test_processed <- bake(nla_recipe, new_data = nla_test)
test_prediction <- xgboost_model_final |>
# fit the model on all the training data
fit(
formula = nla_formula,
data = train_processed
) |>
# use the training model fit to predict the test data
predict(new_data = test_processed) |>
bind_cols(
nla_test |>
mutate(.obs = log10(cyanophyta))
)
# measure the accuracy of our model using `yardstick`
xgboost_score <- test_prediction |>
yardstick::metrics(.obs, .pred) |>
mutate(.estimate = format(round(.estimate, 2), big.mark = ","))
knitr::kable(xgboost_score)
```
## tidymodels - evaluation
```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
cyanophyta_prediction_residual <- test_prediction |>
arrange(.pred) %>%
mutate(residual_pct = (.obs - .pred) / .pred) |>
select(.pred, residual_pct)
cyanophyta_prediction_residual |>
ggplot(aes(x = .pred, y = residual_pct)) +
geom_point() +
xlab("Predicted Cyanophyta") +
ylab("Residual (%)")
```
## tidymodels - test evaluation
```{r}
#| fig-width: 5
#| fig-height: 3
#| out-width: "80%"
test_prediction |>
ggplot(aes(.pred, .obs)) +
geom_point() +
geom_smooth(method = "lm", colour = "black")
```
-1
View File
@@ -274,7 +274,6 @@ p4 <- datadf |>
dplyr::filter(pH > 5) |>
ggplot(aes(x = month, y = pH)) +
geom_boxplot(aes(fill = as.factor(month)), colour = "black", size = 0.8) +
scale_x_continuous(breaks = 1:12, labels = month.abb) +
theme(legend.position = "none")
(p1 | p2) / (p3 | p4) + patchwork::plot_annotation(tag_levels = "A")
BIN
View File
Binary file not shown.
+1070
View File
File diff suppressed because one or more lines are too long
+202
View File
@@ -0,0 +1,202 @@
---
title: "L9 - Data Virtualization"
format: html
editor: visual
---
# Load Data
```{r}
# getwd()
# setwd("coding")
require(tidyverse)
datadf <- readRDS("../data/chinawq/datadf.rds")
```
# Data view
```{r}
skimr::skim(datadf)
head(datadf)
tail(datadf)
names(datadf)
summary(datadf)
str(datadf)
```
# Plot
```{r}
#| warning: false
#| message: false
# 每月NH4N与pH的相关性散点图
p <- datadf |>
dplyr::filter(NH4N < 100) |>
dplyr::filter(between(year(date), 2016, 2019)) |>
mutate(month = month(date)) |>
ggplot(aes(CODMn, NH4N)) +
geom_point(shape = 21, size = 0.8, fill = "orange") +
geom_smooth(method = "gam", color = "red") +
scale_x_log10() +
scale_y_log10() +
labs(
x = "COD<sub>Mn</sub> (mg L<sup>-1</sup>)",
y = "Ammonia (mg L<sup>-1</sup>)"
) +
facet_wrap(~month, scale = "free", ncol = 4) +
theme(
axis.title.x = ggtext::element_markdown(),
axis.title.y = ggtext::element_markdown()
)
print(p)
plotly::ggplotly(p)
```
```{r}
#| warning: false
#| message: false
# 每月NH4N与pH的相关性散点图
p <- datadf |>
dplyr::filter(NH4N < 100) |>
dplyr::filter(between(year(date), 2016, 2019)) |>
mutate(month = month(date)) |>
ggplot(aes(factor(month), NH4N)) +
geom_jitter(size = 0.1, colour = "gray95") +
geom_violin(fill = "orange", alpha = 0.3) +
scale_y_log10() +
labs(x = "Month", y = "Ammonia (mg L<sup>-1</sup>)") +
theme_classic() +
theme(axis.title.y = ggtext::element_markdown())
print(p)
plotly::ggplotly(p)
```
# Plotly
```{r}
p <- datadf |>
dplyr::filter(year(date) == 2018) |>
ggplot(aes(date, NH4N)) +
geom_point()
ggsave("L9-1.pdf", width = 4, height = 3)
# install.packages("plotly")
plotly::ggplotly(p)
```
# Map - sf
```{r}
# install.packages("sf")
# install.packages("sfext")
require(sf)
require(sfext)
chinawqsf <- sf::read_sf(
"../data/chinawq/Monitoring_sites/Monitoring_sites.shp"
)
chinawqsf |>
ggplot() +
geom_sf(shape = 21, size = 1, fill = "orange")
chinamapsf <- sf::read_sf("../data/中国省级地图GS20191719号.geojson")
ninelinesf <- sf::read_sf("../data/九段线GS20191719号.geojson")
chinamapsf <- sf::read_sf(
"https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS20191719号.geojson"
)
ninelinesf <- sf::read_sf(
"https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS20191719号.geojson"
)
chinacrs <- "+proj=laea +lat_0=40 +lon_0=104"
mapeR::get_chinacrs()
chinawqsf |>
ggplot() +
geom_sf(data = chinamapsf) +
geom_sf(data = ninelinesf) +
geom_sf(shape = 21, size = 1, fill = "orange") +
coord_sf(datum = chinacrs)
```
```{r}
require(mapeR)
map_chinese()
```
```{r}
chinamapsf <- sf::read_sf("../data/中国省级地图GS20191719号.geojson")
ninelinesf <- sf::read_sf("../data/九段线GS20191719号.geojson")
chinamapsf <- sf::read_sf(
"https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/中国省级地图GS20191719号.geojson"
)
ninelinesf <- sf::read_sf(
"https://git.drwater.net/course/su2026rwep/raw/branch/PUB/data/九段线GS20191719号.geojson"
)
chinacrs <- "+proj=laea +lat_0=40 +lon_0=104"
# 安装
install.packages("ggspatial")
datadf |>
sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |>
sf::st_transform(crs = chinacrs) |>
dplyr::filter(!is.na(CODMn)) |>
ggplot() +
geom_sf(data = chinamapsf, aes(fill = CNAME), alpha = 0.2) +
geom_sf(data = ninelinesf) +
geom_sf(aes(colour = log1p(CODMn))) +
labs(x = NULL, y = NULL, fill = NULL, colour = "CODMn") +
scale_colour_viridis_c() +
ggspatial::annotation_north_arrow(location = "tl") +
ggspatial::annotation_scale(location = "bl") +
theme(legend.position = "none")
mapeR::map_chinese
```
# 具体某个湖泊
```{r}
sf::read_sf("../data/taihu.shp") |>
ggplot() +
geom_sf()
sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml")
st_layers("~/Desktop/qingcaosha.kml")
sitesf <- sf::read_sf("~/Desktop/qingcaosha.kml")
boundsf <- sf::read_sf("~/Desktop/qingcaosha.kml", layer = "qingcaosha")
ggplot() +
geom_sf(data = boundsf) +
geom_sf(data = sitesf) +
geom_sf_text(data = sitesf, aes(label = Name), vjust = 1, size = 3)
```
Binary file not shown.