L9 - Data Visualization

Load Data

# getwd()
# setwd("coding")
# Attach the tidyverse with library(): unlike require(), it stops with an
# error when the package is not installed instead of returning FALSE.
library(tidyverse)
Loading required package: tidyverse
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.0     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the serialized data set; the path is relative to the working directory.
datadf <- readRDS(file = "../data/chinawq/datadf.rds")

Data view

# Per-column overview: missing counts, completeness and summary statistics.
datadf |> skimr::skim()
Data summary
Name datadf
Number of rows 75747
Number of columns 21
_______________________
Column type frequency:
Date 1
numeric 20
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 1980-01-07 2022-08-01 2014-06-16 1405

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lon 0 1.00 116.14 7.32 79.98 112.31 117.19 120.71 134.67 ▁▁▃▇▁
lat 0 1.00 32.90 7.37 16.51 28.65 32.66 38.32 53.48 ▃▆▇▅▁
NH4N 21245 0.72 0.76 4.78 0.00 0.15 0.26 0.49 1000.00 ▇▁▁▁▁
CODMn 21368 0.72 4.51 6.15 0.00 2.10 3.30 5.30 241.00 ▇▁▁▁▁
DO 2740 0.96 7.89 2.38 0.00 6.57 7.74 9.16 93.20 ▇▁▁▁▁
pH 1289 0.98 7.79 0.51 0.00 7.40 7.90 8.12 10.30 ▁▁▁▇▁
BOD 75315 0.01 1.79 2.81 0.10 0.60 1.02 2.00 41.30 ▇▁▁▁▁
TSSs 75418 0.00 472.39 3192.68 0.50 27.00 100.00 260.00 51800.00 ▇▁▁▁▁
TEMP 75227 0.01 17.61 7.76 0.10 11.20 17.80 24.33 31.80 ▂▇▇▇▆
DOC 75734 0.00 4.87 4.61 1.00 1.60 3.20 7.20 14.50 ▇▁▂▁▂
NO3N 75359 0.01 1.03 1.00 0.00 0.37 0.72 1.30 6.86 ▇▂▁▁▁
DIP 57308 0.24 0.01 0.02 0.00 0.00 0.01 0.02 0.64 ▇▁▁▁▁
NO2N 75413 0.00 0.04 0.06 0.00 0.00 0.01 0.04 0.60 ▇▁▁▁▁
TP 75551 0.00 0.60 2.34 0.00 0.01 0.05 0.18 25.61 ▇▁▁▁▁
DOSAT 75716 0.00 95.07 14.20 55.70 88.80 96.10 105.50 119.40 ▁▃▇▇▅
COD 56239 0.26 1.06 0.98 0.03 0.60 0.86 1.24 23.00 ▇▁▁▁▁
TDP 75731 0.00 0.01 0.01 0.00 0.01 0.01 0.02 0.03 ▁▇▃▁▃
TOC 75746 0.00 76.70 NA 76.70 76.70 76.70 76.70 76.70 ▁▁▇▁▁
TPH 58862 0.22 0.02 0.02 0.00 0.01 0.01 0.03 0.63 ▇▁▁▁▁
DIN 56378 0.26 0.29 0.40 0.00 0.05 0.14 0.36 8.17 ▇▁▁▁▁
# Show the first six rows.
head(datadf, n = 6L)
# A tibble: 6 × 21
    lon   lat date        NH4N CODMn    DO    pH   BOD  TSSs  TEMP   DOC  NO3N
  <dbl> <dbl> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  80.7  43.8 2012-05-28  0.07   2.9  6.04   7.9    NA    NA    NA    NA    NA
2  80.7  43.8 2012-06-04  0.04   2.7  6.62   7.8    NA    NA    NA    NA    NA
3  80.7  43.8 2012-06-11  0.02   1    6.39   7.8    NA    NA    NA    NA    NA
4  80.7  43.8 2012-06-18  1.33   1.3  5.6    7.7    NA    NA    NA    NA    NA
5  80.7  43.8 2012-06-25  2.66   3    4.63   7.7    NA    NA    NA    NA    NA
6  80.7  43.8 2012-07-02  0.05   6.9  5.04   7.9    NA    NA    NA    NA    NA
# ℹ 9 more variables: DIP <dbl>, NO2N <dbl>, TP <dbl>, DOSAT <dbl>, COD <dbl>,
#   TDP <dbl>, TOC <dbl>, TPH <dbl>, DIN <dbl>
# Show the last six rows.
tail(datadf, n = 6L)
# A tibble: 6 × 21
    lon   lat date        NH4N CODMn    DO    pH   BOD  TSSs  TEMP   DOC  NO3N
  <dbl> <dbl> <date>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1  122.  30.4 2022-07-01    NA    NA  6.71  8.01    NA    NA    NA    NA    NA
2  122.  30.4 2020-04-01    NA    NA  8.68  8.07    NA    NA    NA    NA    NA
3  122.  30.4 2020-07-01    NA    NA  6.7   8.03    NA    NA    NA    NA    NA
4  122.  30.4 2021-04-01    NA    NA  7.4   7.95    NA    NA    NA    NA    NA
5  122.  30.4 2020-10-01    NA    NA  7.67  8.07    NA    NA    NA    NA    NA
6  122.  30.4 2021-10-01    NA    NA  7.19  8.26    NA    NA    NA    NA    NA
# ℹ 9 more variables: DIP <dbl>, NO2N <dbl>, TP <dbl>, DOSAT <dbl>, COD <dbl>,
#   TDP <dbl>, TOC <dbl>, TPH <dbl>, DIN <dbl>
# List the column names.
colnames(datadf)
 [1] "lon"   "lat"   "date"  "NH4N"  "CODMn" "DO"    "pH"    "BOD"   "TSSs" 
[10] "TEMP"  "DOC"   "NO3N"  "DIP"   "NO2N"  "TP"    "DOSAT" "COD"   "TDP"  
[19] "TOC"   "TPH"   "DIN"  
# Base-R summary of every column (quartiles, mean, NA counts).
datadf |> summary()
      lon              lat             date                 NH4N         
 Min.   : 79.98   Min.   :16.51   Min.   :1980-01-07   Min.   :   0.000  
 1st Qu.:112.31   1st Qu.:28.65   1st Qu.:2011-06-13   1st Qu.:   0.150  
 Median :117.19   Median :32.66   Median :2014-06-16   Median :   0.260  
 Mean   :116.14   Mean   :32.90   Mean   :2014-05-23   Mean   :   0.761  
 3rd Qu.:120.71   3rd Qu.:38.32   3rd Qu.:2018-04-23   3rd Qu.:   0.490  
 Max.   :134.67   Max.   :53.48   Max.   :2022-08-01   Max.   :1000.000  
                                                       NA's   :21245     
     CODMn               DO               pH              BOD       
 Min.   :  0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.10  
 1st Qu.:  2.100   1st Qu.: 6.570   1st Qu.: 7.400   1st Qu.: 0.60  
 Median :  3.300   Median : 7.740   Median : 7.900   Median : 1.02  
 Mean   :  4.511   Mean   : 7.891   Mean   : 7.788   Mean   : 1.79  
 3rd Qu.:  5.300   3rd Qu.: 9.160   3rd Qu.: 8.120   3rd Qu.: 2.00  
 Max.   :241.000   Max.   :93.200   Max.   :10.300   Max.   :41.30  
 NA's   :21368     NA's   :2740     NA's   :1289     NA's   :75315  
      TSSs              TEMP            DOC             NO3N      
 Min.   :    0.5   Min.   : 0.10   Min.   : 1.00   Min.   :0.00   
 1st Qu.:   27.0   1st Qu.:11.20   1st Qu.: 1.60   1st Qu.:0.37   
 Median :  100.0   Median :17.80   Median : 3.20   Median :0.72   
 Mean   :  472.4   Mean   :17.61   Mean   : 4.87   Mean   :1.03   
 3rd Qu.:  260.0   3rd Qu.:24.32   3rd Qu.: 7.20   3rd Qu.:1.30   
 Max.   :51800.0   Max.   :31.80   Max.   :14.50   Max.   :6.86   
 NA's   :75418     NA's   :75227   NA's   :75734   NA's   :75359  
      DIP             NO2N             TP            DOSAT       
 Min.   :0.00    Min.   :0.00    Min.   : 0.00   Min.   : 55.70  
 1st Qu.:0.00    1st Qu.:0.00    1st Qu.: 0.01   1st Qu.: 88.80  
 Median :0.01    Median :0.01    Median : 0.05   Median : 96.10  
 Mean   :0.01    Mean   :0.04    Mean   : 0.60   Mean   : 95.07  
 3rd Qu.:0.02    3rd Qu.:0.04    3rd Qu.: 0.18   3rd Qu.:105.50  
 Max.   :0.64    Max.   :0.60    Max.   :25.61   Max.   :119.40  
 NA's   :57308   NA's   :75413   NA's   :75551   NA's   :75716   
      COD             TDP             TOC             TPH       
 Min.   : 0.03   Min.   :0.00    Min.   :76.7    Min.   :0.00   
 1st Qu.: 0.60   1st Qu.:0.01    1st Qu.:76.7    1st Qu.:0.01   
 Median : 0.86   Median :0.01    Median :76.7    Median :0.01   
 Mean   : 1.06   Mean   :0.01    Mean   :76.7    Mean   :0.02   
 3rd Qu.: 1.24   3rd Qu.:0.02    3rd Qu.:76.7    3rd Qu.:0.03   
 Max.   :23.00   Max.   :0.03    Max.   :76.7    Max.   :0.63   
 NA's   :56239   NA's   :75731   NA's   :75746   NA's   :58862  
      DIN       
 Min.   :0.00   
 1st Qu.:0.05   
 Median :0.14   
 Mean   :0.29   
 3rd Qu.:0.36   
 Max.   :8.17   
 NA's   :56378  
# Compact display of the column types and leading values.
datadf |> str()
tibble [75,747 × 21] (S3: tbl_df/tbl/data.frame)
 $ lon  : num [1:75747] 80.7 80.7 80.7 80.7 80.7 ...
 $ lat  : num [1:75747] 43.8 43.8 43.8 43.8 43.8 ...
 $ date : Date[1:75747], format: "2012-05-28" "2012-06-04" ...
 $ NH4N : num [1:75747] 0.07 0.04 0.02 1.33 2.66 0.05 0.06 0.07 0.01 0.07 ...
 $ CODMn: num [1:75747] 2.9 2.7 1 1.3 3 6.9 4.6 10.2 3.7 6.6 ...
 $ DO   : num [1:75747] 6.04 6.62 6.39 5.6 4.63 5.04 5.07 5.24 NA 6.67 ...
 $ pH   : num [1:75747] 7.9 7.8 7.8 7.7 7.7 7.9 8 7.9 8.1 8 ...
 $ BOD  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TSSs : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TEMP : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ DOC  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ NO3N : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ DIP  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ NO2N : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TP   : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ DOSAT: num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ COD  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TDP  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TOC  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ TPH  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...
 $ DIN  : num [1:75747] NA NA NA NA NA NA NA NA NA NA ...

Plot

# Monthly scatter plots of NH4N against CODMn for 2016-2019, drawn on
# log-log axes with a linear fit, one panel per calendar month.
# NOTE(review): the original comment described NH4N vs pH, but the code
# maps CODMn to the x axis -- confirm which variable is intended.
# NOTE(review): zero values, if present, become -Inf on the log axes --
# confirm whether they should be filtered out first.
p <- datadf |>
  # Keep NH4N readings below 100; this comparison also drops NA rows.
  dplyr::filter(NH4N < 100) |>
  dplyr::filter(between(year(date), 2016, 2019)) |>
  mutate(month = month(date)) |>
  ggplot(aes(CODMn, NH4N)) +
  geom_point(shape = 21, size = 0.8, fill = "orange") +
  geom_smooth(method = "lm") +
  scale_x_log10() +
  scale_y_log10() +
  # Spell out `scales` in full instead of relying on partial matching.
  facet_wrap(~month, scales = "free", ncol = 4)

# Render the static ggplot as an interactive plotly widget.
plotly::ggplotly(p)

Plotly

# Scatter plot of NH4N observations against date for the year 2018.
p <- datadf |>
  dplyr::filter(year(date) == 2018) |>
  ggplot(aes(date, NH4N)) +
  geom_point()

# Pass the plot explicitly so the saved figure does not depend on
# ggsave()'s implicit last_plot() default.
ggsave("L9-1.pdf", plot = p, width = 4, height = 3)
Warning: Removed 2667 rows containing missing values or values outside the scale range
(`geom_point()`).
# One-time setup if plotly is not installed: install.packages("plotly")
# Convert the ggplot object into an interactive plotly widget.
p |> plotly::ggplotly()