From 78196d49edd9a27de488005c5e36ad59f592cf5a Mon Sep 17 00:00:00 2001 From: ming Date: Mon, 17 Mar 2025 20:06:33 +0800 Subject: [PATCH] render compile --- .source_state | 2 +- SD/1_model/index.qmd | 17 +- SD/2_R语言语法基础/index.html | 1799 --------------------------- SD/3_datatransform/index.html | 1413 --------------------- SD/9_课后作业6/index.html | 655 ---------- SD/9_课后作业8/index.html | 638 ---------- SD/9_课后作业8/index.qmd | 4 +- data/writexldemo.xlsx | Bin 5362 -> 5360 bytes 8 files changed, 13 insertions(+), 4515 deletions(-) delete mode 100644 SD/2_R语言语法基础/index.html delete mode 100644 SD/3_datatransform/index.html delete mode 100644 SD/9_课后作业6/index.html delete mode 100644 SD/9_课后作业8/index.html diff --git a/.source_state b/.source_state index f115b0b..d2a74c4 100644 --- a/.source_state +++ b/.source_state @@ -1 +1 @@ -75ab32db1cd9222c3a3f959c30cd2363 +8a3f81986b4932245c23eee5be50040b diff --git a/SD/1_model/index.qmd b/SD/1_model/index.qmd index 091b6d1..7f33daa 100644 --- a/SD/1_model/index.qmd +++ b/SD/1_model/index.qmd @@ -197,16 +197,19 @@ taxi #| set.seed(123) library(forcats) -one_split <- slice(taxi, 1:30) %>% - initial_split() %>% - tidy() %>% - add_row(Row = 1:30, Data = "Original") %>% - mutate(Data = case_when( +require(tidymodels) +require(tidyverse) +one_split <- taxi |> + dplyr::slice(1:30) |> + rsample::initial_split() |> + generics::tidy() |> + tibble::add_row(Row = 1:30, Data = "Original") |> + dplyr::mutate(Data = case_when( Data == "Analysis" ~ "Training", Data == "Assessment" ~ "Testing", TRUE ~ Data - )) %>% - mutate(Data = factor(Data, levels = c("Original", "Training", "Testing"))) + )) |> + dplyr::mutate(Data = factor(Data, levels = c("Original", "Training", "Testing"))) all_split <- ggplot(one_split, aes(x = Row, y = fct_rev(Data), fill = Data)) + geom_tile(color = "white", diff --git a/SD/2_R语言语法基础/index.html b/SD/2_R语言语法基础/index.html deleted file mode 100644 index d9ef8b8..0000000 --- a/SD/2_R语言语法基础/index.html +++ /dev/null @@ -1,1799 +0,0 @@ - - - - - - - - - - - - - - Version: {{< var branch >}} – 二、R语言语法基础 - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

二、R语言语法基础

-

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

- -
-
-
-苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 -
-
-
- -

2025-03-17

-
-
-

数据类型

-

数值型

-

R中的数值型数据可以是整数或浮点数。

-
-
(x <- 10)
-
-
[1] 10
-
-
(y <- 1.23e-2)
-
-
[1] 0.0123
-
-
(z <- pi)
-
-
[1] 3.141593
-
-
-
-
-

数据类型

-

字符串

-
    -
  • R 中的字符串用引号括起来,建议用双引号
  • -
  • 中文编码主要有GBK编码和UTF-8编码, 可能遇到编码错误造成乱码。RStudio软件默认采用UTF-8编码,在R程序运行时字符串一般用UTF-8编码保存。
  • -
-
-
(str <- "Hello, World!")
-
-
[1] "Hello, World!"
-
-
(str <- 'Hello, World!')
-
-
[1] "Hello, World!"
-
-
(str <- 'He was very angry, and shouted: "Stop!"')
-
-
[1] "He was very angry, and shouted: \"Stop!\""
-
-
-
-
-

数据类型

-

逻辑

-
-
c(TRUE, FALSE)
-
-
[1]  TRUE FALSE
-
-
-
-
-

特殊值

-
-
-
    -
  • NA: 这是最常见的NA类型,表示缺失值
  • -
  • NA_integer_: 这是NA的整数类型
  • -
  • NA_real_: 这是NA的实数类型
  • -
  • NA_character_: 这是NA的字符类型
  • -
  • NA_complex_: 这是NA的复数类型
  • -
-
- -
-
-
pi
-
-
[1] 3.141593
-
-
NA
-
-
[1] NA
-
-
NA_character_
-
-
[1] NA
-
-
Inf
-
-
[1] Inf
-
-
-
-
-
-

特殊值

-

在 R 中,Inf 代表正无穷大(positive infinity),而 -Inf 则代表负无穷大(negative infinity)。这些值通常出现在数学计算中,例如除以零或对负数取对数等操作可能会导致无穷大的结果。

-
-
# 正无穷大
-(x <- Inf)
-
-
[1] Inf
-
-
# 负无穷大
-(y <- -Inf)
-
-
[1] -Inf
-
-
# 无穷大的运算
-(a <- 5 / 0)
-
-
[1] Inf
-
-
(b <- log(0))
-
-
[1] -Inf
-
-
-
-
-

变量赋值

-

在 R 中,可以使用 <-= 运算符将值赋给变量,建议用<-

-
-
# 使用 `<-` 运算符
-(x <- 10)
-
-
[1] 10
-
-
(y <- "hello")
-
-
[1] "hello"
-
-
# 使用 `=` 运算符
-(z = c(1, 2, 3))
-
-
[1] 1 2 3
-
-
-
-
-

变量赋值

-
-
# 向量赋值
-(vec <- c(1, 2, 3, 4, 5))
-
-
[1] 1 2 3 4 5
-
-
# 矩阵赋值
-(mat <- matrix(1:9, nrow = 3))
-
-
     [,1] [,2] [,3]
-[1,]    1    4    7
-[2,]    2    5    8
-[3,]    3    6    9
-
-
-
-
-

变量赋值

-

数据框赋值

-
-
(df <- data.frame(
-  Name = c("Alice", "Bob", "Charlie"),
-  Age = c(25, 30, 35),
-  Married = c(TRUE, FALSE, TRUE)
-))
-
-
     Name Age Married
-1   Alice  25    TRUE
-2     Bob  30   FALSE
-3 Charlie  35    TRUE
-
-
-
-
-

变量赋值

-

列表赋值

-
-
(lst <- list(
-  numbers = c(1, 2, 3),
-  strings = c("a", "b", "c"),
-  matrix = matrix(1:9, nrow = 3)
-))
-
-
$numbers
-[1] 1 2 3
-
-$strings
-[1] "a" "b" "c"
-
-$matrix
-     [,1] [,2] [,3]
-[1,]    1    4    7
-[2,]    2    5    8
-[3,]    3    6    9
-
-
-
-
-

数学函数

-
-
round(pi, digits = 3)
-
-
[1] 3.142
-
-
log(10)
-
-
[1] 2.302585
-
-
-
    -
  1. abs(x): 返回 x 的绝对值
  2. -
  3. sqrt(x): 返回 x 的平方根
  4. -
  5. exp(x): 以e为底的指数函数值
  6. -
  7. log(x, base): 以指定底数的对数函数的值,默认底数为e
  8. -
  9. log10(x): 10为底的对数值
  10. -
  11. log2(x): 2为底的对数值
  12. -
  13. floor(x): 不大于x的最大整数
  14. -
  15. ceiling(x): 不小于x的最小整数
  16. -
-
-
-

数学函数

-
    -
  1. sin(x), cos(x), tan(x): 返回 x 的正弦、余弦和正切值,其中 x 为弧度
  2. -
  3. asin(x), acos(x), atan(x): x 的反正弦、反余弦和反正切值,返回弧度
  4. -
  5. sinh(x), cosh(x), tanh(x): 返回 x 的双曲正弦、双曲余弦和双曲正切值
  6. -
  7. asinh(x), acosh(x), atanh(x): 反双曲正弦、反双曲余弦和反双曲正切值
  8. -
  9. round(x, digits): x 四舍五入,digits指定小数点后位数
  10. -
  11. trunc(x): 返回x截断值,即去掉小数部分
  12. -
  13. sign(x): 返回符号
  14. -
-
-
-

统计函数

-
-
x <- c(5, 10, 15, 20, 25)
-# 计算向量的平均值
-mean(x)
-
-
[1] 15
-
-
# 计算向量的中位数
-median(x)
-
-
[1] 15
-
-
# 计算向量的最小值
-min(x)
-
-
[1] 5
-
-
# 计算向量的最大值
-max(x)
-
-
[1] 25
-
-
# 计算向量的总和
-sum(x)
-
-
[1] 75
-
-
-
-
-

统计函数

-
-
# 计算向量的标准差
-sd(x)
-
-
[1] 7.905694
-
-
# 计算向量的方差
-var(x)
-
-
[1] 62.5
-
-
# 计算向量的分位数
-quantile(x, probs = c(0.25, 0.5, 0.75))
-
-
25% 50% 75% 
- 10  15  20 
-
-
# 统计向量的频数
-(frequency <- table(x))
-
-
x
- 5 10 15 20 25 
- 1  1  1  1  1 
-
-
-
-
-

统计函数

-

执行两样本或单样本 t 检验

-
-
y <- c(3, 8, 13, 18, 23)
-t.test(x, y)
-
-

-    Welch Two Sample t-test
-
-data:  x and y
-t = 0.4, df = 8, p-value = 0.6996
-alternative hypothesis: true difference in means is not equal to 0
-95 percent confidence interval:
- -9.530021 13.530021
-sample estimates:
-mean of x mean of y 
-       15        13 
-
-
-
-
-

统计函数

-

Wilcoxon-Mann-Whitney检验

-
-
wilcox.test(x, y)
-
-

-    Wilcoxon rank sum exact test
-
-data:  x and y
-W = 15, p-value = 0.6905
-alternative hypothesis: true location shift is not equal to 0
-
-
-
-
-

统计函数

-

创建向量的直方图

-
-
hist(x)
- -
-
-
-

函数调用-练习

-

题目:设有一组数据集合 x 包含了一些整数,请编写R语言代码计算并输出以下指标:

-
    -
  • 平均值(mean)
  • -
  • 中位数(median)
  • -
  • 最大值(maximum)
  • -
  • 最小值(minimum)
  • -
  • 数据集合中所有元素的和(sum)
  • -
  • 数据集合的标准差(standard deviation)
  • -
  • 数据集合 x 为:x <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
  • -
-

要求:使用R语言编写函数,输入参数为数据集合 x,输出为以上指标的值。

-
-
-

控制流程

-

if-else 语句

-
-
x <- 10
-
-if (x > 10) {
-  print("x 大于 10")
-} else {
-  print("x 不大于 10")
-}
-
-
[1] "x 不大于 10"
-
-
-
-
-

控制流程

-

for 循环

-
-
for (i in 1:5) {
-  print(i)
-}
-
-
[1] 1
-[1] 2
-[1] 3
-[1] 4
-[1] 5
-
-
-
-
-

自定义函数

-

定义函数

-

使用 function 关键字定义函数,并使用 return 关键字返回结果。

-
-
my_function <- function(x, y) {
-  return(x + y)
-}
-
-

调用函数

-
-
result <- my_function(3, 4)
-print(result)
-
-
[1] 7
-
-
-
-
-

数据结构

-

向量

-

向量是一维数组,可以包含相同类型的元素。

-
-
(v <- c(1, 2, 3, 4, 5))
-
-
[1] 1 2 3 4 5
-
-
-

列表

-

列表可以包含不同类型的元素。

-
-
(l <- list(a = 1, b = "hello", c = TRUE))
-
-
$a
-[1] 1
-
-$b
-[1] "hello"
-
-$c
-[1] TRUE
-
-
-
-
-

数值型向量

-

什么是数值型向量?

-
    -
  • 在 R 中,向量是一种基本的数据结构。
  • -
  • 数值型向量包含相同类型的数值元素。
  • -
-

创建数值型向量

-
-
# 使用 c() 函数创建数值型向量
-(numeric_vector <- c(1, 2, 3, 4, 5))
-
-
[1] 1 2 3 4 5
-
-
-
-
-

数值型向量

-

向量运算

-
-
# 创建两个数值型向量
-(vector1 <- c(1, 2, 3))
-
-
[1] 1 2 3
-
-
(vector2 <- c(4, 5, 6))
-
-
[1] 4 5 6
-
-
# 执行向量加法
-(result <- vector1 + vector2)
-
-
[1] 5 7 9
-
-
# 执行向量乘法
-(result <- vector1 * vector2)
-
-
[1]  4 10 18
-
-
-
-
-

向量运算

-

向量求和

-
-
# 创建数值型向量
-vector <- c(1, 2, 3, 4, 5)
-
-# 求和
-(sum_result <- sum(vector))
-
-
[1] 15
-
-
-
-
-

向量运算

-

向量平均值

-
-
# 创建数值型向量
-vector <- c(1, 2, 3, 4, 5)
-
-# 平均值
-(mean_result <- mean(vector))
-
-
[1] 3
-
-
-
-
-

运算-数值运算

-
    -
  • a 的平方。
  • -
  • b 的立方。
  • -
  • a 除以 b 的商和余数。
  • -
-

要求:使用R语言编写函数,输入参数为 a 和 b,输出为上述结果。

-
-
-

运算-逻辑运算

-
-
all(c(FALSE, 2, 1:3, 3) > 1)
-
-
[1] FALSE
-
-
any(c(FALSE, 2, 1:3, 3) > 1)
-
-
[1] TRUE
-
-
(flag1 <- FALSE)
-
-
[1] FALSE
-
-
(flag2 <- (3 > 2))
-
-
[1] TRUE
-
-
(flag3 <- TRUE * TRUE)
-
-
[1] 1
-
-
(flag4 <- TRUE * FALSE)
-
-
[1] 0
-
-
(flag5 <- TRUE & FALSE)
-
-
[1] FALSE
-
-
(flag6 <- TRUE | FALSE)
-
-
[1] TRUE
-
-
-
-
-

运算-逻辑运算

-
    -
  • which
  • -
-
-
which(c(FALSE, TRUE, TRUE, FALSE, NA))
-
-
[1] 2 3
-
-
which((11:15) > 12)
-
-
[1] 3 4 5
-
-
-
    -
  • identical
  • -
-
-
identical(c(1,2,3), c(1,2,NA))
-
-
[1] FALSE
-
-
identical(c(1L,2L,3L), c(1,2,3))
-
-
[1] FALSE
-
-
-
-
-

运算-字符型

-
    -
  • 特殊字符
  • -
-
-
c("abc", "", 'a cat', NA, '李明', "\n")
-
-
[1] "abc"   ""      "a cat" NA      "李明"  "\n"   
-
-
-
    -
  • paste
  • -
-
-
(users <- paste("ruser", 1:9))
-
-
[1] "ruser 1" "ruser 2" "ruser 3" "ruser 4" "ruser 5" "ruser 6" "ruser 7"
-[8] "ruser 8" "ruser 9"
-
-
paste(users, collapse = ", ")
-
-
[1] "ruser 1, ruser 2, ruser 3, ruser 4, ruser 5, ruser 6, ruser 7, ruser 8, ruser 9"
-
-
-
-
-

运算-字符型

-
    -
  • 大小写
  • -
-
-
letters[1:5]
-
-
[1] "a" "b" "c" "d" "e"
-
-
toupper(letters[6:9])
-
-
[1] "F" "G" "H" "I"
-
-
tolower(month.abb)
-
-
 [1] "jan" "feb" "mar" "apr" "may" "jun" "jul" "aug" "sep" "oct" "nov" "dec"
-
-
stringr::str_to_title(c("monday", "tuesday"))
-
-
[1] "Monday"  "Tuesday"
-
-
-
-
-

运算-字符型

-
    -
  • 字符串截取
  • -
-
-
substr("Monday", 1, 3)
-
-
[1] "Mon"
-
-
stringr::str_sub("Monday", 1, 3)
-
-
[1] "Mon"
-
-
-
-
-

运算-字符型

-
    -
  • 类型转换
  • -
-
-
100
-
-
[1] 100
-
-
as.character(100)
-
-
[1] "100"
-
-
as.numeric(c("0100", "0101"))
-
-
[1] 100 101
-
-
sprintf('renamedfile%03d.png', c(3, 99, 100))
-
-
[1] "renamedfile003.png" "renamedfile099.png" "renamedfile100.png"
-
-
-
-
-

运算-字符型

-
    -
  • 字符串替换
  • -
-
-
(mystr <- "He was wrong!")
-
-
[1] "He was wrong!"
-
-
gsub("wrong", "right", mystr)
-
-
[1] "He was right!"
-
-
-
-
-

索引

-

向量

-
-
# 创建一个向量
-vector <- c("apple", "banana", "cherry", "date")
-# 访问第三个元素
-vector[3]
-
-
[1] "cherry"
-
-
# 访问多个元素
-vector[c(2, 4)]
-
-
[1] "banana" "date"  
-
-
vector[c(2:4)]
-
-
[1] "banana" "cherry" "date"  
-
-
-
-
-

索引

-

向量

-
-
# 除了第2个元素
-vector[-2]
-
-
[1] "apple"  "cherry" "date"  
-
-
# 超界
-vector[100]
-
-
[1] NA
-
-
# 更新数据
-vector[7] <- "New Data"
-vector
-
-
[1] "apple"    "banana"   "cherry"   "date"     NA         NA         "New Data"
-
-
-
-
-

索引

-
-
(x <- 1:10)
-
-
 [1]  1  2  3  4  5  6  7  8  9 10
-
-
x[x > 6]
-
-
[1]  7  8  9 10
-
-
x[x < 3] <- 99
-x
-
-
 [1] 99 99  3  4  5  6  7  8  9 10
-
-
# which
-which(x > 10)
-
-
[1] 1 2
-
-
which.max(x)
-
-
[1] 1
-
-
which.min(x)
-
-
[1] 3
-
-
-
-
-

索引

-

列表

-
-
# 创建一个列表
-my_list <- list(fruit = c("apple", "banana", "cherry"),
-                numbers = c(1, 2, 3, 4, 5))
-
-# 访问列表中的第二个元素
-my_list[[2]]
-
-
[1] 1 2 3 4 5
-
-
-
-
-

索引

-

数据框

-
-
# 创建一个数据框
-df <- data.frame(fruit = c("apple", "banana", "cherry"),
-                 quantity = c(5, 7, 3))
-
-# 访问数据框中的第一个元素
-df[1, 1]
-
-
[1] "apple"
-
-
# 第2-3行
-df[2:3, ]
-
-
   fruit quantity
-2 banana        7
-3 cherry        3
-
-
-
-
-

日期和时间

-

base package

-
-
as.Date("2024-01-01")
-
-
[1] "2024-01-01"
-
-
as.POSIXct(1)
-
-
[1] "1970-01-01 08:00:01 CST"
-
-
as.Date(c("12/6/2022", "1/1/2023"), format="%m/%d/%Y")
-
-
[1] "2022-12-06" "2023-01-01"
-
-
-
-
-

日期和时间

-

lubridate package

-
-
lubridate::today()
-
-
[1] "2025-03-17"
-
-
require(lubridate)
-now()
-
-
[1] "2025-03-17 19:40:31 CST"
-
-
ymd(c(20200321, 240404, "20181231"))
-
-
[1] "2020-03-21" "2024-04-04" "2018-12-31"
-
-
mdy(c("3-10-1998", "01-17-2018", "Feb 3, 2024"))
-
-
[1] "1998-03-10" "2018-01-17" "2024-02-03"
-
-
ymd_hms("1998-03-16 13:15:45", tz = "Asia/Shanghai")
-
-
[1] "1998-03-16 13:15:45 CST"
-
-
-
-
-

日期和时间

-

lubridate package

-
-
make_date(2028, 1, 30)
-
-
[1] "2028-01-30"
-
-
as_date("2000-01-01")
-
-
[1] "2000-01-01"
-
-
as_datetime("2000-01-01", tz = "Asia/Shanghai")
-
-
[1] "2000-01-01 CST"
-
-
as_datetime("2024-02-01 8:00:00", tz = "Asia/Shanghai")
-
-
[1] "2024-02-01 08:00:00 CST"
-
-
-
-
-

日期和时间

-

lubridate package

-
-
year(today())
-
-
[1] 2025
-
-
wday(today())
-
-
[1] 2
-
-
hour(now())
-
-
[1] 19
-
-
-
-
-

日期和时间

-

lubridate package

-
-
(x <- now())
-
-
[1] "2025-03-17 19:40:31 CST"
-
-
floor_date(x, unit = "day")
-
-
[1] "2025-03-17 CST"
-
-
floor_date(x, unit = "hour")
-
-
[1] "2025-03-17 19:00:00 CST"
-
-
floor_date(x, unit = "10 minutes")
-
-
[1] "2025-03-17 19:40:00 CST"
-
-
ceiling_date(x, unit = "10 minutes")
-
-
[1] "2025-03-17 19:50:00 CST"
-
-
-
-
-

因子(factor)

-

Factor是什么?

-
    -
  • 在R中,Factor是用来表示分类数据的特殊数据类型。
  • -
  • 它将数据分成不同的水平(levels),每个水平代表了一个类别。
  • -
-
-
-

因子(factor)

-

创建Factor

-
-
# 创建一个Factor
-gender <- factor(c("Male", "Female", "Female", "Male"))
-# 查看Factor的水平
-levels(gender)
-
-
[1] "Female" "Male"  
-
-
# 改变Factor的水平顺序
-gender <- factor(gender, levels = c("Female", "Male"))
-summary(gender) # 使用Factor进行分组
-
-
Female   Male 
-     2      2 
-
-
as.numeric(gender) # 因子转换为纯粹的整数值
-
-
[1] 2 1 1 2
-
-
as.character(gender) # 转为字符
-
-
[1] "Male"   "Female" "Female" "Male"  
-
-
-
-
-

因子(factor)

-

Label of Factor

-
-
(x <- factor(1:12, label = month.abb))
-
-
 [1] Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
-Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
-
-
factor(x, levels = month.abb[c(2:12, 1)])
-
-
 [1] Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
-Levels: Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec Jan
-
-
-

分组

-
-
cut(1:20, breaks=c(0, 5, 10, 15, 18, 20))
-
-
 [1] (0,5]   (0,5]   (0,5]   (0,5]   (0,5]   (5,10]  (5,10]  (5,10]  (5,10] 
-[10] (5,10]  (10,15] (10,15] (10,15] (10,15] (10,15] (15,18] (15,18] (15,18]
-[19] (18,20] (18,20]
-Levels: (0,5] (5,10] (10,15] (15,18] (18,20]
-
-
-
-
-

矩阵

-
-
1:20
-
-
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
-
-
(A <- matrix(1:20, nrow = 4, byrow = TRUE))
-
-
     [,1] [,2] [,3] [,4] [,5]
-[1,]    1    2    3    4    5
-[2,]    6    7    8    9   10
-[3,]   11   12   13   14   15
-[4,]   16   17   18   19   20
-
-
(B <- matrix(1:20, nrow = 4, byrow = FALSE))
-
-
     [,1] [,2] [,3] [,4] [,5]
-[1,]    1    5    9   13   17
-[2,]    2    6   10   14   18
-[3,]    3    7   11   15   19
-[4,]    4    8   12   16   20
-
-
nrow(A)
-
-
[1] 4
-
-
ncol(B)
-
-
[1] 5
-
-
-
-
-

矩阵

-

高维矩阵

-
-
X <- array(1:12, dim = c(3, 2, 2))
-dim(C)
-
-
NULL
-
-
X[1, , ]
-
-
     [,1] [,2]
-[1,]    1    7
-[2,]    4   10
-
-
X[1, , 1]
-
-
[1] 1 4
-
-
-
-
-

矩阵

-

cbindrbind

-
-
cbind(X[1, , ], X[2, , ], X[3, , ])
-
-
     [,1] [,2] [,3] [,4] [,5] [,6]
-[1,]    1    7    2    8    3    9
-[2,]    4   10    5   11    6   12
-
-
rbind(X[1, , ], X[2, , ], X[3, , ])
-
-
     [,1] [,2]
-[1,]    1    7
-[2,]    4   10
-[3,]    2    8
-[4,]    5   11
-[5,]    3    9
-[6,]    6   12
-
-
cbind(c(1,2), c(3,4), c(5,6))
-
-
     [,1] [,2] [,3]
-[1,]    1    3    5
-[2,]    2    4    6
-
-
-
-
-

数据框(data frame)

-

最主要的数据形式。

-
-
# 创建数据框
-(df <- data.frame(
-  Name = c("Alice", "Bob", "Charlie"),
-  Age = c(25, 30, 35),
-  Married = c(TRUE, FALSE, TRUE)
-))
-
-
     Name Age Married
-1   Alice  25    TRUE
-2     Bob  30   FALSE
-3 Charlie  35    TRUE
-
-
names(df)
-
-
[1] "Name"    "Age"     "Married"
-
-
colnames(df)
-
-
[1] "Name"    "Age"     "Married"
-
-
ncol(df); nrow(df)
-
-
[1] 3
-
-
-
[1] 3
-
-
-
-
-

数据框(data frame)

-
-
df[1, 1]
-
-
[1] "Alice"
-
-
df[2, ]
-
-
  Name Age Married
-2  Bob  30   FALSE
-
-
df[, 1]
-
-
[1] "Alice"   "Bob"     "Charlie"
-
-
df$Age
-
-
[1] 25 30 35
-
-
df[["Age"]]
-
-
[1] 25 30 35
-
-
df[, "Age"]
-
-
[1] 25 30 35
-
-
-
-
-

数据框(data frame)

-
-
X <- matrix(1:9, nrow = 3)
-class(X)
-
-
[1] "matrix" "array" 
-
-
(Y <- as.data.frame(X))
-
-
  V1 V2 V3
-1  1  4  7
-2  2  5  8
-3  3  6  9
-
-
names(Y)
-
-
[1] "V1" "V2" "V3"
-
-
names(Y) <- c("colA", "colB", "colC")
-
-
-
-

欢迎讨论!

- 苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides

- - - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/SD/3_datatransform/index.html b/SD/3_datatransform/index.html deleted file mode 100644 index 7309321..0000000 --- a/SD/3_datatransform/index.html +++ /dev/null @@ -1,1413 +0,0 @@ - - - - - - - - - - - - - - Version: {{< var branch >}} – Data Transform - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

Data Transform

-

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

- -
-
-
-苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 -
-
-
- -

2025-03-17

-
-
-

tidyverse风格数据分析总体流程

- -
-
-

dplyr cheatsheet

- -
-
-

查看数据

-
-
flights
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

选择列

-
-
flights |> 
-  select(year, month, day)
-
-
-
-

选择列

-
-
flights |> 
-  select(year:day)
-
-
-
-

选择列

-
-
flights |> 
-select(3:5)
-
-
# A tibble: 336,776 × 3
-     day dep_time sched_dep_time
-   <int>    <int>          <int>
- 1     1      517            515
- 2     1      533            529
- 3     1      542            540
- 4     1      544            545
- 5     1      554            600
- 6     1      554            558
- 7     1      555            600
- 8     1      557            600
- 9     1      557            600
-10     1      558            600
-# ℹ 336,766 more rows
-
-
-
-
-

选择列

-
-
flights |> 
-  select(!year:day)
-
-
# A tibble: 336,776 × 16
-   dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-      <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
- 1      517            515         2      830            819        11 UA     
- 2      533            529         4      850            830        20 UA     
- 3      542            540         2      923            850        33 AA     
- 4      544            545        -1     1004           1022       -18 B6     
- 5      554            600        -6      812            837       -25 DL     
- 6      554            558        -4      740            728        12 UA     
- 7      555            600        -5      913            854        19 B6     
- 8      557            600        -3      709            723       -14 EV     
- 9      557            600        -3      838            846        -8 B6     
-10      558            600        -2      753            745         8 AA     
-# ℹ 336,766 more rows
-# ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
-#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

选择列

-
-
flights |> 
-  select(-(year:day))
-
-
# A tibble: 336,776 × 16
-   dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier
-      <int>          <int>     <dbl>    <int>          <int>     <dbl> <chr>  
- 1      517            515         2      830            819        11 UA     
- 2      533            529         4      850            830        20 UA     
- 3      542            540         2      923            850        33 AA     
- 4      544            545        -1     1004           1022       -18 B6     
- 5      554            600        -6      812            837       -25 DL     
- 6      554            558        -4      740            728        12 UA     
- 7      555            600        -5      913            854        19 B6     
- 8      557            600        -3      709            723       -14 EV     
- 9      557            600        -3      838            846        -8 B6     
-10      558            600        -2      753            745         8 AA     
-# ℹ 336,766 more rows
-# ℹ 9 more variables: flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
-#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

选择列

-
-
flights |> 
-  select(where(is.character))
-
-
# A tibble: 336,776 × 4
-   carrier tailnum origin dest 
-   <chr>   <chr>   <chr>  <chr>
- 1 UA      N14228  EWR    IAH  
- 2 UA      N24211  LGA    IAH  
- 3 AA      N619AA  JFK    MIA  
- 4 B6      N804JB  JFK    BQN  
- 5 DL      N668DN  LGA    ATL  
- 6 UA      N39463  EWR    ORD  
- 7 B6      N516JB  EWR    FLL  
- 8 EV      N829AS  LGA    IAD  
- 9 B6      N593JB  JFK    MCO  
-10 AA      N3ALAA  LGA    ORD  
-# ℹ 336,766 more rows
-
-
-
-
-

选择列

-
-
flights |> 
-  select(!where(is.character)) |>
-  select(contains("_"))
-
-
# A tibble: 336,776 × 8
-   dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay air_time
-      <int>          <int>     <dbl>    <int>          <int>     <dbl>    <dbl>
- 1      517            515         2      830            819        11      227
- 2      533            529         4      850            830        20      227
- 3      542            540         2      923            850        33      160
- 4      544            545        -1     1004           1022       -18      183
- 5      554            600        -6      812            837       -25      116
- 6      554            558        -4      740            728        12      150
- 7      555            600        -5      913            854        19      158
- 8      557            600        -3      709            723       -14       53
- 9      557            600        -3      838            846        -8      140
-10      558            600        -2      753            745         8      138
-# ℹ 336,766 more rows
-# ℹ 1 more variable: time_hour <dttm>
-
-
-
-
-

选择列

-
-
flights |> 
-  select(tail_num = tailnum)
-
-
# A tibble: 336,776 × 1
-   tail_num
-   <chr>   
- 1 N14228  
- 2 N24211  
- 3 N619AA  
- 4 N804JB  
- 5 N668DN  
- 6 N39463  
- 7 N516JB  
- 8 N829AS  
- 9 N593JB  
-10 N3ALAA  
-# ℹ 336,766 more rows
-
-
-
-
-

选择列

-
-
flights |> 
-  select(air_time, everything())
-
-
# A tibble: 336,776 × 19
-   air_time  year month   day dep_time sched_dep_time dep_delay arr_time
-      <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
- 1      227  2013     1     1      517            515         2      830
- 2      227  2013     1     1      533            529         4      850
- 3      160  2013     1     1      542            540         2      923
- 4      183  2013     1     1      544            545        -1     1004
- 5      116  2013     1     1      554            600        -6      812
- 6      150  2013     1     1      554            558        -4      740
- 7      158  2013     1     1      555            600        -5      913
- 8       53  2013     1     1      557            600        -3      709
- 9      140  2013     1     1      557            600        -3      838
-10      138  2013     1     1      558            600        -2      753
-# ℹ 336,766 more rows
-# ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
-#   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

重命名

-
-
flights |> 
-  rename(tail_num = tailnum)
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tail_num <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

重命名

-
-
flights |> 
-  rename(年份 = 1) |>
-  rename(月份 = 2)
-
-
# A tibble: 336,776 × 19
-    年份  月份   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

重命名

-
-
flights |> select(1:4) |> head(n = 3)
-
-
# A tibble: 3 × 4
-   year month   day dep_time
-  <int> <int> <int>    <int>
-1  2013     1     1      517
-2  2013     1     1      533
-3  2013     1     1      542
-
-
# 重命名
-flights |> select(1:4) |> head(n = 3) |>
-  rename_all(~c("c1", "c2", "c3", "c4"))
-
-
# A tibble: 3 × 4
-     c1    c2    c3    c4
-  <int> <int> <int> <int>
-1  2013     1     1   517
-2  2013     1     1   533
-3  2013     1     1   542
-
-
-
-
-

重命名

-
-
flights |> select(1:4) |> head(n = 3)
-
-
# A tibble: 3 × 4
-   year month   day dep_time
-  <int> <int> <int>    <int>
-1  2013     1     1      517
-2  2013     1     1      533
-3  2013     1     1      542
-
-
# 重命名
-flights |> select(1:4) |> head(n = 3) |>
-  rename_all(toupper)
-
-
# A tibble: 3 × 4
-   YEAR MONTH   DAY DEP_TIME
-  <int> <int> <int>    <int>
-1  2013     1     1      517
-2  2013     1     1      533
-3  2013     1     1      542
-
-
-
-
-

重命名

-
-
flights |> select(1:4) |> head(n = 3)
-
-
# A tibble: 3 × 4
-   year month   day dep_time
-  <int> <int> <int>    <int>
-1  2013     1     1      517
-2  2013     1     1      533
-3  2013     1     1      542
-
-
# 重命名
-flights |> select(1:4) |> head(n = 3) |>
-  rename_all(~paste0(toupper(.), "_NEW"))
-
-
# A tibble: 3 × 4
-  YEAR_NEW MONTH_NEW DAY_NEW DEP_TIME_NEW
-     <int>     <int>   <int>        <int>
-1     2013         1       1          517
-2     2013         1       1          533
-3     2013         1       1          542
-
-
-
-
-

练习

-

将含有下划线的列名中的下划线去掉。

-
-
flights |> select(1:4) |> head(n = 3)
-
-
# A tibble: 3 × 4
-   year month   day dep_time
-  <int> <int> <int>    <int>
-1  2013     1     1      517
-2  2013     1     1      533
-3  2013     1     1      542
-
-
-
-
-

练习

-

airqualitydf中列名的单位信息去除(前5列)。

-
-
airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2)
-airqualitydf |> select(1:5)
-
-
# A tibble: 20,088 × 5
-   datetime            site  `CO_mg/m3` `CO_24h_mg/m3` `NO2_μg/m3`
-   <dttm>              <chr>      <dbl>          <dbl>       <dbl>
- 1 2024-03-19 01:00:00 1001A        0.1            0.4           5
- 2 2024-03-19 01:00:00 1003A        0.2            0.4           9
- 3 2024-03-19 01:00:00 1004A        0.2            0.4           4
- 4 2024-03-19 01:00:00 1005A        0.1            0.3           6
- 5 2024-03-19 01:00:00 1006A        0.1            0.4           5
- 6 2024-03-19 01:00:00 1007A        0.3            0.5           6
- 7 2024-03-19 01:00:00 1008A        0.2            0.4           2
- 8 2024-03-19 01:00:00 1009A        0.2            0.4           2
- 9 2024-03-19 01:00:00 1010A        0.1            0.3           2
-10 2024-03-19 01:00:00 1011A        0.2            0.4          12
-# ℹ 20,078 more rows
-
-
-
-
-

filter

-
-
flights |> 
-  filter(dep_delay > 120)
-
-
# A tibble: 9,723 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      848           1835       853     1001           1950
- 2  2013     1     1      957            733       144     1056            853
- 3  2013     1     1     1114            900       134     1447           1222
- 4  2013     1     1     1540           1338       122     2020           1825
- 5  2013     1     1     1815           1325       290     2120           1542
- 6  2013     1     1     1842           1422       260     1958           1535
- 7  2013     1     1     1856           1645       131     2212           2005
- 8  2013     1     1     1934           1725       129     2126           1855
- 9  2013     1     1     1938           1703       155     2109           1823
-10  2013     1     1     1942           1705       157     2124           1830
-# ℹ 9,713 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

filter 练习

-

Flights that departed on January 1.

-
-
-
# A tibble: 842 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 832 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

filter 练习

-

Select flights that departed in January or February

-
-
-
# A tibble: 51,955 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 51,945 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

filter 练习

-
-
jan1 <- flights |> 
-  filter(month == 1 & day == 1)
-
-
-
-

filter

-
-
flights |> 
-  filter(month = 1)
-
-
-
-

filter

-
-
flights |> 
-  filter(month == 1 | 2)
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

排序

-
-
flights |> 
-  arrange(year, month, day, dep_time)
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

排序

-
-
flights |> 
-  arrange(desc(dep_delay))
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     9      641            900      1301     1242           1530
- 2  2013     6    15     1432           1935      1137     1607           2120
- 3  2013     1    10     1121           1635      1126     1239           1810
- 4  2013     9    20     1139           1845      1014     1457           2210
- 5  2013     7    22      845           1600      1005     1044           1815
- 6  2013     4    10     1100           1900       960     1342           2211
- 7  2013     3    17     2321            810       911      135           1020
- 8  2013     6    27      959           1900       899     1236           2226
- 9  2013     7    22     2257            759       898      121           1026
-10  2013    12     5      756           1700       896     1058           2020
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

slice

-
-
flights |> head(n = 5)
-
-
# A tibble: 5 × 19
-   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
-1  2013     1     1      517            515         2      830            819
-2  2013     1     1      533            529         4      850            830
-3  2013     1     1      542            540         2      923            850
-4  2013     1     1      544            545        -1     1004           1022
-5  2013     1     1      554            600        -6      812            837
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
flights |> slice(1:5)
-
-
# A tibble: 5 × 19
-   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
-1  2013     1     1      517            515         2      830            819
-2  2013     1     1      533            529         4      850            830
-3  2013     1     1      542            540         2      923            850
-4  2013     1     1      544            545        -1     1004           1022
-5  2013     1     1      554            600        -6      812            837
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

slice

-
-
flights |> 
-  slice_max(dep_delay, n = 5)
-
-
# A tibble: 5 × 19
-   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
-1  2013     1     9      641            900      1301     1242           1530
-2  2013     6    15     1432           1935      1137     1607           2120
-3  2013     1    10     1121           1635      1126     1239           1810
-4  2013     9    20     1139           1845      1014     1457           2210
-5  2013     7    22      845           1600      1005     1044           1815
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

slice

-
-
flights |> 
-  slice_min(dep_delay, prop = 0.005)
-
-
# A tibble: 2,257 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013    12     7     2040           2123       -43       40           2352
- 2  2013     2     3     2022           2055       -33     2240           2338
- 3  2013    11    10     1408           1440       -32     1549           1559
- 4  2013     1    11     1900           1930       -30     2233           2243
- 5  2013     1    29     1703           1730       -27     1947           1957
- 6  2013     8     9      729            755       -26     1002            955
- 7  2013    10    23     1907           1932       -25     2143           2143
- 8  2013     3    30     2030           2055       -25     2213           2250
- 9  2013     3     2     1431           1455       -24     1601           1631
-10  2013     5     5      934            958       -24     1225           1309
-# ℹ 2,247 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

排序练习

-

根据origindestair_time倒序排序。

-
-
-
# A tibble: 336,776 × 19
-   origin dest  air_time  year month   day dep_time sched_dep_time dep_delay
-   <chr>  <chr>    <dbl> <int> <int> <int>    <int>          <int>     <dbl>
- 1 EWR    ALB         50  2013     5     5     1950           2000       -10
- 2 EWR    ALB         45  2013     1    13     1721           1619        62
- 3 EWR    ALB         43  2013     1    20     1623           1619         4
- 4 EWR    ALB         42  2013     4     1     1439           1340        59
- 5 EWR    ALB         41  2013    12     4     1316           1310         6
- 6 EWR    ALB         41  2013     2     1     2034           2000        34
- 7 EWR    ALB         41  2013     5     7     1956           2000        -4
- 8 EWR    ALB         38  2013     1    18     1824           1619       125
- 9 EWR    ALB         38  2013     1    28     1636           1619        17
-10 EWR    ALB         38  2013    11    10     2149           2159       -10
-# ℹ 336,766 more rows
-# ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
-#   carrier <chr>, flight <int>, tailnum <chr>, distance <dbl>, hour <dbl>,
-#   minute <dbl>, time_hour <dttm>
-
-
-
-
-

去重

-
-
# Remove duplicate rows, if any
-flights |> 
-  distinct()
-
-
# A tibble: 336,776 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 336,766 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

去重

-
-
# Find all unique origin and destination pairs
-flights |> 
-  distinct(origin, dest)
-
-
# A tibble: 224 × 2
-   origin dest 
-   <chr>  <chr>
- 1 EWR    IAH  
- 2 LGA    IAH  
- 3 JFK    MIA  
- 4 JFK    BQN  
- 5 LGA    ATL  
- 6 EWR    ORD  
- 7 EWR    FLL  
- 8 LGA    IAD  
- 9 JFK    MCO  
-10 LGA    ORD  
-# ℹ 214 more rows
-
-
-
-
-

去重

-
-
flights |> 
-  distinct(origin, dest, .keep_all = TRUE)
-
-
# A tibble: 224 × 19
-    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
-   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515         2      830            819
- 2  2013     1     1      533            529         4      850            830
- 3  2013     1     1      542            540         2      923            850
- 4  2013     1     1      544            545        -1     1004           1022
- 5  2013     1     1      554            600        -6      812            837
- 6  2013     1     1      554            558        -4      740            728
- 7  2013     1     1      555            600        -5      913            854
- 8  2013     1     1      557            600        -3      709            723
- 9  2013     1     1      557            600        -3      838            846
-10  2013     1     1      558            600        -2      753            745
-# ℹ 214 more rows
-# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
-#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
-#   hour <dbl>, minute <dbl>, time_hour <dttm>
-
-
-
-
-

欢迎讨论!

- 苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides

- - - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/SD/9_课后作业6/index.html b/SD/9_课后作业6/index.html deleted file mode 100644 index dc96458..0000000 --- a/SD/9_课后作业6/index.html +++ /dev/null @@ -1,655 +0,0 @@ - - - - - - - - - - - - - - Version: {{< var branch >}} – 课后作业6 - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

课后作业6

-

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

- -
-
-
-苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 -
-
-
- -

2025-03-17

-
-
-

第6次课后作业

-
    -
  1. 如何在 R 中将数字 10 赋值给变量 x?
  2. -
  3. 在 R 中,如何创建一个包含数字 1 到 5 的向量?
  4. -
  5. 编写一个条件语句,如果变量 age 大于等于 18,则打印 “成年人”,否则打印 “未成年人”。
  6. -
  7. 使用 for 循环打印从 1 到 10 的整数。
  8. -
  9. 编写一个名为 addition 的函数,接受两个参数 a 和 b,返回它们的和。
  10. -
  11. 创建一个列表,包含三个元素:一个数字向量、一个字符向量和一个逻辑向量。
  12. -
  13. 使用 read.csv() 函数读取名为 data.csv 的 CSV 文件,并将数据存储在一个名为 data 的数据框中。
  14. -
  15. 从数据框中选择前五行,并将结果存储在一个新的数据框中。
  16. -
  17. 将字符串 “hello world” 转换为大写。
  18. -
  19. 从数据框中选择 score 列大于等于 90 的行。
  20. -
-
-
-

data.csv内容

-
name,age,score
-Alice,25,85
-Bob,30,92
-Charlie,28,89
-David,22,95
-Eva,35,87
-Frank,27,91
-Grace,29,88
-Helen,26,93
-Ivan,31,86
-Jack,24,94
-Kelly,32,89
-Lily,28,90
-Mike,33,85
-Nancy,27,92
-Olivia,34,88
-Peter,29,93
-Queen,25,89
-Ryan,30,94
-Samantha,26,91
-Tom,31,87
-
-
-
-

欢迎讨论!

- 苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides

- - - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/SD/9_课后作业8/index.html b/SD/9_课后作业8/index.html deleted file mode 100644 index 9f57e70..0000000 --- a/SD/9_课后作业8/index.html +++ /dev/null @@ -1,638 +0,0 @@ - - - - - - - - - - - - - - Version: {{< var branch >}} – 课后作业8 - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

课后作业8

-

《区域水环境污染数据分析实践》
Data analysis practice of regional water environment pollution

- -
-
-
-苏命、王为东
中国科学院大学资源与环境学院
中国科学院生态环境研究中心 -
-
-
- -

2025-03-17

-
-
-

第8次课后作业

-
    -
  1. 根据airqualitydf.xlsx,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。
  2. -
  3. 按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。
  4. -
-

作业模板:第8次课后作业_模板.qmd

-
-
-

示例代码

-

基于R的示例结果

- -

基于SAS的示例结果

- -
-
-

欢迎讨论!

- 苏命|https://drwater.rcees.ac.cn; https://drwater.rcees.ac.cn/bcard; Slides

- - - -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/SD/9_课后作业8/index.qmd b/SD/9_课后作业8/index.qmd index 637df89..ffd30e0 100644 --- a/SD/9_课后作业8/index.qmd +++ b/SD/9_课后作业8/index.qmd @@ -43,7 +43,7 @@ require(learnr) 1. 根据`airqualitydf.xlsx`,按采样点统计白天(8:00-20:00)与夜晚(20:00-8:00)中空气质量指数(AQI)中位数,按城市统计低于所有采样点AQI30%分位值的采样点占比,列出上述占比最高的10个城市(不考虑采样点数低于5个的城市)。 2. 按照不同城市分组,统计白天与夜晚AQI中位数是否具有显著差异。 -作业模板:[第8次课后作业_模板.qmd](https://drwater.rcees.ac.cn/git/course/RWEP/raw/branch/main/SD/20240328_9_课后作业/第8次课后作业_模板.qmd) +作业模板:[第8次课后作业_模板.qmd](https://git.drwater.net/course/RWEP/raw/branch/main/SD/20240328_9_课后作业/第8次课后作业_模板.qmd) ## 示例代码 @@ -62,5 +62,5 @@ require(learnr) ## 欢迎讨论!{.center} -`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drwater.rcees.ac.cn/course/public/RWEP/@PUB/SD/")` +`r rmdify::slideend(wechat = FALSE, type = "public", tel = FALSE, thislink = "https://drc.drwater.net/course/public/RWEP/PUB/SD/")` diff --git a/data/writexldemo.xlsx b/data/writexldemo.xlsx index 8e99520b86f05b5440d19f509230e946bfb5d29b..b3d1110c4d84d2df8ebd162b0af86dbe2af54523 100644 GIT binary patch delta 338 zcmV-Y0j>V>Dex(2oCLh;a&+&UxihF*>{OThg7ebo8j&nV321JF)NPI4*UR()B|b1M zSZ6fUXb&DOnwyGS%8i3pXDqlN!6y-|AW+Yl_p*z+wY=CkO~QQR1(Scs)-f3ao! zo3((8Jio&V0uwASJOEQW5z!GT_=L1St{VY@W9UGE4jz+?;IlV?Q+~E#R4$xKhP{PZ z>q(s4J1-|=-}hNxj@Fo;;Lp{|+qh~f^>8WNw=z#1r6< k`}z|!f3x`zb^`^59~lQElXMhR0r!*16e9+l5&!@I08k~MO8@`> delta 340 zcmV-a0jvJ-De@_>vJVAM9;>w`leZ5Zf6Z>fFc5_AeTvBU#4(|?vXUUR6{jLqDpgT$ zmc0b4@sDh`fw!*{Ab`ZF$Mwv9JG*i@->EkF1?Q#F6(U)V642ZTshbLYtQP4bN_=2i zu-0g(&>lRR*LNkilp6i1h1E`@58F8)cvVApt=;CatACJSC{y*h=*&L56gJsjRf(u zkNS2&thkG$$X1J!ZJzlkTn mA@}tssQ&=7_Yigi1y3HUwI-8t6jTBElgAVz2A&cC0001lAfs9U