render compile

2026-05-21 01:33:30 +08:00
parent 2c2eaa0d0d
commit 204a12a58f
2379 changed files with 142545 additions and 814 deletions
@@ -0,0 +1,154 @@
+---
+title: "Lesson 7"
+format: html
+---
+
+
+```{r}
+library(tidyverse)
+
+# Option 1: list every file by hand.
+files <- c(
+  "../../data/01-sales.csv",
+  "../../data/02-sales.csv",
+  "../../data/03-sales.csv"
+)
+
+# Option 2 (replaces option 1): discover the files from the directory.
+# The dot is escaped and the pattern anchored so that only names ending in
+# "sales.csv" match (an unescaped "." matches any character).
+files <- list.files(
+  "../../data",
+  pattern = "sales\\.csv$",
+  full.names = TRUE
+)
+
+# read_csv() accepts a vector of paths and stacks the files; `id = "file"`
+# records the source path of each row, reduced here to the bare file name.
+readr::read_csv(files, id = "file") |>
+  mutate(file = basename(file))
+```
+
+
+```{r}
+library(tidyverse)
+
+# One workbook per year, named "<year>.xlsx". The dot is escaped so it only
+# matches a literal "." before the extension.
+files <- dir(
+  "../../data/gapminder",
+  pattern = "^[12][09][0-9][0-9]\\.xlsx$",
+  full.names = TRUE
+)
+
+# Read every workbook, tag its rows with the year parsed from the file name,
+# and stack the pieces once at the end instead of growing a tibble inside a
+# loop (which copies the accumulated data on every iteration).
+alldf <- files |>
+  purrr::map(
+    \(xlsx_path) {
+      readxl::read_xlsx(xlsx_path) |>
+        mutate(year = parse_number(basename(xlsx_path)))
+    }
+  ) |>
+  bind_rows()
+
+# GDP per capita (log10 axis) against life expectancy, one panel per year,
+# each with its own linear fit.
+alldf |>
+  ggplot(aes(x = lifeExp, y = gdpPercap)) +
+  geom_point(aes(color = factor(year))) +
+  geom_smooth(method = "lm", se = FALSE) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  facet_wrap(~year, ncol = 4, scales = "fixed")
+```
+
+# slope
+
+```{r}
+file <- files[1]
+
+# General shape of a linear-model call: lm(y ~ x, data = <data frame>).
+# Kept as a comment because running it literally fails: `y` and `x` are not
+# defined here and `data` resolves to the utils::data() function.
+
+m <- lm(log10(gdpPercap) ~ lifeExp, data = readxl::read_xlsx(file))
+
+summary(m)
+
+# Second coefficient = slope of log10(gdpPercap) against lifeExp.
+coef(m)[2]
+
+# One year and one slope per workbook, computed without growing vectors
+# element by element inside a loop.
+years <- parse_number(basename(files))
+slopes <- vapply(
+  files,
+  \(xlsx_path) {
+    fit <- lm(log10(gdpPercap) ~ lifeExp, data = readxl::read_xlsx(xlsx_path))
+    coef(fit)[[2]]
+  },
+  numeric(1),
+  USE.NAMES = FALSE
+)
+years
+slopes
+
+
+plot(years, slopes, type = "b")
+```
+
+# purrr
+
+
+
+```{r}
+library(tidyverse)
+
+# One row per yearly workbook. `data` stays nested so that every year keeps
+# exactly one fitted model and one slope; it is unnested only where row-level
+# observations are needed (the scatter plots below). Unnesting here, as the
+# chunk did before, made the later `unnest(data)` fail and repeated each
+# slope once per observation.
+df <- tibble(
+  filename = dir(
+    "../../data/gapminder",
+    pattern = "^[12][09][0-9][0-9]\\.xlsx$",
+    full.names = TRUE
+  )
+) |>
+  mutate(
+    data = purrr::map(filename, \(path) readxl::read_xlsx(path)),
+    year = parse_number(basename(filename)),
+    m = purrr::map(
+      data,
+      \(yeardf) lm(log10(gdpPercap) ~ lifeExp, data = yeardf)
+    ),
+    slope = purrr::map_dbl(m, \(fit) coef(fit)[[2]])
+  )
+
+# p-value of the slope and R squared of the first year's model.
+pvalue <- summary(df$m[[1]])$coefficients[2, 4]
+rsq <- summary(df$m[[1]])$r.squared
+
+# tidymodels attaches broom, which supplies the tidy() method for lm objects.
+library(tidymodels)
+generics::tidy(df$m[[1]])
+
+# Row-level scatter plot: unnest the yearly tables first.
+df |>
+  unnest(data) |>
+  ggplot(aes(x = lifeExp, y = gdpPercap)) +
+  geom_point(aes(color = factor(year))) +
+  geom_smooth(method = "lm", se = FALSE) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  facet_wrap(~year, ncol = 4, scales = "fixed")
+
+
+# Slope over time: one point per year because `df` is still nested.
+df |>
+  ggplot(aes(x = year, y = slope)) +
+  geom_line() +
+  geom_point()
+
+
+df
+df$slope[[1]]
+
+coef(df$m[[1]])[2]
+
+# General shape of a linear-model call: lm(y ~ x, data = <data frame>).
+# Kept as a comment because `y`, `x` and a data frame called `data` do not
+# exist in this session, so the literal call would stop the render.
+
+
+df |>
+  unnest(data) |>
+  ggplot(aes(x = lifeExp, y = gdpPercap)) +
+  geom_point(aes(color = factor(year))) +
+  geom_smooth(method = "lm", se = FALSE) +
+  scale_y_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  facet_wrap(~year, ncol = 4, scales = "fixed")
+```
+
+
+
+
@@ -0,0 +1 @@
+../../_extensions
@@ -0,0 +1,326 @@
+---
+title: "Data import"
+format:
+  dwsd-revealjs:
+    logo: _extensions/drwater/dwsd/inst/ucaslogo.png
+---
+
+
+```{r}
+#| echo: false
+source("../../coding/_common.R")
+library(tidyverse)
+```
+
+
+## tidyverse风格数据分析总体流程
+
+
+![](../../image/data-science/import.png) 
+
+
+## 导入csv数据
+
+```{r}
+# Print the file line by line to show the raw text of the CSV.
+cat(read_lines("../../data/students.csv"), sep = "\n")
+```
+
+## 导入csv数据
+
+
+```{r}
+# Parse the CSV and render the result as a formatted table.
+knitr::kable(read_csv("../../data/students.csv"))
+```
+
+
+## 读取数据
+
+```{r}
+# Read with readr's defaults, keep the result, and print it.
+students <- read_csv("../../data/students.csv")
+students
+```
+
+
+## 读取数据
+
+```{r}
+#| message: false
+# Read again, treating both "N/A" and empty cells as missing values.
+students <- read_csv("../../data/students.csv", na = c("N/A", ""))
+students
+```
+
+## 列名不要有空格
+
+```{r}
+# Replace the two column names that contain spaces with snake_case names.
+rename(
+  students,
+  student_id = `Student ID`,
+  full_name = `Full Name`
+)
+```
+
+## `janitor`处理空格
+
+```{r}
+#| message: false
+
+# Let janitor rewrite all column names in one step.
+janitor::clean_names(students)
+```
+
+## `janitor`处理空格
+
+```{r}
+# Clean the column names, then store meal_plan as a factor.
+mutate(
+  janitor::clean_names(students),
+  meal_plan = factor(meal_plan)
+)
+```
+
+## `janitor`处理空格
+
+```{r}
+# Keep the cleaned table: meal_plan becomes a factor, and the spelled-out
+# age "five" is replaced by "5" before age is parsed as a number.
+students <- mutate(
+  janitor::clean_names(students),
+  meal_plan = factor(meal_plan),
+  age = parse_number(if_else(age == "five", "5", age))
+)
+students
+```
+
+
+## 直接录入
+
+```{r}
+#| message: false
+
+# Inline CSV text: `skip = 2` drops the two metadata lines so that "x,y,z"
+# is read as the header.
+read_csv(
+  skip = 2,
+  file = "The first line of metadata
+  The second line of metadata
+  x,y,z
+  1,2,3"
+)
+```
+
+
+## 直接录入
+
+```{r}
+#| message: false
+# Inline CSV text: everything from "#" onwards is treated as a comment and
+# ignored by the parser.
+read_csv(
+  comment = "#",
+  file = "# A comment I want to skip
+  x,y,z
+  1,2,3"
+)
+```
+
+
+## 指定列名
+
+```{r}
+#| message: false
+
+# The text has no header row, so the column names are supplied explicitly
+# and the first line is read as data.
+read_csv(
+  col_names = c("x", "y", "z"),
+  file = "1,2,3
+  4,5,6"
+)
+```
+
+## 指定列的类型
+
+```{r}
+# Small inline CSV reused by both calls below.
+another_csv <- "
+x,y,z
+1,2,3"
+
+# Read every column as character.
+read_csv(another_csv, col_types = cols(.default = col_character()))
+
+# Read only column x (as character) and drop the other columns.
+read_csv(another_csv, col_types = cols_only(x = col_character()))
+```
+
+
+## 练习
+
+```{r}
+#| eval: false
+
+# Exercise inputs (this chunk is not evaluated): each call passes a small
+# inline CSV whose layout is worth inspecting before predicting the result.
+read_csv("a,b\n1,2,3\n4,5,6")
+read_csv("a,b,c\n1,2\n1,2,3,4")
+read_csv("a,b\n\"1")
+read_csv("a,b\n1,2\na,b")
+read_csv("a;b\n1;3")
+```
+
+## 练习
+
+```{r}
+#| eval: false
+# A tibble with non-syntactic column names: `1` holds 1..10 and `2` is twice
+# `1` plus standard-normal noise. Backticks are required to refer to them.
+annoying <- tibble(
+  `1` = seq_len(10),
+  `2` = 2 * `1` + rnorm(length(`1`))
+)
+```
+
+
+
+## 批量读取
+
+```{r}
+#| message: false
+
+# Paths of the three monthly sales files, listed by hand.
+sales_files <- c(
+  "../../data/01-sales.csv",
+  "../../data/02-sales.csv",
+  "../../data/03-sales.csv"
+)
+
+# A vector of paths is read into one table; `id = "file"` adds a column
+# holding the path each row came from.
+sales_files |>
+  read_csv(id = "file")
+```
+
+
+
+
+## 读取Excel，建议用`readxl`包
+
+```{r}
+# Read the workbook (first sheet by default), keep it, and print it.
+surveydf <- readxl::read_xlsx("../../data/survey.xlsx")
+surveydf
+```
+
+## 读取Excel
+
+```{r}
+# Read the second sheet of the workbook, keep it, and print it.
+airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2)
+airqualitydf
+```
+
+
+
+## 批量读取
+
+```{r}
+# Find every file in the data directory whose name ends in "sales.csv" and
+# return the matches with their directory prefix.
+sales_files <- dir(
+  path = "../../data",
+  pattern = "sales\\.csv$",
+  full.names = TRUE
+)
+sales_files
+```
+
+## 写入csv
+
+```{r}
+#| warning: false
+#| message: false
+# Show the cleaned table, write it to a CSV next to this document, and read
+# the file straight back to inspect the round trip.
+students
+students |> write_csv("students-2.csv")
+read_csv("students-2.csv")
+```
+
+## 写入Excel
+
+```{r}
+# Export the cleaned table as an Excel workbook.
+students |>
+  writexl::write_xlsx("../../data/writexldemo.xlsx")
+```
+
+## 读取数据库，以MySQL为例
+
+```{r}
+# Shown for illustration only: the `if (FALSE)` guard keeps this from running
+# when the slides are rendered.
+# NOTE(review): cctdb::get_dbconn() is a project helper; presumably it returns
+# a DBI connection to the named database - confirm.
+if (FALSE) {
+  conn <- cctdb::get_dbconn("nationalairquality")
+  DBI::dbListTables(conn)
+}
+```
+
+
+
+## 读取数据库，以MySQL为例
+
+```{r}
+# Disabled by default: pulls the first 100 rows of the `metadf` table from the
+# database and caches them as an RDS file, so the slides can be rendered from
+# the cached copy below.
+if (FALSE) {
+  conn <- cctdb::get_dbconn("nationalairquality")
+  metadf <- tbl(conn, "metadf") |>
+    head(100) |>
+    collect()
+  DBI::dbDisconnect(conn)
+  saveRDS(metadf, file = "../../data/metadfdemo.RDS")
+}
+metadf <- readRDS(file = "../../data/metadfdemo.RDS")
+lang <- "cn" # NOTE(review): presumably read by dwfun::theme_sci() - confirm
+# Area is mapped to `color`: the default point shape has no fill, so the
+# former `fill = Area` mapping left every point the same colour.
+metadf |>
+  ggplot(aes(lon, lat)) +
+  geom_point(aes(color = Area)) +
+  dwfun::theme_sci()
+```
+
+
+## 练习
+
+```{r}
+#| eval: false
+metadf <- readxl::read_xlsx("../../data/airquality.xlsx")
+
+# Create the output directory only when it is missing, so re-running the
+# chunk does not raise an "already exists" warning.
+if (!dir.exists("../../data/metacity2/")) {
+  dir.create("../../data/metacity2/")
+}
+
+# One nested table per site.
+sitenestdf <- metadf |>
+  nest(sitedf = -site)
+
+# Write one workbook per site. walk2() is used because only the side effect
+# (the written file) matters, not a return value stored in a column.
+purrr::walk2(
+  sitenestdf$site,
+  sitenestdf$sitedf,
+  \(site, sitedf) {
+    writexl::write_xlsx(sitedf, paste0("../../data/metacity2/", site, ".xlsx"))
+  }
+)
+```
+
+
+## 练习
+
+```{r}
+#| include: false
+#| eval: false
+# Data preparation for the exercise below; never run while rendering.
+# It writes one workbook with a sheet per Area, plus one workbook per Area.
+if (FALSE) {
+  library(tidyverse)
+  conn <- cctdb::get_dbconn("nationalairquality")
+  metadf <- tbl(conn, "metadf") |>
+    collect()
+  DBI::dbDisconnect(conn)
+  metanestdf <- metadf |>
+    nest(citydf = -Area)
+  # A named list of data frames is written as one sheet per list element.
+  names(metanestdf$citydf) <- metanestdf$Area
+  writexl::write_xlsx(metanestdf$citydf, path = "../../data/meta_city.xlsx")
+  # Create the output directory only when it is missing.
+  if (!dir.exists("../../data/metacity/")) {
+    dir.create("../../data/metacity/")
+  }
+  # Only the written files matter, so iterate with walk2() instead of
+  # storing map2() results in a throwaway column.
+  purrr::walk2(
+    metanestdf$Area,
+    metanestdf$citydf,
+    \(area, citydf) {
+      writexl::write_xlsx(
+        citydf,
+        path = paste0("../../data/metacity/", area, ".xlsx")
+      )
+    }
+  )
+}
+```
+
+1. 从“../../data/sales.xlsx”读取第9到13行的数据
+2. 从“../../data/meta_city.xlsx”读取所有的数据，并保存至“../../data/meta_city_onetable1.xlsx”
+3. 从“../../data/metacity/”读取所有的数据，并保存至“../../data/meta_city_onetable2.xlsx”
+
+
+
+## 欢迎讨论！{.center}
+
+
+```{r}
+#| results: 'asis'
+# NOTE(review): rmdify::slideend() is a project helper; presumably it emits
+# the closing slide as raw markup (the chunk uses results: 'asis') - confirm
+# the meaning of each flag against the package.
+rmdify::slideend(
+  wechat = FALSE,
+  type = "public",
+  tel = FALSE,
+  thislink = "../"
+)
+```
+
@@ -0,0 +1,7 @@
+student_id,full_name,favourite_food,meal_plan,age
+1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
+2,Barclay Lynn,French fries,Lunch only,5
+3,Jayendra Lyne,NA,Breakfast and lunch,7
+4,Leon Rossini,Anchovies,Lunch only,NA
+5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,5
+6,Güvenç Attila,Ice cream,Lunch only,6