---
title: "Data import"
format:
  dwsd-revealjs:
    logo: _extensions/drwater/dwsd/inst/ucaslogo.png
---

```{r}
#| include: false
#| cache: false
# Hidden setup chunk: set the slide language and attach the packages.
lang <- "cn"
# library() stops with an error when the package is missing; require() would
# only return FALSE and let later chunks fail with unrelated errors.
library(tidyverse)
# learnr is not referenced by the visible chunks, so it stays a soft
# dependency (a warning instead of an error when it is not installed).
require(learnr)
```
```{r}
#| echo: false
# Run the shared course script, then make sure the tidyverse is attached.
source("../../coding/_common.R")
library(tidyverse)
```

## tidyverse风格数据分析总体流程

## 导入csv数据

```{r}
# Print the raw text of the CSV file, one line per row, before parsing it.
read_lines("../../data/students.csv") |>
  cat(sep = "\n")
```

## 导入csv数据

```{r}
# Parse the CSV file and render the result as a formatted table.
read_csv("../../data/students.csv") |>
  knitr::kable()
```

## 读取数据

```{r}
# The outer parentheses make the assignment print its value as well.
(students <- read_csv("../../data/students.csv"))
```

## 读取数据

```{r}
#| message: false
# Read the file again, treating both "N/A" and empty strings as missing.
(students <- read_csv("../../data/students.csv", na = c("N/A", "")))
```

## 列名不要有空格

```{r}
# Column names containing spaces must be quoted with backticks; rename them
# to snake_case so later code can refer to them directly.
students |>
  rename(
    student_id = `Student ID`,
    full_name = `Full Name`
  )
```

## `janitor`处理空格

```{r}
#| message: false
# Let janitor standardise all column names in a single call.
students |>
  janitor::clean_names()
```

## `janitor`处理空格

```{r}
# Clean the column names, then store meal_plan as a factor.
students |>
  janitor::clean_names() |>
  mutate(meal_plan = factor(meal_plan))
```

## `janitor`处理空格

```{r}
# Keep the cleaned table: standardised names, meal_plan as a factor, and a
# numeric age (the spelled-out "five" is replaced by "5" before parsing).
students <- students |>
  janitor::clean_names() |>
  mutate(
    meal_plan = factor(meal_plan),
    age = parse_number(if_else(age == "five", "5", age))
  )
students
```

## 直接录入

```{r}
#| message: false
# Parse CSV text given inline; skip = 2 drops the two metadata lines.
# The literal is kept flush-left so its content is exactly the four lines.
read_csv(
  "The first line of metadata
The second line of metadata
x,y,z
1,2,3",
  skip = 2
)
```

## 直接录入

```{r}
#| message: false
# Parse CSV text given inline; comment = "#" drops everything after a "#".
read_csv(
  "# A comment I want to skip
x,y,z
1,2,3",
  comment = "#"
)
```

## 指定列名

```{r}
#| message: false
# The data has no header row, so the column names are supplied explicitly.
read_csv(
  "1,2,3
4,5,6",
  col_names = c("x", "y", "z")
)
```

## 指定列的类型

```{r}
# Inline CSV text reused by both calls below (it starts with a newline).
another_csv <- "
x,y,z
1,2,3"

# Read every column as character.
read_csv(
  another_csv,
  col_types = cols(.default = col_character())
)
# Read only column x, as character.
read_csv(
  another_csv,
  col_types = cols_only(x = col_character())
)
```

## 练习

```{r}
#| eval: false
# Exercise (not evaluated): work out what each call does with its input.
read_csv("a,b\n1,2,3\n4,5,6")
read_csv("a,b,c\n1,2\n1,2,3,4")
read_csv("a,b\n\"1")
read_csv("a,b\n1,2\na,b")
read_csv("a;b\n1;3")
```

## 练习

```{r}
#| eval: false
# Exercise (not evaluated): the columns are named `1` and `2`, so they must
# be wrapped in backticks wherever they are referenced.
annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)
```

## 批量读取

```{r}
#| message: false
# Read several files in one call; id = "file" adds a column holding the
# path each row was read from.
sales_files <- c(
  "../../data/01-sales.csv",
  "../../data/02-sales.csv",
  "../../data/03-sales.csv"
)
read_csv(sales_files, id = "file")
```

## 读取Excel,建议用`readxl`包

```{r}
# Read the workbook and print the result (outer parentheses print it).
(surveydf <- readxl::read_xlsx("../../data/survey.xlsx"))
```

## 读取Excel

```{r}
# Read the second worksheet of the workbook and print the result.
(airqualitydf <- readxl::read_xlsx("../../data/airquality.xlsx", sheet = 2))
```

## 批量读取

```{r}
# Discover the sales files instead of typing the paths by hand: match names
# ending in "sales.csv" and return them with the directory prefix.
sales_files <- list.files(
  "../../data",
  pattern = "sales\\.csv$",
  full.names = TRUE
)
sales_files
```

## 写入csv

```{r}
#| warning: false
#| message: false
# Print the table, write it to CSV, and read it back for comparison.
students
write_csv(students, "students-2.csv")
read_csv("students-2.csv")
```

## 写入Excel

```{r}
# Write the students table to an Excel workbook.
writexl::write_xlsx(students, "../../data/writexldemo.xlsx")
```

## 读取数据库,以MySQL为例

```{r}
# Demonstration only: the if (FALSE) guard keeps this from running when the
# slides are rendered.
if (FALSE) {
  conn <- cctdb::get_dbconn("nationalairquality")
  DBI::dbListTables(conn)
  # Release the connection once the table listing has been retrieved.
  DBI::dbDisconnect(conn)
}
```

## 读取数据库,以MySQL为例

```{r}
# The database query is disabled by if (FALSE); the plot below reads the
# RDS snapshot that the disabled code would have saved.
if (FALSE) {
  conn <- cctdb::get_dbconn("nationalairquality")
  metadf <- tbl(conn, "metadf") |>
    head(100) |>
    collect()
  DBI::dbDisconnect(conn)
  saveRDS(metadf, file = "../../data/metadfdemo.RDS")
}
metadf <- readRDS(file = "../../data/metadfdemo.RDS")
lang <- "cn"
metadf |>
  ggplot(aes(lon, lat)) +
  # The default point shape has no fill, so a fill mapping is invisible;
  # map Area to colour so the groups can be told apart.
  geom_point(aes(colour = Area)) +
  dwfun::theme_sci()
```

## 练习

```{r}
#| eval: false
# Exercise (not evaluated): nest the table by site and write one workbook
# per site into ../../data/metacity2/.
metadf <- readxl::read_xlsx("../../data/airquality.xlsx")
dir.create("../../data/metacity2/")
metadf |>
  nest(sitedf = -site) |>
  mutate(
    flag = purrr::map2(
      site,
      sitedf,
      \(site_name, site_rows) {
        writexl::write_xlsx(
          site_rows,
          paste0("../../data/metacity2/", site_name, ".xlsx")
        )
      }
    )
  )
```

## 练习

```{r}
#| include: false
#| eval: false
# Hidden and never evaluated: the code that produces the exercise files.
if (FALSE) {
  library(tidyverse)
  conn <- cctdb::get_dbconn("nationalairquality")
  metadf <- tbl(conn, "metadf") |>
    collect()
  DBI::dbDisconnect(conn)

  # One nested tibble per Area; naming the list elements passes the Area
  # values to write_xlsx() as the names of the list it writes.
  metanestdf <- metadf |>
    nest(citydf = -Area)
  names(metanestdf$citydf) <- metanestdf$Area
  writexl::write_xlsx(metanestdf$citydf, path = "../../data/meta_city.xlsx")

  # Also write each Area to its own workbook under ../../data/metacity/.
  dir.create("../../data/metacity/")
  metanestdf |>
    mutate(
      flag = purrr::map2(
        Area,
        citydf,
        \(area_name, area_rows) {
          writexl::write_xlsx(
            area_rows,
            path = paste0("../../data/metacity/", area_name, ".xlsx")
          )
        }
      )
    )
}
```

1. 从“../../data/sales.xlsx”读取第9到13行的数据
2. 从“../../data/meta_city.xlsx”读取所有的数据,并保存至“../../data/meta_city_onetable1.xlsx”
3. 从“../../data/metacity/”读取所有的数据,并保存至“../../data/meta_city_onetable2.xlsx”

## 欢迎讨论! {.center}

```{r}
#| results: 'asis'
#| echo: false
# Emit the closing-slide content; results: 'asis' inserts the output into
# the document unescaped.
rmdify::slideend(
  wechat = FALSE,
  type = "public",
  tel = FALSE,
  thislink = "../"
)
```