---
title: "Data import"
subtitle: 《区域水环境污染数据分析实践》<br>Data analysis practice of regional water environment pollution
author: 苏命、王为东<br>中国科学院大学资源与环境学院<br>中国科学院生态环境研究中心
date: today
lang: zh
format:
  revealjs:
    theme: dark
    slide-number: true
    chalkboard:
      buttons: true
    preview-links: auto
    lang: zh
    toc: true
    toc-depth: 1
    toc-title: 大纲
    logo: ./_extensions/inst/img/ucaslogo.png
    css: ./_extensions/inst/css/revealjs.css
    pointer:
      key: "p"
      color: "#32cd32"
      pointerSize: 18
revealjs-plugins:
  - pointer
filters:
  - d2
---

```{r}
#| include: false

# Setup: show code by default on later slides, run the project's shared
# `_common.R` script, and attach the tidyverse. `include: false` still runs
# the chunk but keeps its code and the package startup messages off the slide.
knitr::opts_chunk$set(echo = TRUE)
source("../../coding/_common.R")
library(tidyverse)
```

## 导入csv数据

```{r}
#| echo: false
#| message: false
#| comment: ""

# Show the raw text of students.csv, one file line per output line.
read_lines("../../data/students.csv") |>
  cat(sep = "\n")
```

## 导入csv数据

```{r}
#| label: tbl-students-table
#| echo: false
#| message: false
#| tbl-cap: Data from the students.csv file as a table.

# Parse students.csv and render the result as a formatted table.
read_csv("../../data/students.csv") |>
  knitr::kable()
```

## 读取数据

```{r}
#| message: true

# Read from a local path; the outer parentheses also print the result.
(students <- read_csv("../../data/students.csv"))
# read_csv() also accepts a URL; this second read replaces `students`.
(students <- read_csv("https://pos.it/r4ds-students-csv"))
```

## 读取数据

```{r}
#| message: false

# Treat the literal string "N/A" and empty fields as missing values.
(students <- read_csv("../../data/students.csv", na = c("N/A", "")))
```

## 列名不要有空格

```{r}
# Column names containing spaces are non-syntactic and need backticks;
# rename them to snake_case (new_name = old_name).
students |>
  rename(
    student_id = `Student ID`,
    full_name = `Full Name`
  )
```

## `janitor`处理空格

```{r}
#| message: false

# Convert every column name to snake_case in one step.
students |>
  janitor::clean_names()
```

## `janitor`处理空格

```{r}
# Clean the column names, then store meal_plan as a factor.
students |>
  janitor::clean_names() |>
  mutate(meal_plan = factor(meal_plan))
```

## `janitor`处理空格

```{r}
# Save the cleaned table back into `students`.
students <- students |>
  janitor::clean_names() |>
  mutate(
    meal_plan = factor(meal_plan),
    # Replace the spelled-out "five" with "5" so age parses as a number.
    age = parse_number(if_else(age == "five", "5", age))
  )
students
```

## 直接录入

```{r}
# A string that contains newlines is read as literal CSV data, not a path.
read_csv(
  "a,b,c
1,2,3
4,5,6"
)
```

## 直接录入

```{r}
#| message: false

# Skip a fixed number of leading metadata lines before the header.
read_csv(
  "The first line of metadata
The second line of metadata
x,y,z
1,2,3",
  skip = 2
)

# Ignore any text that follows the comment character.
read_csv(
  "# A comment I want to skip
x,y,z
1,2,3",
  comment = "#"
)
```

##

```{r}
#| message: false

# No header row: treat the first line as data and let readr generate
# column names (X1, X2, ...).
read_csv(
  "1,2,3
4,5,6",
  col_names = FALSE
)
```

##

```{r}
#| message: false

# No header row: supply the column names explicitly.
read_csv(
  "1,2,3
4,5,6",
  col_names = c("x", "y", "z")
)
```

##

```{r}
#| eval: false

# CSV text whose second field is wrapped in single quotes and contains a
# comma (shown only; this chunk is not evaluated).
"x,y\n1,'a,b'"
```

##

```{r}
#| eval: false

# Inline CSVs with irregular rows, an unclosed quote, mixed value types and
# `;` separators (shown only; this chunk is not evaluated).
read_csv("a,b\n1,2,3\n4,5,6")
read_csv("a,b,c\n1,2\n1,2,3,4")
read_csv("a,b\n\"1")
read_csv("a,b\n1,2\na,b")
read_csv("a;b\n1;3")
```

##

```{r}
# Tibble whose column names `1` and `2` are non-syntactic, so they must be
# quoted with backticks. `2` is twice `1` plus standard-normal noise; the
# RNG is not seeded, so values differ between renders.
annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)
```

##

```{r}
#| message: false

# Inline CSV with logical, numeric, date and string columns; read_csv()
# guesses the type of each column from its values.
read_csv("
logical,numeric,date,string
TRUE,1,2021-01-15,abc
false,4.5,2021-02-15,def
T,Inf,2021-02-16,ghi
")
```

##

```{r}
# One-column CSV text in which a lone "." stands in for a missing value.
simple_csv <- "
x
10
.
20
30"
```

##

```{r}
#| message: false

# Read with default type guessing.
read_csv(simple_csv)
```

##

```{r}
# Declare x as double so that values which are not numbers are recorded as
# parsing problems instead of changing the guessed column type.
df <- read_csv(
  simple_csv,
  col_types = list(x = col_double())
)
```

##

```{r}
# List the parsing problems recorded while reading `df`.
df |> problems()
```

##

```{r}
#| message: false

# Declare "." as the missing-value marker.
read_csv(simple_csv, na = ".")
```

##

```{r}
another_csv <- "
x,y,z
1,2,3"

# Override the default type so every column is read as character.
read_csv(
  another_csv,
  col_types = cols(.default = col_character())
)
```

##

```{r}
# Read only the listed column (x, as character) and drop the others.
read_csv(
  another_csv,
  col_types = cols_only(x = col_character())
)
```

##

```{r}
#| message: false

sales_files <- c(
  "../../data/01-sales.csv",
  "../../data/02-sales.csv",
  "../../data/03-sales.csv"
)
# Read all files into one table; `id` adds a "file" column holding the
# path each row came from.
read_csv(sales_files, id = "file")
```

##

```{r}
#| eval: false

# Same call with URLs instead of local paths (shown only; this chunk is
# not evaluated).
sales_files <- c(
  "https://pos.it/r4ds-01-sales",
  "https://pos.it/r4ds-02-sales",
  "https://pos.it/r4ds-03-sales"
)
read_csv(sales_files, id = "file")
```

##

```{r}
# Find every file in the data directory whose name ends in "sales.csv";
# `full.names = TRUE` returns paths that include the directory.
sales_files <- list.files(
  "../../data",
  pattern = "sales\\.csv$",
  full.names = TRUE
)
sales_files
```

##

```{r}
#| eval: false

# Write the data frame to a CSV file (shown only; this chunk is not
# evaluated).
write_csv(students, "students.csv")
```

##

```{r}
#| warning: false
#| message: false

# Print the table, write it to CSV, then read the CSV back so the two
# printouts can be compared.
students
write_csv(students, "students-2.csv")
read_csv("students-2.csv")
```

##

```{r}
# Write the table to an RDS file and read it back.
write_rds(students, "students.rds")
read_rds("students.rds")
```

##

```{r}
#| eval: false

# Write the table to a Parquet file and read it back (shown only; this
# chunk is not evaluated). The `#>` lines are pasted example output.
library(arrow)
write_parquet(students, "students.parquet")
read_parquet("students.parquet")
#> # A tibble: 6 × 5
#>   student_id full_name        favourite_food     meal_plan             age
#>        <dbl> <chr>            <chr>              <fct>               <dbl>
#> 1          1 Sunil Huffmann   Strawberry yoghurt Lunch only              4
#> 2          2 Barclay Lynn     French fries       Lunch only              5
#> 3          3 Jayendra Lyne    NA                 Breakfast and lunch     7
#> 4          4 Leon Rossini     Anchovies          Lunch only             NA
#> 5          5 Chidiegwu Dunkel Pizza              Breakfast and lunch     5
#> 6          6 Güvenç Attila    Ice cream          Lunch only              6
```

##

```{r}
#| include: false

# Delete the files written by the CSV and RDS round-trip chunks.
file.remove("students-2.csv")
file.remove("students.rds")
```

##

```{r}
# Build a tibble by hand, one column vector at a time.
tibble(
  x = c(1, 2, 5),
  y = c("h", "m", "g"),
  z = c(0.08, 0.83, 0.60)
)
```

##

```{r}
# Build the same table row by row: column names are given as formulas
# (~name) and values follow in row order.
tribble(
  ~x, ~y,  ~z,
  1,  "h", 0.08,
  2,  "m", 0.83,
  5,  "g", 0.60
)
```