R数据科学（R for Data Science）

Part 1：探索

by: PJX for 查漏补缺

exercise: https://jrnold.github.io/r4ds-exercise-solutions

------------前言-------------------------------

library(tidyverse)

#核心包：ggplot2/tibble/readr/purrr/dplyr/tidyr/forcats/stringr

#更新

tidyverse_update()

#代码错误信息用英文

Sys.setenv(LANGUAGE="en")

#本书生成环境

library(devtools)

devtools::session_info(c("tidyverse")) #session_info查看R解释器版本及运行平台等信息

session_info()

.libPaths() #包路径

dim(available.packages()) #可安装的包及其属性

-----------第1章使用ggplot2进行数据可视化-------------------

#1.第一步

mpg

ggplot(mpg)+geom_point(aes(displ,hwy))

#练习题

ggplot(mpg)

dim(mpg)

?mpg

ggplot(mpg,aes(hwy,cyl))+geom_point()

ggplot(mpg,aes(class,drv))+geom_point()

#2.图形属性映射

ggplot(mpg)+geom_point(aes(displ,hwy,color=class))

ggplot(mpg)+geom_point(aes(displ,hwy,size=class))

ggplot(mpg)+geom_point(aes(displ,hwy,alpha=class))

ggplot(mpg)+geom_point(aes(displ,hwy,shape=class)) #ggplot2只能同时使用6种形状

ggplot(mpg)+geom_point(aes(displ,hwy,stroke=cty))

ggplot(mpg)+geom_point(aes(displ,hwy,color=displ<5)) #条件映射

#在aes外部手动设置几何对象的图形属性

ggplot(mpg)+geom_point(aes(displ,hwy),color="blue")

#3.分面(离散变量)

ggplot(mpg)+geom_point(aes(displ,hwy))+facet_wrap(~class,nrow = 2) #单个变量

ggplot(mpg)+geom_point(aes(displ,hwy))+facet_grid(drv~cyl) #两个变量

ggplot(mpg)+geom_point(aes(displ,hwy))+facet_grid(.~cyl) #只按cyl列分面

ggplot(mpg)+geom_point(aes(displ,hwy))+facet_grid(drv~.) #只按drv行分面

#4.几何对象

ggplot(mpg)+geom_point(aes(displ,hwy))

ggplot(mpg)+geom_smooth(aes(displ,hwy))

ggplot(mpg)+geom_smooth(aes(displ,hwy,linetype=drv))

ggplot(mpg)+geom_smooth(aes(displ,hwy,group=drv)) #group将图像属性设为一个分类变量

ggplot(mpg)+geom_smooth(aes(displ,hwy,color=drv),show.legend = F)

ggplot(mpg)+

  geom_point(aes(displ,hwy))+

  geom_smooth(aes(displ,hwy))

#ggplot中全局映射，减少代码重复

ggplot(mpg,aes(displ,hwy))+

  geom_point()+

  geom_smooth()

ggplot(mpg,aes(displ,hwy))+

  geom_point(aes(color=class))+ #局部映射只对该图层有效

  geom_smooth()

#不同图层可用不同的数据

ggplot(mpg,aes(displ,hwy))+

  geom_point(aes(color=class))+

  geom_smooth(data=filter(mpg,class=="subcompact"),se=F) #se标准误standard error

#5.统计变换

ggplot(diamonds)+geom_bar(aes(cut)) #产生count

#绘图时用来计算新数据的算法称为stat（即统计变换），共20多个

?geom_bar #默认数量统计变换stat = "count"

?geom_histogram

#通常几何对象函数和统计变换可交换使用：每个几何对象都有一个默认的统计变换，每个统计变换函数都有一个默认的几何对象

ggplot(diamonds)+stat_count(aes(cut))

#修改默认的统计变换

demo <- tribble(

  ~a,~b,

  "bar_1",20,

  "bar_2",30,

  "bar_3",40

)

ggplot(demo)+geom_bar(aes(a,b),stat = "identity")

#修改默认统计变换：按比例显示

ggplot(diamonds)+geom_bar(aes(x=cut,y=..prop..,group=1)) #group将所有cut视为一个分类变量

ggplot(diamonds)+geom_bar(aes(x=cut,y=..prop..)) #error

ggplot(diamonds)+geom_bar(aes(x=cut,y=..prop..,fill=color)) #error

#强调统计变换

ggplot(diamonds)+stat_summary(aes(x=cut,y=depth),

                              fun.ymin = min,

                              fun.ymax = max,

                              fun.y = median)

#6.位置调整

ggplot(diamonds)+geom_bar(aes(cut,color=cut))

ggplot(diamonds)+geom_bar(aes(cut,fill=cut))

ggplot(diamonds)+geom_bar(aes(cut,fill=clarity)) #每个矩形是cut和clarity的组合

#position的参数：identity, fill, dodge

ggplot(diamonds,aes(x=cut,fill=clarity))+

  geom_bar(alpha=1/5,position = "identity")

ggplot(diamonds,aes(x=cut,color=clarity))+

  geom_bar(fill=NA,position = "identity")

ggplot(diamonds,aes(x=cut,fill=clarity))+

  geom_bar(position = "fill")

ggplot(diamonds,aes(x=cut,fill=clarity))+

  geom_bar(position = "dodge")

ggplot(mpg,aes(displ,hwy))+geom_point()

ggplot(mpg,aes(displ,hwy))+geom_point(position = "jitter")

ggplot(mpg,aes(displ,hwy))+geom_jitter() #同上

#7.坐标系

#默认笛卡尔直角坐标系

ggplot(mpg,aes(class,hwy))+geom_boxplot()

ggplot(mpg,aes(class,hwy))+geom_boxplot()+coord_flip()

bar <- ggplot(diamonds)+geom_bar(aes(cut,fill=cut),show.legend = F,width = 1)

bar+coord_flip()

bar+coord_polar() #极坐标系

#8.图形分层语法

#组合：数据、几何对象、映射、统计变换、位置调整、坐标系、分面

  # ggplot(data = <data>)+

  #   <geom_function>(

  #     mapping=aes(<mapping>),

  #     stat=<stat>,

  #     position=<position>

  #   )+

  #   <coordinate_function>+

  #   <facet_function>

--------------------------第2章工作流：基础----------------------------

y <- seq(1,10,length.out = 5)

(y <- seq(1,10,length.out = 5)) #同时赋值和输出屏幕

#alt+ctrl+K 打开/关闭快捷键面板

-------------------------第3章使用dplyr进行数据转换---------------------

# 五个函数：

  # 筛选: filter()

  # 排列: arrange()

  # 选择: select()

  # 变形: mutate()

  # 汇总: summarise()

  # 分组: group_by()

 ---------------

library(nycflights13)

library(tidyverse)

flights

#变量类型：int/dbl双精度浮点/chr/dttm日期时间/lgl逻辑型/fctr因子/date

#dplyr函数：filter arrange select mutate summarize

#与group_by连用

#1.filter筛选行

filter(flights,month==1,day==1)

(dec25 <- filter(flights,month==12,day==25))

#比较运算符> < >= <= 1= ==

sqrt(2)^2 == 2 #计算机只能存储有限位数

near(sqrt(2)^2,2)

1/49*49 == 1

near(1/49*49,1)

#逻辑运算符& | !

filter(flights,month==11 | month==12)

filter(flights,month==11 | 12) #11|12此处为TRUE（即为1）

# x %in% y ：x是y中的一个值时的行

(nov_dec <- filter(flights,month %in% c(11,12)))

#缺失值

NA == NA

NA*0

NA^0

#filter只能筛选条件为TRUE的行，删除条件为FALSE和NA的行

df <- tibble(x=c(1,NA,3))

filter(df,x>1)

#保留NA的行

filter(df,is.na(x)|x>1)

?between

x <- rnorm(1e2)

x[between(x, -1, 1)] #筛选x的(-1,1)之间的数

filter(flights,is.na(dep_time))

#2.arrange排序

arrange(flights,year,month,day) #默认升序

arrange(flights,desc(arr_delay)) #降序

df <- tibble(x=c(5,2,NA))

arrange(df,x) #NA总是排在最后

arrange(df,desc(x))

#如何将NA排在前面

arrange(df,x) %>% tail() #??

arrange(flights, dep_time) %>% tail()

arrange(flights, desc(is.na(dep_time)), dep_time) 

#3.select选择列

select(flights,year,month,day)

select(flights,year:day)

select(flights,-(year:day))

select(flights,1:3)

variables <- c("dep_time", "dep_delay", "arr_time", "arr_delay")

select(flights, one_of(variables))

#辅助函数：

starts_with("abc")

ends_with("xyz")

contains("ijk") #默认不区分大小写，即匹配IJK

matches("(.)\\1") #正则匹配的变量

num_range("x",1:3) #匹配x1,x2,x3

rename(flights,tail_num=tailnum) #重命名变量

flights

#与everything连用，移动几个变量到前面

select(flights,time_hour,air_time,everything()) #变量名移位

select(flights, contains("TIME",ignore.case = T)) #默认

select(flights, contains("TIME",ignore.case = F))

#4.mutate添加新变量

#总是添加到最后

flights_sml <- select(flights,year:day,ends_with("delay"),distance,air_time)

flights_sml

mutate(flights_sml,gain=arr_delay-dep_delay,

       speed=distance/air_time*60,

       hours=air_time/60,

       gain_per_hour=gain/hours) #新列一旦创建可立即使用

#只保留新列

transmute(flights,gain=arr_delay-dep_delay,

          speed=distance/air_time*60,

          hours=air_time/60,

          gain_per_hour=gain/hour)

#mutate常用创建函数：

  # 算术运算符：+ - * / ^

  # 模运算符：%/%(整数除法) %%(取余)

transmute(flights,dep_time,hour=dep_time %/% 100,minute=dep_time %% 100)

  # 对数函数：log log2 log10 #更推荐log2，因更易解释（2倍变化）

  # 偏移函数：lead(领先值) lag(滞后值)

(x <- 1:10); lag(x); lead(x)

  # 累加和滚动聚合：cumsum(累加和) cumprod(累加积) commin(累加最小值) cummax(累加最大值)

x; cumsum(x); cummean(x)

  # 逻辑比较：< <= > >= !=

  # 排秩：min_rank()

(y <- c(1,2,2,NA,3,4)); min_rank(y); min_rank(desc(y))

#如何找出10个延误时间最长的航班（如何处理名词相同的情况？）

mutate(flights, dep_delay_rank = min_rank(desc(dep_delay))) %>%

  filter(dep_delay_rank <= 10) %>%

  arrange(dep_delay_rank) %>%

  select(month, day, carrier, flight, dep_delay, dep_delay_rank)

#5.summarize分组摘要

summarize(flights,delay=mean(dep_delay,na.rm = T)) #同summarise

#与group_by连用

group_by(flights,year,month,day) %>%

  summarise(delay=mean(dep_delay,na.rm = T))

#管道符应用

delays <- flights %>% group_by(dest) %>%

  summarise(count=n(), #计数，另一个计数函数sum(!is.na())

            dist=mean(distance,na.rm=T),

            delay=mean(arr_delay,na.rm = T)) %>%

  filter(count>20,dest!="HNL")

#所有聚合函数都有一个na.rm参数

group_by(flights,year,month,day) %>% summarise(delay=mean(dep_delay))

group_by(flights,year,month,day) %>% summarise(delay=mean(dep_delay,na.rm = T))

#可以先保存去缺失值后的数据集，以便后续继续使用

not_cancelled <- flights %>%

  filter(!is.na(dep_delay),!is.na(arr_delay))

not_cancelled %>% group_by(year,month,day) %>% summarise(mean=mean(dep_delay))

#求最长平均延误时间的飞机

not_cancelled %>% group_by(tailnum) %>% summarise(delay=mean(arr_delay)) %>%

  ggplot(aes(delay))+geom_freqpoly(binwidth=10)

#航班数量与平均延误时间的散点图

not_cancelled %>% group_by(tailnum) %>% summarise(delay=mean(arr_delay),n=n()) %>%

  ggplot(aes(n,delay))+geom_point(alpha=1/5)

#去掉观测数很少的分组，避免小分组中的极端变动影响

not_cancelled %>% group_by(tailnum) %>% summarise(delay=mean(arr_delay),n=n()) %>%

  filter(n>25) %>%

  ggplot(aes(n,delay))+geom_point(alpha=1/5)

#常用摘要函数：

#位置度量：mean/ median/ 聚合函数和逻辑筛选结合

not_cancelled %>% group_by(year,month,day) %>%

  summarise(delay1=mean(arr_delay), #平均延误

            delay2=mean(arr_delay[arr_delay>0])) #平均正延误

#分散程度度量：sd(标准误)/ IQR(四分位距)/ mad(绝对中位差)

#为什么某些目的地距离更多变？

not_cancelled %>% group_by(dest) %>%

  summarise(distance_sd=sd(distance)) %>%

  arrange(desc(distance_sd))

#秩的度量：min/ quantile/ max

#每天最早和最晚的航班何时出发？

not_cancelled %>% group_by(year,month,day) %>%

  summarise(first=min(dep_time),

            last=max(dep_time)) 

#定位度量：first(x)/ nth(x,2)/ last(x) 分别等于x[1]/x[2]/x[length(x)]

not_cancelled %>% group_by(year,month,day) %>%

  summarise(first=first(dep_time),

            last=last(dep_time)) 

#计数：n()/ sum(!is.na(x))/ n_distinct(x)计算唯一值的数量/ count(x)

#哪个目的地有最多的航空公司？

not_cancelled %>% group_by(dest) %>%

  summarise(carriers=n_distinct(carrier)) %>%

  arrange(desc(carriers))

not_cancelled %>% count(dest)

#逻辑值计数和比例：sum(x>10)/ mean(y==0)

#多少架航班在早上5点前出发？

not_cancelled %>% group_by(year,month,day) %>%

  summarise(n_early=sum(dep_time<500))  #sum计数

#延误超过1小时的航班比例是多少？

not_cancelled %>% group_by(year,month,day) %>%

  summarise(hour_perc=mean(arr_delay>60)) #mean计算比例

#多个变量分组,每一次用掉一个分组变量

group_by(flights,year,month,day) %>%

  summarise(flights=n()) %>%

  summarise(flights=sum(flights)) %>%

  summarise(flights=sum(flights))

#取消分组

group_by(flights,year,month,day) %>%

  ungroup() %>%

  summarise(flights=n())

-----------补充:tidyr包数据处理----------------

# tidyr包四个函数：

  # gather—宽数据转为长数据；

  # spread—长数据转为宽数据；

  # unit—多列合并为一列；

  # separate—将一列分离为多列；

-----------------------

library(tidyr)

mtcars$car <- rownames(mtcars)

mtcars <- mtcars[, c(12, 1:11)]   #将添加的一列从最后一列移到最前列

head(mtcars)

#1.gather()函数: 宽数据转为长数据

#gather(data, key, value, …, na.rm = FALSE, convert = FALSE)

#除了car列外，其余列聚合成两列(一列对应列名，一列对应值)，分别命名为attribute和value。

mtcarsNew <- mtcars %>% gather(attribute, value, -car)

# 指定连续列聚合成两列

mtcarsNew <- mtcars %>% gather(attribute, value, mpg:gear)

# 指定不连续列聚合成两列

mtcarsNew <- mtcars %>% gather(`gear`,`carb`,key = "attribute", value = "value", -car)

#2.spread函数：长数据转为宽数据 

#spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)

mtcarsSpread <- mtcarsNew %>% spread(attribute, value)

#和原来的mtcars数据相比，只是各列的相互位置稍有调整

#3.unite函数：多列合并为一列

#unite(data, col, …, sep = “_”, remove = TRUE)

  # data：为数据框

  # col：被组合的新列名称

  # …：指定哪些列需要被组合

  # sep：组合列之间的连接符，默认为下划线

  # remove：是否删除被组合的列

set.seed(1)

data <- data.frame(date=as.Date('2019-05-01') + 0:14,

                   hour=sample(1:24, 15),

                   min=sample(1:60, 15),

                   second=sample(1:60, 15),

                   event=sample(letters, 15))

head(data)

dataNew <- data %>%unite(datehour, date, hour, sep = ' ') %>%unite(datetime, datehour, min, second, sep = ':')

dataNew

#4.separate函数：一列分离为多列

# separate(data, col, into, sep = “[^[:alnum:]]+”, remove = TRUE,

#          convert = FALSE, extra = “warn”, fill = “warn”, …)

  # data：为数据框

  # col：需要被拆分的列

  # into：新建的列名，为字符串向量

  # sep：被拆分列的分隔符

  # remove：是否删除被分割的列 

data1 <- dataNew %>%separate(datetime, c('date', 'time'), sep = ' ') %>%

  separate(time, c('hour', 'min', 'second'), sep = ':')

data1

------------第4章工作流：脚本-----------------------------------------

#快键键：ctrl+enter  ctrl+shift+s

-----------第5章探索性数据分析----------------------------------------

#提出问题——>数据可视化、转换和建模——>找出答案——>精炼问题，提出新问题

diamonds

#对分布进行可视化

#分类变量

ggplot(diamonds)+geom_bar(aes(cut))

diamonds %>% count(cut)

#连续变量

ggplot(diamonds)+geom_histogram(aes(carat),binwidth = 0.5)

diamonds %>% count(cut_width(carat,0.5))

diamonds %>% filter(carat<3) %>%

  ggplot(aes(carat))+geom_histogram(binwidth = 0.1)

#在一张图上叠加多个直方图，用geom_freqploy

ggplot(diamonds,aes(carat,color=cut))+

  geom_freqpoly(binwidth=0.5) #数目折线图

ggplot(diamonds,aes(carat,color=cut))+

  geom_density() #密度曲线图

diamonds %>% filter(carat<3) %>%

  ggplot(aes(carat))+geom_histogram(binwidth = 0.01)

#图形解读

head(faithful)#火山喷发时长

ggplot(faithful,aes(eruptions))+geom_histogram(binwidth = 0.25)

#短喷发（2min）和长喷发（4-5min)是否有关呢？

#异常值

ggplot(diamonds)+geom_histogram(aes(y),binwidth = 0.5)

#异常值分箱高度太低几乎看不到，例如要更容易查看y的异常值，可用coord_cartesian部分放大

ggplot(diamonds)+geom_histogram(aes(y),binwidth = 0.5)+

  coord_cartesian(ylim = c(0,50)) #限定小范围，相当于放大

#以上查看到了0,30,60附近的3个异常值，我们在数据中将它们找出，并分析是否合理

diamonds %>% filter(y<3|y>20) %>% arrange(y)

#缺失值

diamonds2 <- diamonds %>% filter(between(y,3,20)) #去掉异常值的行

diamonds2 <- diamonds %>% mutate(y=ifelse(y<3|y>20,NA,y)) #用NA来代替异常值

ggplot(diamonds2,aes(x,y))+geom_point() #默认去除含NA的行

ggplot(diamonds2,aes(x,y))+geom_point(na.rm = T) #去掉警告

flights %>% mutate(cancelled=is.na(dep_time)) %>%  #已取消航班和未取消航班分布比较

  ggplot(aes(sched_dep_time))+geom_freqpoly(aes(color=cancelled),binwidth=2)

ggplot(diamonds,aes(price,y=..density..))+

  geom_freqpoly(aes(color=cut),binwidth=500) #密度是对计数的标准化，每个频率多边形面积都是1

ggplot(mpg,aes(class,hwy))+geom_boxplot()

ggplot(mpg,aes(reorder(class,hwy,FUN=median),hwy))+ #按hwy中位值对class排序

  geom_boxplot()

ggplot(mpg)+geom_boxplot(aes(reorder(class,hwy,FUN=median),hwy))+

  coord_flip()

#两个分类变量

ggplot(diamonds)+geom_count(aes(cut,color)) #气泡图

diamonds %>% count(color,cut)

diamonds %>% count(color,cut) %>%

  ggplot(aes(color,cut))+geom_tile(aes(fill=n)) #热图

#两个连续变量

ggplot(diamonds)+geom_point(aes(carat,price)) #不适合大数据集

ggplot(diamonds)+geom_point(aes(carat,price),alpha=1/100) #不适合大数据集

smaller <- diamonds %>% filter(carat<3)

ggplot(smaller)+geom_bin2d(aes(carat,price))

library(hexbin)

ggplot(smaller)+geom_hex(aes(carat,price))

#对一个连续变量进行分箱

ggplot(smaller)+geom_boxplot(aes(carat,price,group=cut_width(carat,0.1)))

ggplot(smaller)+geom_boxplot(aes(carat,price,group=cut_number(carat,20)))

#模式与模型

ggplot(faithful)+geom_point(aes(eruptions,waiting))

library(modelr)

mod <- lm(log(price)~log(carat),diamonds) #根据carat预测价格

diamonds2<- diamonds %>% add_residuals(mod) %>% mutate(resid=exp(resid))

head(diamonds2)

ggplot(diamonds2)+geom_point(aes(carat,resid)) #残差：预测值与实际值间的差别

#以上利用模型去除价格和克拉数之间的强关系，用残差反映钻石的价格，继续研究切割质量和价格的关系

ggplot(diamonds2)+geom_boxplot(aes(cut,resid))

----------------第6章工作流：项目----------------------------------

 # 为每个任务创建rstudio项目 R project

 # 在项目中保存数据

 # 在项目中保存脚本

 # 在项目中保存输出

 # 只使用相对路径，不使用绝对路径

巴特西

R数据科学-1

R数据科学（R for Data Science）

Part 1：探索

exercise: https://jrnold.github.io/r4ds-exercise-solutions

------------前言-------------------------------

-----------第1章使用ggplot2进行数据可视化-------------------

--------------------------第2章工作流：基础----------------------------

-------------------------第3章使用dplyr进行数据转换---------------------

-----------补充:tidyr包数据处理----------------

------------第4章工作流：脚本-----------------------------------------

-----------第5章探索性数据分析----------------------------------------

----------------第6章工作流：项目----------------------------------

最新文章

热门文章

巴特西

R数据科学-1

R数据科学（R for Data Science）

Part 1：探索

exercise: https://jrnold.github.io/r4ds-exercise-solutions

------------前言-------------------------------

-----------第1章 使用ggplot2进行数据可视化-------------------

--------------------------第2章 工作流：基础----------------------------

-------------------------第3章 使用dplyr进行数据转换---------------------

-----------补充:tidyr包数据处理----------------

------------第4章 工作流：脚本-----------------------------------------

-----------第5章 探索性数据分析----------------------------------------

----------------第6章 工作流：项目----------------------------------

最新文章

热门文章

-----------第1章使用ggplot2进行数据可视化-------------------

--------------------------第2章工作流：基础----------------------------

-------------------------第3章使用dplyr进行数据转换---------------------

------------第4章工作流：脚本-----------------------------------------

-----------第5章探索性数据分析----------------------------------------

----------------第6章工作流：项目----------------------------------