09-30 08:12 阅读 60

数据挖掘20210112学习笔记

tidyr 核心函数

一、数据清理

##原始数据
> test <- data.frame(geneid = paste0("gene",1:4),
+                  sample1 = c(1,4,7,10),
+                  sample2 = c(2,5,0.8,11),
+                  sample3 = c(0.3,6,9,12))

扁变长(gather-spread)

> test_gather <- gather(data = test,
+                     key = sample_nm,
+                     value = exp,
+                     - geneid)

长变扁

> test_re <- spread(data = test_gather,
+                 key = sample_nm,
+                 value = exp)

二、分割和合并(separate-unite)

#原始数据
> test <- data.frame(x = c( "a,b", "a,d", "b,c"))

分割

> test_seprate <- separate(test,x, c("X", "Y"),sep = ",")

合并

> test_re <- unite(test_seprate,"x",X,Y,sep = ",")

三、处理NA

### 原始数据
> X<-data.frame(X1 = LETTERS[1:5],X2 = 1:5)
> X[2,2] <- NA
> X[4,1] <- NA
> X
    X1 X2
1    A  1
2    B NA
3    C  3
4 <NA>  4
5    E  5

去掉含有NA的行,可以选择只根据某一列来去除

> drop_na(X)    #把所有带有NA的行都删掉
  X1 X2
1  A  1
2  C  3
3  E  5
> drop_na(X,X1)   #只对X1这一列分析，X1中有NA的这一行去掉，注意不是赋值，原本X没改变
  X1 X2
1  A  1
2  B NA
3  C  3
4  E  5

替换NA

> replace_na(X$X2,0)   #将X2这一列的NA值改为0
[1] 1 0 3 4 5

用上一行的值填充NA

> X
    X1 X2
1    A  1
2    B NA
3    C  3
4 <NA>  4
5    E  5
> fill(X,X2)   #X2的这一列的空值按上一行填充
    X1 X2
1    A  1
2    B  1
3    C  3
4 <NA>  4
5    E  5

完整版见小抄 https://rstudio.com/resources/cheatsheets/

dplyr核心函数

数据准备

> library(dplyr)
> test <- iris[c(1:2,51:52,101:102),]
> rownames(test) =NULL

五个基础函数

1.mutate(),新增列

2.select(),按列筛选

3.filter()筛选行

4.arrange(),按某一列对整个表格进行排序

5.summarise()：汇总

###1.mutate(),新增列
> mutate(test, new = Sepal.Length * Sepal.Width)

###2.select(),按列筛选
####(1)按列号筛选
> select(test,1)    #筛选第一列
> select(test,c(1,5))   #筛选第一列和第五列
 
####(2)按列名筛选
> select(test,Sepal.Length)
> select(test, Petal.Length, Petal.Width)
> vars <- c("Petal.Length", "Petal.Width")
> select(test, one_of(vars))

> select(test, starts_with("Petal"))
> select(test, ends_with("Width"))
> select(test, contains("etal"))
> select(test, matches(".t."))
> select(test, everything())
> select(test, last_col())
> select(test, last_col(offset = 1))

####(3)利用everything()，列名可以重排序
> select(test,Species,everything())

###3.filter()筛选行
> filter(test, Species == "setosa")
> filter(test, Species == "setosa"&Sepal.Length > 5 )
> filter(test, Species %in% c("setosa","versicolor"))

###4.arrange(),按某一列对整个表格进行排序
> arrange(test, Sepal.Length)#默认从小到大排序
> arrange(test, desc(Sepal.Length))#用desc从大到小

###5.summarise()：汇总
> summarise(test, mean(Sepal.Length), sd(Sepal.Length))   #计算Sepal.Length的平均值和标准差
  mean(Sepal.Length) sd(Sepal.Length)
1           5.916667        0.8084965

> # 先按照Species分组，计算每组Sepal.Length的平均值和标准差
> group_by(test, Species)
> tmp = summarise(group_by(test, Species),mean(Sepal.Length), sd(Sepal.Length))
> tmp
# A tibble: 3 x 3
  Species    `mean(Sepal.Length)` `sd(Sepal.Length)`
* <fct>                     <dbl>              <dbl>
1 setosa                     5                 0.141
2 versicolor                 6.7               0.424
3 virginica                  6.05              0.354

补充arrange()函数使用方法

> library(dplyr)
> test = iris[c(1,2,51,52,101,102),]    #选取iris的第1,2,51,52,101,102行
> rownames(test) = NULL   #重置test的行名（恢复为默认的1,2,3...）
> arrange(test,Sepal.Length)   #根据Sepal.Length从小到大排序
> arrange(test,desc(Sepal.Length))   #根据Sepal.Length从大到小排序
> arrange(test,Sepal.Length,Sepal.Width)   #按照两列排序：先按Sepal.Length排序，Sepal.Length相同时再按Sepal.Width排序
> o = order(test$Sepal.Length)    #返回值是位置下标
> test$Sepal.Length[o]
[1] 4.9 5.1 5.8 6.3 6.4 7.0
> #x[order(x)]等同于sort(x)，但是order不仅可以对某一列（向量）排序，还可以像下面这样对整个数据框按行排序
> test[o,]

两个实用技能

1：管道操作 %>% (cmd/ctrl + shift + M)

> library(dplyr)
> x1 = filter(iris,Sepal.Width>3)
> x2 = select(x1,c("Sepal.Length","Sepal.Width" ))
> x3 = arrange(x2,Sepal.Length)

> iris %>% 
+   filter(Sepal.Width>3) %>% 
+   select(c("Sepal.Length","Sepal.Width" ))%>%
+   arrange(Sepal.Length)

2：count统计某列的unique值

> count(test,Species)
     Species n
1     setosa 2
2 versicolor 2
3  virginica 2

处理关系数据:即将2个表进行连接，注意：不要引入factor

原始数据

> options(stringsAsFactors = F)
> test1 <- data.frame(name = c('jimmy','nicker','doodle'), 
+                     blood_type = c("A","B","O"))
> test1
    name blood_type
1  jimmy          A
2 nicker          B
3 doodle          O
> test2 <- data.frame(name = c('doodle','jimmy','nicker','tony'),
+                     group = c("group1","group1","group2","group2"),
+                     vision = c(4.2,4.3,4.9,4.5))
> test2 
    name  group vision
1 doodle group1    4.2
2  jimmy group1    4.3
3 nicker group2    4.9
4   tony group2    4.5
> test3 <- data.frame(NAME = c('doodle','jimmy','lucy','nicker'),
+                     weight = c(140,145,110,138))
> test3
    NAME weight
1 doodle    140
2  jimmy    145
3   lucy    110
4 nicker    138

> merge(test1,test2,by="name")
    name blood_type  group vision
1 doodle          O group1    4.2
2  jimmy          A group1    4.3
3 nicker          B group2    4.9
> merge(test1,test3,by.x = "name",by.y = "NAME")
    name blood_type weight
1 doodle          O    140
2  jimmy          A    145
3 nicker          B    138

1.內连inner_join,取交集

> inner_join(test1, test2, by = "name")
    name blood_type  group vision
1  jimmy          A group1    4.3
2 nicker          B group2    4.9
3 doodle          O group1    4.2
> inner_join(test1,test3,by = c("name"="NAME"))
    name blood_type weight
1  jimmy          A    145
2 nicker          B    138
3 doodle          O    140

2.左连left_join

> left_join(test1, test2, by = 'name')
    name blood_type  group vision
1  jimmy          A group1    4.3
2 nicker          B group2    4.9
3 doodle          O group1    4.2
> left_join(test2, test1, by = 'name')
    name  group vision blood_type
1 doodle group1    4.2          O
2  jimmy group1    4.3          A
3 nicker group2    4.9          B
4   tony group2    4.5       <NA>

3.全连full_join

> full_join(test1, test2, by = 'name')
    name blood_type  group vision
1  jimmy          A group1    4.3
2 nicker          B group2    4.9
3 doodle          O group1    4.2
4   tony       <NA> group2    4.5

4.半连接：返回能够与y表匹配的x表所有记录semi_join

> semi_join(x = test1, y = test2, by = 'name')
    name blood_type
1  jimmy          A
2 nicker          B
3 doodle          O

5.反连接：返回无法与y表匹配的x表的所有记录anti_join

> anti_join(x = test2, y = test1, by = 'name')
  name  group vision
1 tony group2    4.5

6.数据的简单合并

bind_rows()和bind_cols()分别相当于base包里的rbind()函数和cbind()函数；注意，bind_rows()按列名匹配各列（某个表缺少的列会用NA填充，列名完全一致时效果与rbind()相同），而bind_cols()函数则需要两个数据框有相同的行数

> test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
> test1
  x  y
1 1 10
2 2 20
3 3 30
4 4 40
> test2 <- data.frame(x = c(5,6), y = c(50,60))
> test2
  x  y
1 5 50
2 6 60
> test3 <- data.frame(z = c(100,200,300,400))
> test3
    z
1 100
2 200
3 300
4 400
> bind_rows(test1, test2)
  x  y
1 1 10
2 2 20
3 3 30
4 4 40
5 5 50
6 6 60
> bind_cols(test1, test3)
  x  y   z
1 1 10 100
2 2 20 200
3 3 30 300
4 4 40 400

stringr函数

> library(stringr)
> x <- "The birch canoe slid on the smooth planks."
> x
[1] "The birch canoe slid on the smooth planks."

1.检测字符串长度

> length(x)
[1] 1
> str_length(x)    #含有多少个字符
[1] 42

2.字符串拆分与组合

> str_split(x," ")   #按空格将字符串拆分
[[1]]
[1] "The"     "birch"   "canoe"   "slid"    "on"     
[6] "the"     "smooth"  "planks."
> x2 = str_split(x," ")[[1]]
> str_c(x2,collapse = " ") #按空格组合
[1] "The birch canoe slid on the smooth planks."
> str_c(x2,1234,sep = "+")
[1] "The+1234"     "birch+1234"   "canoe+1234"  
[4] "slid+1234"    "on+1234"      "the+1234"    
[7] "smooth+1234"  "planks.+1234"

3.提取字符串的一部分

> str_sub(x,5,9)   #从第5位到第9位
[1] "birch"

4.大小写转换

> str_to_upper(x2)    #将字符串改为大写
[1] "THE"     "BIRCH"   "CANOE"   "SLID"    "ON"     
[6] "THE"     "SMOOTH"  "PLANKS."
> str_to_lower(x2)     #将字符串改为小写
[1] "the"     "birch"   "canoe"   "slid"    "on"     
[6] "the"     "smooth"  "planks."
> str_to_title(x2)     #将首字母改为大写
[1] "The"     "Birch"   "Canoe"   "Slid"    "On"     
[6] "The"     "Smooth"  "Planks."

5.字符串排序

> str_sort(x2)
[1] "birch"   "canoe"   "on"      "planks." "slid"   
[6] "smooth"  "the"     "The"

6.字符检测 --返回值为逻辑值

> str_detect(x2,"h")    #字符串含有"h"
[1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE
> str_starts(x2,"T")    #字符串以"T"开头
[1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> str_ends(x2,"e")     #字符串以e结尾的
[1]  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

###与sum和mean连用，可以统计匹配的个数和比例
> sum(str_detect(x2,"h"))
[1] 4
> mean(str_detect(x2,"h"))
[1] 0.5

7.提取匹配到的字符串

> str_subset(x2,"h")
[1] "The"    "birch"  "the"    "smooth"

8.字符计数

> str_count(x," ")    #统计x中空格的个数
[1] 7
> str_count(x2,"o")
[1] 0 0 1 0 1 0 2 0

9.字符串替换

> str_replace(x2,"o","A")   #只替换每个字符串中第一个o
[1] "The"     "birch"   "canAe"   "slid"    "An"     
[6] "the"     "smAoth"  "planks."
> str_replace_all(x2,"o","A")   #将所有的o替换为A
[1] "The"     "birch"   "canAe"   "slid"    "An"     
[6] "the"     "smAAth"  "planks."

练习

#Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community.
#1.将上面这句话作为一个长字符串，赋值给tmp
#2.拆分为一个由单词组成的向量，赋值给tmp2(注意标点符号)
#3.用函数返回这句话中有多少个单词。
#4.用函数返回这句话中每个单词由多少个字母组成。
#5.统计tmp2有多少个单词中含有字母"e"

> tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."

> tmp2 <- tmp %>%
+   str_replace(","," ")%>%    #将逗号变为空格
+   str_remove("[.]")%>%      #去掉句号；[.]表示匹配"."本身，而不是正则表达式中的"任意字符"
+   str_split(" ")
> tmp2 <- tmp2[[1]]

> length(tmp2)
[1] 16

> str_length(tmp2)
 [1] 14  2  1  3  7  2  7  4 10  8  3 13  2  3  8  9

> table(str_detect(tmp2,"e"))

FALSE  TRUE 
    9     7 
> sum(str_detect(tmp2,"e"))
[1] 7
#str_count(tmp2,"e")   #是指每个字符串里含有多少个e

条件语句和循环语句

一.条件语句

1.if(){ }

(1)只有if没有else，那么条件是FALSE时就什么都不做

> i = -1
> if (i<0) print('up')
[1] "up"
> if (i>0) print('up')    #条件是FALSE

(2)有else

> i =1
> if (i>0){
+   cat('+')
+ } else {
+   print("-")
+ }
+      #返回值为+

ifelse函数

ifelse有三个参数
ifelse(x,yes,no)
x:是逻辑值
yes：逻辑值为TRUE时的返回值
no：逻辑值为FALSE时的返回值

> i=c(0.11548,-5.123,2.654)
> ifelse(i>0,"+","-")
[1] "+" "-" "+"


> x=rnorm(10)
> x
 [1]  0.6425792 -0.6829069  0.1632753 -0.2406404
 [5] -0.3182894 -0.7686996 -0.1892211 -0.1442053
 [9]  1.0053013 -1.4639149
> y=ifelse(x>0,"+","-")
> y
 [1] "+" "-" "+" "-" "-" "-" "-" "-" "+" "-"

(3)多个条件

> i = 0
> if (i>0){
+   print('+')
+ } else if (i==0) {
+   print('0')
+ } else if (i< 0){
+   print('-')
+ }
[1] "0"


> ifelse(i>0,"+",ifelse((i<0),"-","0"))
[1] "0"

2.switch()

> cd = 3
> foo <- switch(EXPR = cd, 
+               #EXPR = "aa", 
+               aa=c(3.4,1),
+               bb=matrix(1:4,2,2),
+               cc=matrix(c(T,T,F,T,F,F),3,2),
+               dd="string here",
+               ee=matrix(c("red","green","blue","yellow")))
> foo
      [,1]  [,2]
[1,]  TRUE  TRUE
[2,]  TRUE FALSE
[3,] FALSE FALSE

练习

#1.使用循环，查看"a",TRUE和3的数据类型
> a <- list("a",TRUE,3)
> for (i in 1:length(a)) {
+   print(class(a[[i]]))
+   
+ }
[1] "character"
[1] "logical"
[1] "numeric"

#2.生成10个随机数，根据这10个随机数生成一个新向量，>中位数的值对应"A",<中位数的值对应"B"。
> b <- rnorm(10)
> ifelse(b>median(b),"A","B")
 [1] "A" "B" "A" "B" "B" "A" "B" "B" "A" "A"

#3.根据上一练习题中的tmp2生成一个新向量，含有e的值对应"A",不含有e的值对应"B"
> tmp2 <-  tmp %>% 
+   str_replace(","," ") %>%
+   str_remove("[.]") %>% 
+   str_split(" ")
> tmp2
[[1]]
 [1] "Bioinformatics" "is"            
 [3] "a"              "new"           
 [5] "subject"        "of"            
 [7] "genetic"        "data"          
 [9] "collection"     "analysis"      
[11] "and"            "dissemination" 
[13] "to"             "the"           
[15] "research"       "community"     
> tmp2 <- tmp2[[1]]
> ifelse(str_detect(tmp2,"e"),"A","B")
 [1] "B" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A"
[13] "B" "A" "A" "B"

#4.加载deg.Rdata,根据a、b两列的值，按照以下条件生成向量x：
#a<1 且b<0.05,则x对应的值为down；
#a>1 且b<0.05,则x对应的值为up；
#其他情况，x对应的值为no
> load("deg.Rdata")
> k1 = deg$a<1 & deg$b<0.05
> k2 = deg$a>1 & deg$b<0.05
> x = ifelse(k1,"down",ifelse(k2,"up","no"))

# 5.统计x的重复值个数
> table(x)
x
 down    no    up 
 3828 26094   853

# 6.将x添加到deg数据框中，成为新的一列
> deg$x <- x

二、循环语句

1.for循环

> x <- c(5,6,0,3)
> s=0
> for (i in x){
+   s=s+i
+   #if(i == 0) next
+   #if (i == 0) break
+   print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1]  2.0000000  6.0000000  0.1666667 11.0000000
[1]   3   0 Inf  11
[1]  4.0000000  3.0000000  0.3333333 14.0000000


> x <- c(5,6,0,3)
> s=0
> for (i in x){
+   s=s+i
+   if(i == 0) next
+   #if (i == 0) break
+   print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1]  2.0000000  6.0000000  0.1666667 11.0000000
[1]  4.0000000  3.0000000  0.3333333 14.0000000


> x <- c(5,6,0,3)
> s=0
> for (i in x){
+   s=s+i
+   #if(i == 0) next
+   if (i == 0) break
+   print(c(which(x==i),i,1/i,s))
+ }
[1] 1.0 5.0 0.2 5.0
[1]  2.0000000  6.0000000  0.1666667 11.0000000

#如何将结果存下来?
> s = 0
> result = list()
> for(i in 1:length(x)){
+   s=s+x[[i]]
+   result[[i]] = c(i,x[[i]],1/i,s)
+ }
> do.call(cbind,result)
     [,1] [,2]       [,3]  [,4]
[1,]    1  2.0  3.0000000  4.00
[2,]    5  6.0  0.0000000  3.00
[3,]    1  0.5  0.3333333  0.25
[4,]    5 11.0 11.0000000 14.00

#练习4----
#1.使用循环，对iris的1到4列分别画点图（plot）
> par(mfrow = c(2,2))     #par(mfrow = c(2,2))将绘图区分割成2行2列，图按行依次填充；如果改用mfcol，则按列依次填充
> for(i in 1:4){
+   plot(iris[,i],col = iris[,5])
+ }



#2.生成一个随机数（rnorm）组成的10行6列的矩阵，列名为sample1，sample2….sample6，行名为gene1，gene2…gene10，分组为sample1、2、3属于A组，sample4、5、6属于B组。用循环对每个基因画ggplot2箱线图,并尝试把10张图拼到一起。
> exp = matrix(rnorm(60),nrow = 10)
> colnames(exp) <- paste0("sample",1:6)
> rownames(exp) <- paste0("gene",1:10)
> exp[1:4,1:4]
         sample1     sample2     sample3      sample4
gene1  0.3756800 -0.35824521  0.04884076  0.004333555
gene2  1.3406486  1.29023800 -0.18444678 -0.379581765
gene3 -0.2858732 -0.03525992  0.46980022  0.582935510
gene4 -1.2478246 -0.47409951 -0.72981205  1.374565803
> #dat = cbind(t(exp),group = rep(c("A","B"),each = 3))
> dat = data.frame(t(exp))
> dat = mutate(dat,group = rep(c("A","B"),each = 3))
> p = list()
> library(ggplot2)
> for(i in 1:(ncol(dat)-1)){
+   p[[i]] = ggplot(data = dat,aes_string(x = "group",y=colnames(dat)[i]))+     #批量出图时，需用到aes_string(),字符向量的循环
+     geom_boxplot(aes(color = group))+
+     geom_jitter(aes(color = group))+
+     theme_bw()
+ }
> library(patchwork)
> wrap_plots(p,nrow = 2,guides = "collect")

2.while 循环

> i = 0
> while (i < 5){
+   print(c(i,i^2))
+   i = i+1
+ }
[1] 0 0
[1] 1 1
[1] 2 4
[1] 3 9
[1]  4 16

3.repeat 语句

注意：必须有break

> i=0L
> s=0L
> repeat{
+  i = i + 1
+  s = s + i
+  print(c(i,s))
+  if(i==10) break
+ }
[1] 1 1
[1] 2 3
[1] 3 6
[1]  4 10
[1]  5 15
[1]  6 21
[1]  7 28
[1]  8 36
[1]  9 45
[1] 10 55

apply()族函数

1.apply 处理矩阵或数据框

apply(X, MARGIN, FUN, …)

其中X是数据框/矩阵名；

MARGIN为1表示取行，为2表示取列，FUN是函数

> test<- iris[,1:4]
> apply(test, 2, mean)
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
> apply(test, 1, sum)
  [1] 10.2  9.5  9.4  9.4 10.2 11.4  9.7 10.1  8.9
 [10]  9.6 10.8 10.0  9.3  8.5 11.2 12.0 11.0 10.3
 [19] 11.5 10.7 10.7 10.7  9.4 10.6 10.3  9.8 10.4
 [28] 10.4 10.2  9.7  9.7 10.7 10.9 11.3  9.7  9.6
 [37] 10.5 10.0  8.9 10.2 10.1  8.4  9.1 10.7 11.2
 [46]  9.5 10.7  9.4 10.7  9.9 16.3 15.6 16.4 13.1
 [55] 15.4 14.3 15.9 11.6 15.4 13.2 11.5 14.6 13.2
 [64] 15.1 13.4 15.6 14.6 13.6 14.4 13.1 15.7 14.2
 [73] 15.2 14.8 14.9 15.4 15.8 16.4 14.9 12.8 12.8
 [82] 12.6 13.6 15.4 14.4 15.5 16.0 14.3 14.0 13.3
 [91] 13.7 15.1 13.6 11.6 13.8 14.1 14.1 14.7 11.7
[100] 13.9 18.1 15.5 18.1 16.6 17.5 19.3 13.6 18.3
[109] 16.8 19.4 16.8 16.3 17.4 15.2 16.1 17.2 16.8
[118] 20.4 19.5 14.7 18.1 15.3 19.2 15.7 17.8 18.2
[127] 15.6 15.8 16.9 17.6 18.2 20.1 17.0 15.7 15.7
[136] 19.1 17.7 16.8 15.6 17.5 17.8 17.4 15.5 18.2
[145] 18.2 17.2 15.7 16.7 17.3 15.8
> res <- c()
> for(i in 1:nrow(test)){
+   res[[i]] <- sum(test[i,])
+ }

2.lapply(list, FUN, …)

对列表/向量中的每个元素（向量）实施相同的操作

> test <- list(x = 36:33,
+              y = 32:35,
+              z = 30:27)

返回值是列表，对列表中的每个元素（向量）求均值(试试方差var,分位数quantile)

> lapply(test,mean)
$x
[1] 34.5

$y
[1] 33.5

$z
[1] 28.5

> class(lapply(test,mean))
[1] "list"
> x <- unlist(lapply(test,mean));x
   x    y    z 
34.5 33.5 28.5 
> class(x)
[1] "numeric"

3.sapply 处理列表，简化结果，直接返回矩阵或向量

sapply(X, FUN, …) 注意和lapply的区别,返回值不一样

> lapply(test,min)
$x
[1] 33

$y
[1] 32

$z
[1] 27

> sapply(test,min)
 x  y  z 
33 32 27 
> lapply(test,range)
$x
[1] 33 36

$y
[1] 32 35

$z
[1] 27 30

> sapply(test,range)
      x  y  z
[1,] 33 32 27
[2,] 36 35 30
> class(sapply(test,range))
[1] "matrix" "array"

作者:爱吃甜品的鱼

原文链接:https://www.jianshu.com/p/94731c033410