基于R对TCGA患者临床数据整理和Cox分析
一、数据整理
1 导入数据
利用data.table
包导入数据,以LUAD为例
library(data.table)
cl = fread('/Users/xena/TCGA-LUAD.GDC_phenotype.tsv.gz')
colnames(cl)
dim(cl)
tmp=as.data.frame(colnames(cl))
2 选择感兴趣的临床信息
一般常规项包括submitter_id,vital_status,tumor_stage,pathologic_M,pathologic_N,pathologic_T,age_at_initial_pathologic_diagnosis,days_to_death,days_to_last_follow_up,race,gender
先看所选参数在cl
的位置,table()
查看有几个level,有无缺失值
which(colnames(cl)=='submitter_id.samples')
which(colnames(cl)=='vital_status.demographic')
table(cl[,'age_at_initial_pathologic_diagnosis'])
colnames(cl[,c(1,80,98,42:44,6,76,86,78,61,103,104,102,36,53)])
meta=as.data.frame(cl[,c(1,80,98,42:44,6,76,86,78,61,103,104,102,36,53)])
colnames(meta)
meta[1:4,1:4]
save(cl, meta, file = 'GDC_TCGA_LUAD_clinical_df.Rdata')
load(file = 'GDC_TCGA_LUAD_clinical_df.Rdata')
3 初步整理参数
去掉lost follow up的样本
phe <- meta
table(phe[,15])
phe <- phe[!phe$lost_follow_up == 'YES',]
计算生存时间time
phe[,8][is.na(phe[,8])]=0
phe[,9][is.na(phe[,9])]=0
phe$days <- as.numeric(phe[,9])+as.numeric(phe[,8])
phe$time <- phe$days/30
划分age_group
phe$age_at_initial_pathologic_diagnosis=as.numeric(phe$age_at_initial_pathologic_diagnosis)
phe$age_group=ifelse(phe$age_at_initial_pathologic_diagnosis>median(phe$age_at_initial_pathologic_diagnosis,na.rm=T),'older','younger')
table(phe$age_at_initial_pathologic_diagnosis)
table(phe$age_group)
stage
table(phe$stage)
phe <- phe[phe$stage != 'not reported',]
table(phe$stage)
phe$stage <- gsub('[stage]','',phe$stage)
phe$stage <- gsub('[a-c]','',phe$stage)
phe$stage <- toupper(phe$stage)
T
levels(factor(phe$T))
phe <- phe[!phe$T == '',]
phe$T <- gsub('[a-b]','',phe$T)
levels(factor(phe$T))
N
table(phe$N)
levels(factor(phe$N))
phe <- phe[!phe$N == 'NX',]
phe$N=as.factor(phe$N)
M
table(phe$M)
levels(factor(phe$M))
phe <- phe[!phe$M == '',]
phe <- phe[!phe$M == 'MX',]
phe$M <- gsub('[a-b]','',phe$M)
levels(factor(phe$M))
save
save(phe, file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')
二、整理患者信息表
load(file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')
clinical_info <- clin
head(clinical_info)
library(tableone)
1 初步整理
对需要观测的临床特质值进行重新编码
clinical_info$age<-as.numeric(clinical_info$age)
clinical_info$AGE<-factor(ifelse(clinical_info$age>60,'>60','<=60'),ordered = T)
clinical_info$Gender<-factor(toupper(clinical_info$gender),ordered = T)
table(clinical_info$stage)
clinical_info$stage<-factor(clinical_info$stage,ordered = T)
clinical_info$T<-factor(clinical_info$T,ordered = T)
clinical_info$N<-factor(clinical_info$N,ordered = T)
clinical_info$M<-factor(clinical_info$M,ordered = T)
clinical_info$event<-factor(toupper(clinical_info$event),ordered = T)
去除不需要的临床信息
colnames(clinical_info)
clinical_info <- clinical_info[,-c(7:8,10:16)]
clinical_info
group_list=ifelse(as.numeric(substr(clinical_info$ID,14,15)) < 10,'Tumor','Normal')
table(group_list)
clinical_info <- clinical_info[group_list== 'Tumor',]
dput(names(clinical_info))
Vector of variables to summarize
myVars <- dput(names(clinical_info))
Vector of categorical variables that need transformation
catVars <- myVars[c(2:8)]
2 三线表类型之一 训练集和数据集
切割数据
library(caret)
set.seed(123456789)
sam<- createDataPartition(clinical_info$event, p = .5,list = FALSE)
train <- clinical_info[sam,]
test <- clinical_info[-sam,]
#查看两组一些临床参数切割比例
prop.table(table(train$stage))
prop.table(table(test$stage))
#添加分组
train$group<-'training datasets'
test$group<-'testing datasets'
clinical_info<-rbind(train,test)
clinical_info$group<-factor(clinical_info$group)
save(clinical_info,test,train, file = 'GDC_TCGA_LUAD-clin-group-test-train.Rdata')
最重要的三线表通常是以训练集和数据集来区分:group
##生成三线表
vars <-colnames(clinical_info)[c(2:8)]
library(tableone)
tb_group<-CreateTableOne(vars = myVars, strata = c("group"),
data = clinical_info,factorVars = catVars)
#tab1<-print(tb_group, nonnormal = c('age','time'),exact = c(myVars,'AGE'), smd = TRUE)
tab1<-print(tb_group, smd = TRUE)
summary(tab1)
tab_out<-print(tb_group, catDigits = 1, contDigits = 2, pDigits = 3,
quote = FALSE, missing = T, explain = TRUE, printToggle = TRUE,
test = TRUE, smd = T, noSpaces = FALSE, padColnames = FALSE,
varLabels = FALSE, format = c("fp", "f", "p", "pf")[1],
showAllLevels = T, cramVars = NULL, dropEqual = FALSE,
exact = NULL, nonnormal = NULL, minMax = FALSE)
Save to a CSV file
write.csv(tab_out, file = "TCGA-LUAD-phe_clinical_tables_group.csv")
3 需要一个全部病人信息的临床三线表
tb_all<-CreateTableOne(vars = myVars,
data = clinical_info,
factorVars = catVars)
tab2<-print(tb_all)
tab_out2<-print(tb_all, catDigits = 1, contDigits = 2, pDigits = 3,
quote = FALSE, missing = T, explain = TRUE,
printToggle = TRUE,
test = TRUE, smd = T, noSpaces = FALSE,
padColnames = FALSE,
varLabels = FALSE, format = c("fp", "f", "p", "pf")[1],
showAllLevels = T, cramVars = NULL, dropEqual = FALSE,
exact = NULL, nonnormal = NULL, minMax = FALSE)
Save to a CSV file
write.csv(tab_out2, file = "TCGA-LUAD-phe_clinical_tables_all.csv")
三、COX回归分析
1 整理分期 data as numeric
load(file = 'GDC_TCGA_LUAD_clinical_ok.Rdata')
clin <- phe
event
clin$event=ifelse(clin$event=='Alive',0,1)
stage
levels(factor(clin$stage))
clin$stage=as.factor(clin$stage)
clin$stage=as.numeric(clin$stage)
levels(factor(clin$stage))
age
levels(factor(clin$age_group))
clin$age_group=as.factor(clin$age_group)
clin$age_group=as.numeric(clin$age_group)
gender
levels(factor(clin$gender))
clin$gender=as.factor(clin$gender)
clin$gender=as.numeric(clin$gender)
T
levels(factor(clin$T))
clin$T=as.factor(clin$T)
clin$T=as.numeric(clin$T)
levels(factor(clin$T))
N
table(clin$N)
levels(factor(clin$N))
clin$N=as.factor(clin$N)
clin$N=as.numeric(clin$N)
levels(factor(clin$N))
M
table(clin$M)
levels(factor(clin$M))
clin$M=as.factor(clin$M)
clin$M=as.numeric(clin$M)
levels(factor(clin$M))
save
save(clin, file = 'GDC_TCGA_LUAD_clinical_cox.Rdata')
2 cox
载入数据
library(plyr)
library(survival)
library(survminer)
load(file = 'GDC_TCGA_LUAD_clinical_cox.Rdata')
load(file = 'GDC_TCGA_LUAD-clin-group-test-train.Rdata')
选择载入不同组数据分别进行cox分析
### 1 total
group_list=ifelse(as.numeric(substr(clin$ID,14,15)) < 10,'Tumor','Normal')
table(group_list)
clin <- clin[group_list== 'Tumor',]
### 2 test
clin <- clin[clin$ID %in% test$ID, ]
clin$time <- as.numeric(clin$time)
clin$event <- as.numeric(clin$event)
### 3 training
clin <- clin[clin$ID %in% train$ID, ]
clin$time <- as.numeric(clin$time)
clin$event <- as.numeric(clin$event)
unicox
BaSurv <- Surv(time = clin$time, event = clin$event)
clin$BaSurv <- with(clin,BaSurv)
GCox <- coxph(BaSurv~gender, data = clin)
GSum <- summary(GCox)
GSum$coefficients
HR <- round(GSum$coefficients[,2],2)
PValue <- round(GSum$coefficients[,5],3)
CI <- paste0(round(GSum$conf.int[,3:4],2), collapse = '-')
Unicox <- data.frame('Characteristics'= 'gender',
'Hazard Ratio' = HR,
'CI95' = CI,
'P Value' = PValue)
UniCox <- function(x){
FML <- as.formula(paste0('BaSurv~',x))
GCox <- coxph(FML, data = clin)
GSum <- summary(GCox)
HR <- round(GSum$coefficients[,2],2)
PValue <- round(GSum$coefficients[,5],3)
CI <- paste0(round(GSum$conf.int[,3:4],2), collapse = '-')
Unicox <- data.frame('Characteristics'= x,
'Hazard Ratio' = HR,
'CI95' = CI,
'P Value' = PValue)
return(Unicox)
}
UniCox('gender')
UniCox('T')
colnames(clin)
VarNames <- colnames(clin)[c(3:6,8:9,16:17)]
UniVar <- lapply(VarNames, UniCox)
UniVar <- ldply(UniVar, data.frame)
UniVar$Characteristics[UniVar$P.Value < 0.05]
write.csv(UniVar, file = 'GDC_TCGA_LUAD-clin-UniCox.csv')
multicox
fml <- as.formula(paste0('BaSurv~',
paste0(UniVar$Characteristics[UniVar$P.Value<0.05],
collapse = '+')))
Multicox <- coxph(fml, data = clin)
Multisum <- summary(Multicox)
MultiName <- as.character(UniVar$Characteristics[UniVar$P.Value < 0.05])
MHR <- round(Multisum$coefficients[,2],2)
MPV <- round(Multisum$coefficients[,5],3)
MCIL <- round(Multisum$conf.int[,3],2)
MCIU <- round(Multisum$conf.int[,4],2)
MCI <- paste0(MCIL,'-',MCIU)
MulCox <- data.frame('Characteristics'= MultiName,
'Hazard Ratio' = MHR,
'CI95' = MCI,
'P Value' = MPV)
paste0(UniVar$Characteristics[UniVar$P.Value < 0.05], collapse = '+')
Multicox <- coxph(BaSurv~stage+N+T+gender+smoking_history+EvsA,data = clin)
整合表格
Final <- merge.data.frame(UniVar,MulCox,by='Characteristics',
all = T, sort = T)
#输出
write.csv(Final,file = 'GDC_TCGA_LUAD_clin_cox-result-all.csv')
3 KM生存曲线
library(RColorBrewer)
以stage为例,其他参数一样做
surv <- Surv(time = clin$time, event = clin$event)
surv
fit <- survfit(surv~clin$stage, data=clin)
survdiff(surv~clin$stage)
summary(fit)
fit
ggsurvplot(fit, conf.int=F, pval=TRUE)
ggsurvplot(fit,
pval = T, # 在图上添加log rank检验的p值
legend.labs=c("I","II",'III','IV'), #表头标签注释男女
legend = c(0.8, 0.85),
ylab="Cumulative survival (percentage)",xlab = "Months",
linetype = "strata", # 生存曲线的线型
surv.median.line = "hv", # 标注出中位生存时间
ggtheme = theme_bw(), #背景布局
palette = brewer.pal(7, "Set1")[c(3,2,4,1)],# 图形颜色风格
font.main = c(20, "plain"),
font.x = c(20, "plain"),
font.y = c(20, "plain"),
font.tickslab = c(18, "plain"),
font.legend = c(18, "plain"),
font.pval = c(20, "plain")
)
ggsave(filename = 'survival_stage-train.pdf',width = 8,height = 6)
致谢:生信技能树
作者:DobbyBunny
原文链接:https://www.jianshu.com/p/551a62600f46