R
#移动下载文件夹中的文件到一个文件夹
#author tanzicai
#time 2021.6.8 R4.0.1
#program移动文件到同一文件夹
#输入:所下载的文件夹的上级文件夹
#输出:mergeAllFileTofolder文件夹
munzip <-function(path){
if(path == ''){
print("请输入文件夹路径")
}
setwd(path)
dir.create("../中间文件输出")
dir.create("../中间文件输出/源文件")
filePath = dir(path = path ,full.names = T)
filePath = filePath[1:length(filePath)-1]
for(wd in filePath){
files = dir(path = wd,pattern = "txt$")
fromFilePath = paste0(wd,"/",files)
toFilePath = paste0("../中间文件输出/源文件/",files)
file.copy(fromFilePath,toFilePath)
}
unlink("../中间文件输出/源文件/annotations.txt")
TRUE
}
#合并下载的所有文件
#author tanzicai
#time 2021.6.8 R4.0.1
#program读取每一个文件到一个表格
#输入:文件所在的文件夹目录
#输出:合并后的表格
combin <- function(path){
if(path == "null"){
print("请输入目标文件地址")
}
setwd(path)
countFilePath = dir(path = "../中间文件输出/源文件",pattern = "*.txt")
counts_merge = NULL
for (i in countFilePath) {
x=read.delim(paste0("../中间文件输出/源文件/",i),col.names = c("miRNA_ID",substr(i,1,9),1,2))
x=x[,1:length(x)-1]
x=x[,1:length(x)-1]
if(is.null(counts_merge)){
counts_merge = x
}
else{
counts_merge =merge(counts_merge,x,by = "miRNA_ID")
}
}
write.csv(counts_merge,file = "../中间文件输出/源文件合并.csv")
rownames(counts_merge)=counts_merge$miRNA_ID
conuts_reduce = counts_merge[,-1]
TRUE
}
#注释文件提取
#author tanzicai
#time 2021.6.8 R4.0.1
#program根据meta文件提取样本数据名称
#输入:注释文件
#输出:转换后的表格
convertName <- function(path){
if(any(grepl("rjson",installed.packages())) == FALSE){
install.packages("rjson")
}
library(rjson)
setwd(path)
metaJSON = fromJSON(file = paste0("../",dir(path = "../",pattern = "*.json",full.names = F)))
jsonDataInfo = data.frame(fileName = c(),TCGA_Barcode = c())
for (i in 1:length(metaJSON)) {
TCGA_barcode = metaJSON[[i]][["associated_entities"]][[1]][["entity_submitter_id"]]
fileName = metaJSON[[i]][["file_name"]]
jsonDataInfo = rbind(jsonDataInfo,data.frame(fileName = substr(fileName,1,9),TCGA_barcode = substr(TCGA_barcode,1,15)))
}
rownames(jsonDataInfo) = jsonDataInfo[,1]
write.csv(jsonDataInfo,file = "../中间文件输出/文件名与TCGA编号对照表.csv")
#获取counts矩阵
DATA = read.csv("../中间文件输出/源文件合并.csv")
DATA = DATA[-1]
fileNameToTCGA_Barcode = jsonDataInfo[-1]
jsonDataInfo = jsonDataInfo[order(jsonDataInfo$fileName),]
data = DATA
DATA = DATA[-1]
colnames(DATA) = jsonDataInfo$TCGA_barcode
DATA = cbind(data[1],DATA)
write.csv(DATA,file = "../中间文件输出/重复名列名为TCGA编号源文件.csv")
TRUE
}
#edgR差异分析
#author tanzicai
#time 2021.6.8 R4.0.1
#使用edgR包对数据进行差异分析
#输入:表达矩阵、分组信息
#输出:差异表达结果
edgRAnalyze <- function(path){
setwd(path)
if(any(grepl("BiocManager",installed.packages())) == FALSE){
install.packages("BiocManager")
}
library(BiocManager)
if(any(grepl("edgeR",installed.packages())) == FALSE){
BiocManager::install("edgeR")
BiocManager::install("limma")
}
library(edgeR)
library(limma)
#将第一列换成行名
DATA = read.csv("../中间文件输出/重复名列名为TCGA编号源文件.csv")
DATA = DATA[-1]
row.names(DATA) <- DATA[, 1]
DATA <- DATA[, -1]
#分组信息
group_list <- ifelse(as.numeric(substr(colnames(DATA),14,15))<10,"tumor","normal")
group_list <- factor(group_list,levels = c("normal","tumor"))
table(group_list)
dge <- DGEList(counts=DATA,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~0+group_list)
rownames(design)<-colnames(dge)
colnames(design)<-levels(group_list)
dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1))
DEG2=topTags(fit2, n=nrow(DATA))
DEG2=as.data.frame(DEG2)
logFC_cutoff2 <- with(DEG2,mean(abs(logFC)) + 2*sd(abs(logFC)))
DEG2$change = as.factor(ifelse(DEG2$PValue < 0.05 & abs(DEG2$logFC) > logFC_cutoff2,ifelse(DEG2$logFC > logFC_cutoff2 ,'UP','DOWN'),'NOT'))
print("本次差异基因分析结果为")
print(table(DEG2$change))
edgeR_DEG <- DEG2
dir.create("../最终分析文件输出")
write.csv(edgeR_DEG,"../最终分析文件输出/差异分析结果.csv")
TRUE
}
#edgR2差异分析(未启用)
#author tanzicai
#time 2021.6.8 R4.0.1
#使用edgR包对数据进行差异分析
#输入:表达矩阵、分组信息
#输出:差异表达结果
edgRAnalyze2 <- function(){
etwd(path)
if(any(grepl("BiocManager",installed.packages())) == FALSE){
install.packages("BiocManager")
}
library(BiocManager)
if(any(grepl("edgeR",installed.packages())) == FALSE){
BiocManager::install("edgeR")
BiocManager::install("limma")
}
library(edgeR)
library(limma)
#将第一列换成行名
DATA = read.csv("../中间文件输出/重复名列名为TCGA编号源文件.csv")
DATA = DATA[-1]
row.names(DATA) <- DATA[, 1]
DATA <- DATA[, -1]
#分组信息
group_list <- ifelse(as.numeric(substr(colnames(DATA),14,15))<10,"tumor","normal")
group_list <- factor(group_list,levels = c("normal","tumor"))
table(group_list)
#构建 DGEList 对象
dge <- DGEList(counts=DATA,group=group_list)
#过滤 low count 数据,例如 CPM 标准化
keep <- rowSums(cpm(dgelist) > 1 ) >= 2
dgelist <- dgelist[keep, , keep.lib.sizes = FALSE]
#标准化,以 TMM 标准化为例
dgelist_norm <- calcNormFactors(dgelist, method = 'TMM')
#首先根据分组信息构建试验设计矩阵,分组信息中一定要是对照组在前,处理组在后
design <- model.matrix(~group)
#估算基因表达值的离散度
dge <- estimateDisp(dgelist_norm, design, robust = TRUE)
print("请选择模型拟合算法")
print("1:负二项广义对数线性模型")
print("2:拟似然负二项广义对数线性模型")
choice = readline()
# 负二项广义对数线性模型
if(choice == 1){
fit <- glmFit(dge, design, robust = TRUE)
lrt <- topTags(glmLRT(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmLRT.txt', sep = '\t', col.names = NA, quote = FALSE)
}else if(choice == 2){
fit <- glmQLFit(dge, design, robust = TRUE)
lrt <- topTags(glmQLFTest(fit), n = nrow(dgelist$counts))
write.table(lrt, 'control_treat.glmQLFit.txt', sep = '\t', col.names = NA, quote = FALSE)
}
gene_diff <- read.delim('control_treat.glmLRT.txt', row.names = 1, sep = '\t', check.names = FALSE)
#首先对表格排个序,按 FDR 值升序排序,相同 FDR 值下继续按 log2FC 降序排序
gene_diff <- gene_diff[order(gene_diff$FDR, gene_diff$logFC, decreasing = c(FALSE, TRUE)), ]
#log2FC≥1 & FDR<0.01 标识 up,代表显著上调的基因
#log2FC≤-1 & FDR<0.01 标识 down,代表显著下调的基因
#其余标识 none,代表非差异的基因
gene_diff[which(gene_diff$logFC >= 1 & gene_diff$FDR < 0.01),'sig'] <- 'up'
gene_diff[which(gene_diff$logFC <= -1 & gene_diff$FDR < 0.01),'sig'] <- 'down'
gene_diff[which(abs(gene_diff$logFC) <= 1 | gene_diff$FDR >= 0.01),'sig'] <- 'none'
#输出选择的差异基因总表
gene_diff_select <- subset(gene_diff, sig %in% c('up', 'down'))
write.table(gene_diff_select, file = 'control_treat.glmQLFit.select.txt', sep = '\t', col.names = NA, quote = FALSE)
#根据 up 和 down 分开输出
gene_diff_up <- subset(gene_diff, sig == 'up')
gene_diff_down <- subset(gene_diff, sig == 'down')
write.table(gene_diff_up, file = 'control_treat.glmQLFit.up.txt', sep = '\t', col.names = NA, quote = FALSE)
write.table(gene_diff_down, file = 'control_treat.glmQLFit.down.txt', sep = '\t', col.names = NA, quote = FALSE)
}
mainAnalysis <- function(path){
print("#########step1:移动文件中###############")
if(munzip(path) == TRUE )print("文件移动完成")
print("#########step2:合并文件中###############")
if(combin(path) == TRUE )print("文件合并完成")
print("#########step3:文件列名转换中###############")
if(convertName(path) == TRUE )print("文件列名转换完成")
print("#########step4:使用egdR包进行基差异分析中###############")
if(edgRAnalyze(path) == TRUE )print("本次差异基因分析结束,请查看结果")
}
#e火山图
#author tanzicai
#time 2021.6.8 R4.0.1
#使用edgR包对数据进行差异分析
#输入:表达矩阵、分组信息
#输出:差异表达结果
paintVolcanoPlot <- function(path){
if(any(grepl("ggplot2",installed.packages())) == FALSE){
BiocManager::install("ggplot2")
}
library(ggplot2)
data = read.csv("../最终分析文件输出/差异分析结果.csv")
logFC <-data$logFC
padj <- data$FDR
data <- data.frame(logFC=logFC,padj=padj)
data$sig[(data$padj > 0.05|data$padj=="NA")|(data$logFC < 0.5)& data$logFC > -0.5] <- "NO"
data$sig[data$padj <= 0.05 & data$logFC >= 0.5] <- "up"
data$sig[data$padj <= 0.05 & data$logFC <= -0.5] <- "DOWN"
# 选最大值作为xlim的上下边界
x_lim <- max(logFC,-logFC)
# 绘制火山图
p<-ggplot(temp,aes(x=temp$log_foldchange,y=-log10(temp$p_value)))+xlab("log2 Fold Change")+ylab("-log10P-Value")+
geom_point(size=4,alpha=0.6)
}