2 Star 1 Fork 0

jmzeng / pan-cancer-expression

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
step1-get-expression-data.R 14.40 KB
一键复制 编辑 原始数据 按行查看 历史
jmzeng 提交于 2021-09-21 18:05 . first commit
rm(list=ls())
options(stringsAsFactors = F)
library(stringr)
library(data.table)
dir.create('Rdata')
###### step1: Counts文件批量转为Rdata ######
# dataset: gene expression RNAseq - HTSeq - Counts
# unitlog2(count+1)
fs=list.files('ucsc_xena',pattern = 'htseq_counts.tsv.gz')
fs
lapply(fs, function(x){
# x=fs[1]
pro=gsub('tsv.gz','',x)
print(pro)
a=fread(file.path('ucsc_xena',x),data.table = F)
head(a[ ,1:4])
tail(a[ ,1:4])
mat=a[1:60483,]
rownames(mat)=mat$Ensembl_ID
mat[1:4,1:4]
mat=mat[,-1]
exprSet=floor(2^mat - 1 )
exprSet[1:4,1:4]
n=floor(ncol(exprSet)/20)
n
keep_feature <- rowSums (exprSet > 2) > n
table(keep_feature)
mat <- exprSet[keep_feature, ]
mat=mat[, colSums(mat) > 1000000]
mat[1:4,1:4]
dim(mat)
colnames(mat)
b=read.table('ucsc_xena/gencode.v22.annotation.gene.probeMap',header = T)
head(b)
d=read.table('ucsc_xena/ensembID2type.txt')
head(d)
b=merge(b,d,by.x='id',by.y='V1')
head(b)
length(unique(b[match(rownames(mat),b$id),2]))
# 可以看到,多个ensembl的ID对应同一个基因的symbol,这个是需要处理掉的。
# 下面的代码略微有一点点复杂
dat=mat
ids <- b[match(rownames(dat),
b$id),1:2] #取需要的列
head(ids)
colnames(ids)=c('probe_id','symbol')
ids=ids[ids$symbol != '',]
ids=ids[ids$probe_id %in% rownames(dat),]
dat[1:4,1:4]
dat=dat[ids$probe_id,]
ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#对ids$symbol按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果s
dat=dat[ids$probe_id,] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
dat[1:4,1:4] #保留每个基因ID第一次出现的信息
mat=dat
mat[1:4,1:4]
print(fivenum(mat['GAPDH',]))
print(fivenum(mat['ACTB',]))
tp=b[match(rownames(mat),b$gene),7]
print( tail(sort(table(tp))) )
pd_mat=mat[tp=='protein_coding',]
non_mat=mat[tp !='protein_coding',]
save(pd_mat,non_mat,
file = file.path('Rdata', paste0(pro,'Rdata')))
})
###### step2: 合并全部癌症表达量矩阵,自己走PCA,tSNE,DBSCAN ######
fs=list.files('Rdata/',pattern = 'htseq_counts')
fs
# 首先加载每个癌症的蛋白编码基因表达量矩阵
# 成为一个 list
dat_list = lapply(fs, function(x){
# x=fs[1]
pro=gsub('.htseq_counts..Rdata','',x)
print(pro)
load(file = file.path('Rdata/',x))
dat=log2(edgeR::cpm(pd_mat)+1)
return(dat)
})
tmp=table(unlist(lapply(dat_list, rownames)))
cg=names(tmp)[tmp==33]
length(cg)
# 在33个癌症都存在的蛋白编码基因是一万六千个
all_tcga_matrix=do.call(cbind,
lapply(dat_list, function(x){
x[cg,]
}))
dim(all_tcga_matrix)
group=rep(gsub('TCGA-','',gsub('.htseq_counts..Rdata','',fs)),
unlist(lapply(dat_list, ncol)))
table(group)
# 为了加快运算速度,我们取top1000基因即可,按照sd派系
cg=names(tail(sort(apply(all_tcga_matrix,1,sd)),1000))
# 首先查看pca情况
library("FactoMineR")
library("factoextra")
dat.pca <- PCA(t(all_tcga_matrix[cg,]), graph = FALSE)
fviz_pca_ind(dat.pca,
repel =T,
geom.ind = "point",
col.ind = group,
# addEllipses = TRUE,
ggtheme = theme_minimal(),
legend.title = "Groups"
)
plot(dat.pca$ind$coord[,1:2],
col=rainbow(33)[as.numeric(as.factor(group))] )
library(Rtsne)
tsne_out <-Rtsne(t(all_tcga_matrix[cg,]),
initial_dims=30,verbose=FALSE,
check_duplicates=FALSE,
perplexity=27, dims=2,max_iter=5000)
plot(tsne_out$Y,
pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2",
col=rainbow(33)[as.numeric(as.factor(group))])
# 比较kmeans和dbscan的差异
library(gplots)
kmeans_cluster_tsne=kmeans(tsne_out$Y,centers = 33)$clust
library(dbscan)
dbscan_cluster_tsne=dbscan(tsne_out$Y,eps=3.1)$cluster
balloonplot(table(dbscan_cluster_tsne,
kmeans_cluster_tsne))
balloonplot(table(dbscan_cluster_tsne,
group))
plot(tsne_out$Y,col=kmeans_cluster_tsne,
pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
plot(tsne_out$Y,col=dbscan_cluster_tsne,
pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
library(RColorBrewer)
plot(tsne_out$Y,col=kmeans_cluster_tsne,
pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
plot(tsne_out$Y,col=dbscan_cluster_tsne,
pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")
###### step3: 合并全部癌症表达量矩阵,走seurat ######
fs=list.files('Rdata/',pattern = 'htseq_counts')
fs
# 首先加载每个癌症的蛋白编码基因表达量矩阵
# 成为一个 list
dat_list = lapply(fs, function(x){
# x=fs[1]
pro=gsub('.htseq_counts..Rdata','',x)
print(pro)
load(file = file.path('Rdata/',x))
# 因为要走单细胞 seurat流程,所以无需处理表达量矩阵
# 直接最原始的counts即可
# dat=log2(edgeR::cpm(pd_mat)+1)
dat=pd_mat
return(dat)
})
tmp=table(unlist(lapply(dat_list, rownames)))
cg=names(tmp)[tmp==33]
length(cg)
# 在33个癌症都存在的蛋白编码基因是一万六千个
all_tcga_matrix=do.call(cbind,
lapply(dat_list, function(x){
x[cg,]
}))
dim(all_tcga_matrix)
all_tcga_matrix[1:4,1:2]
group=rep(gsub('TCGA-','',gsub('.htseq_counts..Rdata','',fs)),
unlist(lapply(dat_list, ncol)))
table(group)
library(Seurat)
sce=CreateSeuratObject(counts = all_tcga_matrix)
sce
group=as.data.frame(group)
rownames(group)=colnames(sce)
sce=AddMetaData(sce,group)
table(sce$group)
# 这里并不是真正的单细胞表达量矩阵,所以无需过滤
sce.filt=sce
dim(sce.filt)
sce.filt <- NormalizeData(sce.filt, normalization.method = "LogNormalize",
scale.factor = 10000)
GetAssay(sce.filt,assay = "RNA")
sce.filt = FindVariableFeatures(sce.filt)
sce.filt = ScaleData(sce.filt, vars.to.regress = c("nFeature_RNA" ))
sce.filt = RunPCA(sce.filt, npcs = 20)
sce.filt = RunTSNE(sce.filt, npcs = 20)
sce.filt = RunUMAP(sce.filt, dims = 1:10)
sce=sce.filt
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.2)
table(sce@meta.data$RNA_snn_res.0.2)
library(patchwork)
p1=DimPlot(sce,reduction = "tsne",label=T,repel = T,
group.by ='group')
p2=DimPlot(sce,reduction = "umap",label=T,repel = T,
group.by ='group')
p1+p2
library(patchwork)
p1=DimPlot(sce,reduction = "tsne",label=T,repel = T )
p2=DimPlot(sce,reduction = "umap",label=T,repel = T )
p1+p2
library(gplots)
balloonplot(table(sce$seurat_clusters,
sce$group))
# 接下来对seurat默认的分群进行差异分析
pro='basic_seurat'
table(Idents(sce))
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'_sce.markers.csv'))
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)
ggsave(filename=paste0(pro,'_sce.markers_heatmap.pdf'),width = 18,height = 21)
p <- DotPlot(sce, features = unique(top10$gene),
assay='RNA' ) + coord_flip()
p
ggsave(paste0(pro,'-DotPlot_check_top10_markers_by_clusters.pdf'),width = 18,height = 21)
library(dplyr)
top3 <- sce.markers %>% group_by(cluster) %>% top_n(3, avg_log2FC)
DoHeatmap(sce,top3$gene,size=3)
ggsave(paste0(pro,'-DoHeatmap_check_top3_markers_by_clusters.pdf'),width = 18,height = 11)
p <- DotPlot(sce, features = unique(top3$gene),
assay='RNA' ) + coord_flip()
p
ggsave(paste0(pro,'-DotPlot_check_top3_markers_by_clusters.pdf'),width = 18,height = 11)
save(sce.markers,file=paste0(pro,file = '-sce.markers.Rdata'))
save(sce,file = 'sce.Rdata')
genes_to_check = c('PTPRC', 'CD3D', 'CD3E', 'CD4','CD8A','CD19', 'CD79A', 'MS4A1' ,
'IGHG1', 'MZB1', 'SDC1',
'CD68', 'CD163', 'CD14',
'TPSAB1' , 'TPSB2', # mast cells,
'RCVRN','FPR1' , 'ITGAM' ,
'C1QA', 'C1QB', # mac
'S100A9', 'S100A8', 'MMP19',# monocyte
'LAMP3', 'IDO1','IDO2',## DC3
'CD1E','CD1C', # DC2
'KLRB1','NCR1', # NK
'FGF7','MME', 'ACTA2',
'PECAM1', 'VWF',
'MKI67','TOP2A',
'DCN', 'LUM', 'GSN' , ## mouse PDAC fibo
'EPCAM' , 'KRT19', 'PROM1', 'ALDH1A1' )
p <- DotPlot(sce, features = genes_to_check,
assay='RNA' ) + coord_flip()
p
#
# ggsave(plot=p, filename="check_marker_by_celltype.pdf")
# table(sce@meta.data$celltype,sce@meta.data$seurat_clusters)
#
#
# DimPlot(sce, reduction = "umap", group.by = "celltype",label = T)
# ggsave('umap_by_celltype.pdf')
#
# library(patchwork)
# th=theme(axis.text.x = element_text(angle = 45,
# vjust = 0.5, hjust=0.5))
# p_all_markers=DotPlot(sce, features = genes_to_check,
# assay='RNA' ,group.by = 'celltype' ) + coord_flip()+ th
# p_umap=DimPlot(sce, reduction = "umap", group.by = "celltype",label = T)
# p_all_markers+p_umap
# ggsave('markers_umap_by_celltype.pdf',width = 13)
# phe=sce@meta.data
# save(phe,file = 'phe_by_markers.Rdata')
#
#
#
###### step4: 不同癌症的差异难道大于其与正常对照差异吗 ? ######
library(Seurat)
load(file = 'sce.Rdata')
gp=substring(colnames(sce),14,15)
table(gp)
sce@meta.data$gp=gp
library(gplots)
balloonplot(table(sce@meta.data$gp,sce@meta.data$group))
library(patchwork)
p1=DimPlot(sce,reduction = "tsne",label=T,repel = T,
group.by ='gp')
p2=DimPlot(sce,reduction = "umap",label=T,repel = T,
group.by ='gp')
p1+p2
cl=c('#7fc97f','#beaed4','#fdc086','#ffff99','#386cb0','#f0027f','#bf5b17')
p1=DimPlot(sce,reduction = "tsne",label=T,repel = T,
cols = cl,
group.by ='gp')
p2=DimPlot(sce,reduction = "umap",label=T,repel = T,
cols = cl,
group.by ='gp')
p1+p2
library(gplots)
balloonplot(table(sce$seurat_clusters,
sce$gp))
balloonplot(table(sce$seurat_clusters,
sce$group))
# 尝试使用harmony 抹去癌症差异
# 参考;https://mp.weixin.qq.com/s/5ivvtVFwCutUpJkqRgXYow
table( sce$group)
library(harmony)
seuratObj <- RunHarmony(sce, "group")
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj, dims = 1:15,
reduction = "harmony")
DimPlot(seuratObj,reduction = "umap",label=T ) +NoLegend()
library(cowplot)
p1.compare=plot_grid(ncol = 3,
DimPlot(sce, reduction = "pca", group.by = "group")+NoAxes() +NoLegend()+ggtitle("PCA"),
DimPlot(sce, reduction = "tsne", group.by = "group")+NoAxes() +NoLegend()+ggtitle("tSNE"),
DimPlot(sce, reduction = "umap", group.by = "group")+NoAxes() +NoLegend()+ggtitle("UMAP"),
DimPlot(seuratObj, reduction = "pca", group.by = "group")+NoAxes() +NoLegend()+ggtitle("PCA"),
DimPlot(seuratObj, reduction = "tsne", group.by = "group")+NoAxes() +NoLegend()+ggtitle("tSNE"),
DimPlot(seuratObj, reduction = "umap", group.by = "group")+NoAxes() +NoLegend()+ggtitle("UMAP")
)
p1.compare
ggsave(plot=p1.compare,filename="before_and_after_inter_cancerType.pdf")
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",dims = 1:15)
sce <- FindClusters(sce, resolution = 0.5)
table(sce@meta.data$seurat_clusters)
DimPlot(sce,reduction = "umap",label=T)
ggsave(filename = 'harmony_umap_recluster_by_0.5.pdf')
p1=DimPlot(sce,reduction = "umap",label=T,
group.by = 'group')
p1
ggsave(filename = 'harmony_umap_sce_recluster_by_cancerType.pdf')
cl=c('#7fc97f','#beaed4','#fdc086','#ffff99','#386cb0','#f0027f','#bf5b17')
p2=DimPlot(sce,reduction = "umap",label=T,
cols = cl,
group.by = 'gp')
p2
ggsave(filename = 'harmony_umap_sce_recluster_by_position.pdf')
library(patchwork)
p1+p2
# 接下来对seurat默认的分群进行差异分析
pro='harmony_seurat-0.5'
table(Idents(sce))
sce.markers <- FindAllMarkers(object = sce, only.pos = TRUE, min.pct = 0.25,
thresh.use = 0.25)
DT::datatable(sce.markers)
write.csv(sce.markers,file=paste0(pro,'_sce.markers.csv'))
library(dplyr)
top10 <- sce.markers %>% group_by(cluster) %>% top_n(10, avg_log2FC)
DoHeatmap(sce,top10$gene,size=3)
ggsave(filename=paste0(pro,'_sce.markers_heatmap.pdf'),width = 18,height = 21)
p <- DotPlot(sce, features = unique(top10$gene),
assay='RNA' ) + coord_flip()
p
ggsave(paste0(pro,'-DotPlot_check_top10_markers_by_clusters.pdf'),width = 18,height = 21)
library(dplyr)
top3 <- sce.markers %>% group_by(cluster) %>% top_n(3, avg_log2FC)
DoHeatmap(sce,top3$gene,size=3)
ggsave(paste0(pro,'-DoHeatmap_check_top3_markers_by_clusters.pdf'),width = 18,height = 11)
p <- DotPlot(sce, features = unique(top3$gene),
assay='RNA' ) + coord_flip()
p
ggsave(paste0(pro,'-DotPlot_check_top3_markers_by_clusters.pdf'),width = 18,height = 11)
save(sce.markers,file=paste0(pro,file = '-sce.markers.Rdata'))
load(file='seurat_results/harmony_seurat-0.5-sce.markers.Rdata')
table(sce.markers$cluster)
library(org.Hs.eg.db)
library(clusterProfiler)
ids=bitr(sce.markers$gene,'SYMBOL','ENTREZID','org.Hs.eg.db')
sce.markers=merge(sce.markers,ids,by.x='gene',by.y='SYMBOL')
gcSample=split(sce.markers$ENTREZID, sce.markers$cluster)
gcSample # entrez id , compareCluster
xx <- compareCluster(gcSample, fun="enrichKEGG",
organism="hsa", pvalueCutoff=0.05)
p=dotplot(xx)
p+ theme(axis.text.x = element_text(angle = 45,
vjust = 0.5, hjust=0.5))
p
ggsave('seurat_results/harmony_seurat-0.5-compareCluster-enrichKEGG.pdf',width = 15)
library(gplots)
balloonplot(table(sce$seurat_clusters,
sce$gp))
balloonplot(table(sce$seurat_clusters,
sce$group))
# 可以看到 0,3,6,10是核糖体富集
#
###### step5: ######
1
https://gitee.com/jmzeng/pan-cancer-expression.git
git@gitee.com:jmzeng/pan-cancer-expression.git
jmzeng
pan-cancer-expression
pan-cancer-expression
master

搜索帮助