Codes in paper

A bar graph comparing the number of hypomethylated regions of the sample with the number of intersections of CpG islands, mutation load, and number of SVs

Legend

  A summary of the HMR (hypomethylation region) frequency of 100 independent mCRPC samples and the sample level of somatic mutations. The bar graph shows the HMR count within the genome feature, the HMR count that overlaps with CpG islands, shores and shelves (CpG overlap), the percentage of genomes with DNA copy number changes (CNA), somatic cell mutations per megabase (mutations per Mb) Number) and structural variance number (SV count). CMP samples are marked in blue. Samples with fewer HMR tumors on CpG islands, shores and shelves and CpG openseas (regions other than islands, shores and shelves) were designated as CpG methylator phenotype (CMP).

Code explanation
First get the required HMR count, CpG ovderlap, mutation load, SV count values
#source('~/mnt/data1/projects/WCDT_WGBS_2019/WCDT_WGBS/scripts/2019_05_15_prepare_environment.r')  #Load enviroment

# Plots Figure 1A

colstf <- c('CRPC_ARE','LOC_ARE','chip_FOXA1','chip_HOXB13','chip_ERG')
pcttotal=FALSE

output <- data.frame()
for( i in 1 : length(hmr) ) {
  print(i)
  sample <- toupper(names(hmr)[i])
  hmri <- hmr[[i]]

  totalhmrs <- length(hmri)
  output[sample,'Sample'] <- sample
  output[sample,'hmr'] <- totalhmrs

  divideby <- 1
  if(pcttotal) {
    divideby <- totalhmrs
  }

   #Calculate the TFBS, hmr count of each sample, including genebody, promoter, H3K27ac, H3K27me3, and CpG_island, CpG_shelf, CpG_shore
  in_tf <- rep(F,totalhmrs)
  for(j in 1:length(colstf)) {
    coltf <- colstf[j]
    in_tf <- in_tf | countOverlaps(hmri,tracks[[coltf]],ignore.strand=T)
  }

  in_genebody = countOverlaps(hmri,tracks[['genebody']],ignore.strand=T)>0
  in_promoter = countOverlaps(hmri,tracks[['promoter']],ignore.strand=T)>0
  in_h3k27ac = countOverlaps(hmri,tracks[['pca100_H3K27ac']],ignore.strand=T)>0
  in_h3k27me3 = countOverlaps(hmri,tracks[['pca100_H3K27me3']],ignore.strand=T)>0

  output[sample,'Promoter'] <- sum(in_promoter) / divideby
  output[sample,'Genebody'] <- sum(in_genebody & !in_promoter) / divideby
  output[sample,'TFBS_not_promoter'] <- sum(in_tf & !in_genebody & !in_promoter) / divideby
  output[sample,'H3K27ac_not_promoter_tfbs'] <- sum(in_h3k27ac & !in_tf & !in_genebody & !in_promoter) / divideby
  output[sample,'H3K27me3_not_others'] <- sum(in_h3k27me3 & !in_h3k27ac & !in_tf & !in_genebody & !in_promoter) / divideby

  promoter_oncogenes = countOverlaps(hmri,gr_oncogenes, ignore.strand=T)
  output[sample,'oncogene_promoter'] = sum(promoter_oncogenes>0)/divideby
  promoter_tsg = countOverlaps(hmri, gr_tsg, ignore.strand=T)
  output[sample,'TS_promoter'] = sum(promoter_tsg>0)/divideby

  in_cpg_island = countOverlaps(hmri,tracks[['cpg_island']],ignore.strand=T)>0
  in_cpg_shore = countOverlaps(hmri,tracks[['cpg_shore']],ignore.strand=T)>0
  in_cpg_shelf = countOverlaps(hmri,tracks[['cpg_shelf']],ignore.strand=T)>0

  output[sample,'CpG_island'] = sum(in_cpg_island)/divideby
  output[sample,'CpG_shore'] = sum(in_cpg_shore & !in_cpg_island)/divideby
  output[sample,'CpG_shelf'] = sum(in_cpg_shelf & !in_cpg_shore & !in_cpg_island)/divideby
}

# Screen samples based on tumor purity
#output <- output[rownames(tumorpurity)[tumorpurity[samples_wgbs,'Tumor.Purity.Comp']>80],]
output <- output[rownames(output) %in% sample_ids_wgbs,]

lvl <- output[order(output$hmr),'Sample']
output$Sample <- factor(output$Sample,levels=lvl)

if(pcttotal) {
  countsother <- 1
} else {
  countsother <- output$hmr
}
countsother <- output$hmr-(output$Promoter+output$Genebody+output$TFBS_not_promoter+
                             output$H3K27me3_not_others+output$H3K27ac_not_promoter_tfbs)

# Integrate the sub-categories into one category, Other, Promoter, Genebody, TFBS, H3K27ac, H3K27me3 belong to df1, CpG_island, CpG_shelf, CpG_shore belong to df3, df5 is cn and mutation load, df6 is sv
df1 <- rbind.data.frame(
  cbind.data.frame(sample=output$Sample,count=countsother,group='Other'),
  cbind.data.frame(sample=output$Sample,count=output$Promoter,group='Promoter'),
  cbind.data.frame(sample=output$Sample,count=output$Genebody,group='Genebody'),
  cbind.data.frame(sample=output$Sample,count=output$TFBS_not_promoter,group='TFBS'),
  cbind.data.frame(sample=output$Sample,count=output$H3K27ac_not_promoter_tfbs,group='H3K27ac'),
  cbind.data.frame(sample=output$Sample,count=output$H3K27me3_not_others,group='H3K27me3'))
df1$group <- factor(df1$group,levels=c('Other','H3K27me3','H3K27ac','TFBS','Genebody','Promoter'))

df3 <- rbind.data.frame(
  cbind.data.frame(sample=output$Sample,count=output$CpG_island,group='CpG_island'),
  cbind.data.frame(sample=output$Sample,count=output$CpG_shelf,group='CpG_shelf'),
  cbind.data.frame(sample=output$Sample,count=output$CpG_shore,group='CpG_shore'))
df3$group <- factor(df3$group,levels=c('CpG_shelf','CpG_shore','CpG_island'))

df5 <- cellsummary[lvl,]
df5$mutation_count <- df5$mutation_count / (HG38_SIZE / 1000000)
df5$sample <- factor(rownames(df5),levels=lvl)
df5$namecn <- '%Altered'
df5$namemut <- 'Mut/Mb'

df6slice <- manta_counts[lvl,]
df6slice$sample <- factor(rownames(df6slice),levels=lvl)
df6 <- rbind.data.frame(
  cbind.data.frame(sample=df6slice$sample,count=df6slice$BND,group='BND'),
  cbind.data.frame(sample=df6slice$sample,count=df6slice$DEL,group='DEL'),
  cbind.data.frame(sample=df6slice$sample,count=df6slice$DUP,group='DUP'),
  cbind.data.frame(sample=df6slice$sample,count=df6slice$INS,group='INS'),
  cbind.data.frame(sample=df6slice$sample,count=df6slice$INV,group='INV'))

df6$color = rep("black", 100)
df6$color[ match.idx( df6$sample, samples_cmp )$idx.A ] = "cornflowerblue"
Then we use the obtained data to draw
cols <- brewer.pal(n=5, name='Set1')  #配置色板
# 用plist存放ggplot对象,再用ggarrange进行排版并画图
plist <- list()
plist[[1]] <- ggplot(df1,aes(x=sample,y=count,fill=group))+theme_classic()+geom_bar(stat='identity')+
  theme(legend.position = "none", 
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + 
  ylab('HMR#')+
  scale_fill_manual(values=c('darkgray',cols))
plist[[2]] <- ggplot(df3,aes(x=sample,y=count,fill=group))+theme_classic()+geom_bar(stat='identity')+
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),legend.position = "none")+
  scale_fill_brewer(palette='Dark2')+ylab('CpG#')
plist[[3]] <- ggplot(df5,aes(x=sample,y=percent.copy.altered,fill=namecn))+theme_classic()+geom_bar(stat='identity')+
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),legend.position = "none")+
  scale_fill_manual(values='darkgrey')+ylab('% CN Altered')
plist[[4]] <- ggplot(df5,aes(x=sample,y=mutation_count,fill=namemut))+theme_classic()+geom_bar(stat='identity')+
  theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank(),legend.position = "none")+
  scale_fill_manual(values='darkgrey')+ylab('Mut/Mb')+scale_y_log10() 

plist[[5]] <- ggplot(df6,aes(x=sample,y=count,fill=group))+theme_classic()+geom_bar(stat='identity')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_text(angle=90,hjust=1,vjust=0.5,colour=df6$color),
        axis.ticks.x=element_blank(),
        legend.position = "none")+ 
  scale_fill_brewer(palette='Set2')+ylab('# SV')

ggarrange(plotlist=plist,
          ncol=1,
          nrow=length(plist),
          heights=c(3,1,1,1,2) ,
          align="v")
ggsave(fn_fig1a, height=8, width=14)
Reference

Zhao S G , Chen W S , Li H , et al. The DNA methylation landscape of advanced prostate cancer[J]. Nature Genetics, 2020.

Original code

https://github.com/DavidQuigley/WCDT_WGBS/blob/master/scripts/2019_05_15_WGBS_figure_1A.R

Leave a Reply

Your email address will not be published. Required fields are marked *