Codes in paper

10X single-cell RNA sequencing QC (known sample SNP)

January 28, 2021February 21, 2021 by Yuwei Liao

Introduction

In this paper, single-cell 10X RNA sequencing was performed on a mixture of a variety of known cell lines to study the interaction mode between polyclones. Here we introduce the operation of filtering doublets and low quality cells in single cell sequencing.

Code explanation

Load the required tool function

library(mclust)
library(ggplot2)

# load counts matrix, genes, and cell identities for a given experiment
load_sc_exp <- function(data_folder) {
  rMat <- Matrix::readMM(file.path(data_folder, 'matrix.mtx')) %>% as.matrix()
  genes <- readr::read_tsv(file.path(data_folder, 'genes.tsv'), col_names = F) %>% 
    magrittr::set_colnames(c('Ensembl_ID', 'Gene_Symbol'))
  barcodes <- readr::read_tsv(file.path(data_folder, 'barcodes.tsv'), col_names = F) 
  colnames(rMat) <- barcodes$X1
  rownames(rMat) <- genes$Ensembl_ID
  num_CLs <- ncol(rMat)
  num_genes <- nrow(rMat)
  classification <- barcodes
  if (file.exists(file.path(data_folder, 'classifications.csv'))) {
    classification <- readr::read_csv(file.path(data_folder, 'classifications.csv')) %>% as.data.frame()
  }
  cat(sprintf('Loaded matrix with %d CLs and %d genes\n', num_CLs, num_genes))

  return(list(counts = rMat, gene_info = genes, cell_info = classification))
}

# convert gene identifiers from ensembl IDs to hugo symbols
convert_to_hugo_symbols <- function (dat) {
  ensemble_to_hugo <- dat$gene_info$Gene_Symbol %>% magrittr::set_names(dat$gene_info$Ensembl_ID)
  rownames(dat$counts) %<>% ensemble_to_hugo[.] %>% magrittr::set_names(NULL)
  dat$counts <- dat$counts[!is.na(rownames(dat$counts)), ]
  dat$counts <- dat$counts[!duplicated(rownames(dat$counts)), ]
  return(dat)
}

# input data object containing the counts, cell info (cell line classifications), and gene info
# for the experiment
create_seurat_obj <- function(dat, use_symbols = TRUE) {
  if(use_symbols) {
    dat <- dat %>%
      convert_to_hugo_symbols() #convert genes to hugo symbols (and only keep unique hugo symbol genes)
  }
  rownames(dat$cell_info) <- dat$cell_info$barcode

  #make into Seurat object
  seuObj <- Seurat::CreateSeuratObject(dat$counts, 
                                       min.cells = 0,
                                       min.features = 0,
                                       meta.data = dat$cell_info)

  #make singlet classifications the cell identifiers
  seuObj <- Seurat::SetIdent(seuObj, value = [email protected]$singlet_ID)

  #store gene info here
  seuObj@misc <- dat$gene_info

  #add mitochondrial gene fraction
  seuObj<- Seurat::PercentageFeatureSet(seuObj, pattern = "^MT-", col.name = 'percent.mito')

  #add cellular detection rate (fraction of genes detected in a given cell)
  [email protected]$cell_det_rate <- seuObj$nFeature_RNA/nrow(seuObj@assays$RNA@counts)

  return(seuObj)
}

# begin adding cell quality classifications
# classify cells with too few SNPs or abnormal mitochondrial read percentages as low quality
filter_low_quality_cells <- function(seuObj, min_SNPs = 50, max_mito_perc = 25, min_mito_perc=1) {

  #too few SNPs
  [email protected] <- [email protected] %>%
    dplyr::mutate(cell_quality = ifelse(num_SNPs < min_SNPs, 'low_quality', 'normal'))

  #abnormal mitochondrial read %
  [email protected] <- [email protected] %>%
    dplyr::mutate(cell_quality = ifelse(percent.mito > max_mito_perc | percent.mito < min_mito_perc, 'low_quality', cell_quality))

  return(seuObj)
}

# run Seurat methods, excluding low quality cells
process_Seurat_obj <- function(seuObj, n_pcs = 50, cluster_k =10, clust_res = 1, plot = TRUE) {

  all_meta <- [email protected]
  rownames(all_meta) <- all_meta$barcode

  rownames([email protected]) <- [email protected]$barcode

  seuObj <- Seurat::SubsetData(seuObj, cells = dplyr::filter([email protected], cell_quality == 'normal')$barcode)
  # run Seurat methods to get gene expression clusters
  seuObj <- Seurat::NormalizeData(object = seuObj, 
                                  normalization.method = "LogNormalize", 
                                  scale.factor = 1e5, verbose = FALSE)

  seuObj <- Seurat::ScaleData(object = seuObj, vars.to.regress = c(), verbose = FALSE)

  seuObj <- Seurat::FindVariableFeatures(object = seuObj,
                                         nfeatures = 5000,
                                         do.plot = FALSE,
                                         selection.method = 'vst', verbose = FALSE)

  seuObj <- Seurat::RunPCA(object = seuObj,
                           features = Seurat::VariableFeatures(seuObj),
                           seed.use = 1,
                           npcs = n_pcs,
                           verbose = FALSE)

  seuObj <- Seurat::RunTSNE(object = seuObj,
                            dims = 1:n_pcs,
                            check_duplicates = FALSE,
                            seed.use = 1,
                            perplexity = 30,
                            verbose = FALSE)  

  seuObj <- Seurat::FindNeighbors(seuObj, reduction = 'pca',
                                  dims = 1:n_pcs,
                                  k.param = cluster_k, 
                                  force.recalc = TRUE,
                                  verbose = FALSE)
  seuObj <- Seurat::FindClusters(seuObj, resolution = clust_res, random.seed = 1, verbose = FALSE)

  if(plot) {
    gg <- Seurat::DimPlot(seuObj, group.by = 'seurat_clusters') + Seurat::NoLegend()
    print(gg)
  }

  all_meta <- dplyr::left_join(all_meta, [email protected][,c('barcode', 'RNA_snn_res.1','seurat_clusters')])
  [email protected] <- all_meta
  return(seuObj)

}

# identify low quality 'cells', which may be from empty droplets
empty_droplet_clusters <- function(seuObj, dev_thresh = 0.3, plot = TRUE) {
  df <- Seurat::Embeddings(seuObj, reduction = 'tsne') %>% as.data.frame() %>%
    tibble::rownames_to_column(var = 'barcode') %>%
    dplyr::left_join([email protected] %>% 
                       dplyr::mutate(max_dev = singlet_dev + doublet_dev_imp), by='barcode')

  #compute stats for each cluster
  clust_stats <- plyr::ddply(df, .(seurat_clusters), function(ss) {
    data_frame(n_cells = nrow(ss),
               med_doub = median(ss$doublet_dev_imp, na.rm=T),
               med_dev = median(ss$max_dev, na.rm=T))
  }) %>% dplyr::mutate(
    clust_type = ifelse(med_dev <= dev_thresh, 'empty_droplet', 'normal')
  )

  empty_droplets <- df$barcode[which((clust_stats$clust_type %>% rlang::set_names(clust_stats$seurat_clusters) %>%
                                        .[df$seurat_clusters]) == 'empty_droplet')]
  print(sprintf('Identified %d empty droplets', length(empty_droplets)))

  rownames([email protected]) <- [email protected]$barcode
  [email protected][empty_droplets,'cell_quality'] <-  'empty_droplet'

  if(plot) {
    gg <- Seurat::DimPlot(seuObj, group.by = 'cell_quality')
    print(gg)
  }
  return(seuObj)
}

# Identify doublets
classify_doublets <- function(seuObj, doub_prob_thresh=0.5, doublet_dev_offset = 0.001, plot = TRUE) {
  # use singlet deviance, doublet deviance improvement, and fraction of genes detected in a given cell
  # to identify cells that are doublets
  # doublet identification

  doublet_df <- Seurat::FetchData(seuObj, c('singlet_dev', 'doublet_dev_imp', 'cell_quality')) %>% 
    tibble::rownames_to_column(var = 'barcode') %>% 
    dplyr::filter(cell_quality %in% c('normal', 'doublet')) %>% 
    dplyr::mutate(log_doublet_imp = log10(doublet_dev_imp + doublet_dev_offset)) %>% 
    dplyr::select(-doublet_dev_imp) 

  # handle cells where the doublet model failed by treating them as singlets.
  fail_doublet_model <- doublet_df %>% 
    dplyr::filter(is.na(log_doublet_imp)) %>% 
    .[['barcode']]
  print(sprintf('%d cells with failed doublet models treated as singlets', length(fail_doublet_model)))

  doublet_df %<>% dplyr::filter(!(barcode %in% fail_doublet_model))

  # sfit GMM
  gmm_output <- mclust::Mclust(as.matrix(doublet_df[, c('log_doublet_imp', 'singlet_dev')]), G = 2, verbose = FALSE, prior = mclust::priorControl(shrinkage = 0))

  doublet_cluster <- which.max(gmm_output$parameters$mean['log_doublet_imp', ])
  singlet_cluster <- which.min(gmm_output$parameters$mean['log_doublet_imp', ])
  doublet_df$doublet_prob <- gmm_output$z[,doublet_cluster]
  doublet_cells <- doublet_df %>% 
    dplyr::filter(doublet_prob >= doub_prob_thresh) %>% 
    .[['barcode']]

  [email protected][doublet_cells, 'cell_quality'] <- 'doublet'

  if(plot) {
    g1 <- plot(gmm_output, what = 'classification')

    g2 <- ggplot2::ggplot(doublet_df, ggplot2::aes(singlet_dev, log_doublet_imp, color=doublet_prob)) + 
      ggplot2::geom_point() + 
      ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text = ggplot2::element_text(size=10)) +
      ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement")

    g3 <- ggplot2::ggplot(doublet_df %>% dplyr::mutate(is_doublet = barcode %in% doublet_cells), ggplot2::aes(singlet_dev, log_doublet_imp, color= is_doublet)) +
      ggplot2::geom_point() +
      ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text =ggplot2::element_text(size=10)) +
      ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement") +
      ggplot2::guides(color = ggplot2::guide_legend(title = 'doublet'))

    print(g1)
    print(g2)
    print(g3)
  }
  return(seuObj)
}

identify_low_confidence_cells <- function(seuObj, min_z_margin = 2, plot = TRUE) {
  df <- [email protected]

  [email protected] %<>% 
    dplyr::mutate(cell_quality = ifelse([email protected]$singlet_z_margin < min_z_margin & [email protected]$cell_quality == 'normal', 'low_confidence', cell_quality))

  if(plot) {
    cell_class_confidence <- ggplot2::ggplot(df %>% filter(cell_quality == 'normal'),
                                             ggplot2::aes(singlet_z_margin)) + 
      ggplot2::geom_density() + 
      ggplot2::geom_vline(xintercept = min_z_margin, linetype = 'dashed')

    print(cell_class_confidence)
    gg <- ggplot2::ggplot([email protected], ggplot2::aes(singlet_dev, doublet_dev_imp, fill=cell_quality)) + 
      ggplot2::geom_point(pch = 21, size = 1.5, color = 'white', stroke = 0.1) + ggplot2::ggtitle('cell quality classifications')  
    print(gg)
  }

  return(seuObj)
}

To read in the data, the data is placed under a folder named data, not a single_cell_classification folder, which includes the output matrix.mtx, genes.tsv, barcodes.tsv of Cellranger, and a cell classification file identified by SNP. classifications.csv

  # run QC methods
  dat <- load_sc_exp(here::here('data'))
  seuObj <- create_seurat_obj(dat)
  num_CLs <- length(unique([email protected]$singlet_ID))

Set the parameters according to the number of sequenced cell lines, n_pcs is the number of principal components in principal component analysis, and clust_res is the resolution parameter of running FindCluster function

  # set parameters based on the number of cell lines in the experiment
  n_pcs <- num_CLs*2
  param_range <- 10
  if(num_CLs < 25+param_range) {
    clust_res <- 1
  } else if(num_CLs > 50-param_range & num_CLs < 50+param_range) {
    clust_res <- 2
  } else if(num_CLs > 100-param_range & num_CLs < 100+param_range) {
    clust_res <- 4
  } else {
    stop('Not implemented')
  }

Filter operation


  seuObj <- filter_low_quality_cells(seuObj)
  seuObj <- process_Seurat_obj(seuObj, n_pcs = n_pcs,  clust_res = clust_res)
  seuObj <- empty_droplet_clusters(seuObj)
  seuObj <- classify_doublets(seuObj)
  seuObj <- identify_low_confidence_cells(seuObj)

References

Kinker G S , Greenwald A C , Tal R , et al. Pan-cancer single-cell RNA-seq identifies recurring programs of cellular heterogeneity[J]. Nature Genetics, 2020, 52(11):1208-1218.

Original

https://github.com/broadinstitute/single_cell_classification

Recent Posts

Most Used Categories

The Code Search

10X single-cell RNA sequencing QC (known sample SNP)

Introduction

Code explanation

References

Original

Leave a Reply Cancel reply

Recent Posts

Most Used Categories

10X single-cell RNA sequencing QC (known sample SNP)

Introduction

Code explanation

References

Original

Leave a Reply Cancel reply

You may like

All codes for scRNA analysis in R

Significance test of PPI(protein–protein interaction) interface mutations.