library(Seurat)
library(Matrix)
library(DoubletFinder)
library(cowplot)
library(dplyr)
library(reshape2)
library(ggplot2)
library(harmony)
library(reticulate)
library(rafalib)
library(clustree)
library(edgeR)
library(MAST)
library(ggpubr)
library(enrichR)
library(msigdbr)
library(fgsea)
library(DropletUtils)
library(org.Mm.eg.db)
library(data.table)
setwd("~/Projects/scrna_ex/")

Part 2 - QC

knitr::opts_knit$set(root.dir = '~/Projects/scrna_ex/')
alldata = readRDS("./results/seurat_part1.rds")
# Get mitocarta3.0 data
#https://www.broadinstitute.org/mitocarta/mitocarta30-inventory-mammalian-mitochondrial-proteins-and-pathways
mouse_mitocarta3_file = "./data/Mouse.MitoCarta3.0.tsv"
mmc3 = data.frame(fread(mouse_mitocarta3_file, header=T), check.names=F)
mmc3 = mmc3[, c("MouseGeneID", "Symbol", "EnsemblGeneID")]
mito_genes = mmc3$Symbol
features = alldata@assays$RNA@counts@Dimnames[[1]]
mito_genes = mito_genes[mito_genes %in% features]

# Get annotations for ribo and hemoglobin. Skip, this info can be retrieve in the gene symbols.
#my_keys = keys(org.Mm.eg.db)
#my_annotations = select(org.Mm.eg.db,
#       keys = my_keys,
#       columns=c("ENSEMBL","SYMBOL","GENENAME"),
#       keytype="ENTREZID")
#ribo_genes = my_annotations[grepl("^Rn5|^Rn28|^Rn18|^Rn7", my_annotations$SYMBOL),]$ENSEMBL
#ribo_genes = ribo_genes[!is.na(ribo_genes)]

# Obviously, the mm10 10X genomics release does not contain rRNA, and Hbb genes. If you want those you would
# need to run the SingleCell pipeline using the unaltered ENCODE reference for instance.
alldata <- PercentageFeatureSet(alldata, pattern=NULL, mito_genes, col.name = "percent_mito")
alldata <- PercentageFeatureSet(alldata, "^Hbb-", col.name = "percent_hb")
alldata <- PercentageFeatureSet(alldata, "^Rn5|^Rn28|^Rn18|^Rn7", col.name = "percent_ribo")

feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
VlnPlot(alldata, group.by = "orig.ident", features = feats, pt.size = 0.0, ncol = 3) +
  NoLegend()

Here the depletion of rRNA reads or polyA selection looks to have work quite well. Let’s then look at the correlation between read counts and % mitochondrial reads and % ribosomal reads.

plot_grid(
  FeatureScatter(alldata, "nCount_RNA", "nFeature_RNA", group.by = "orig.ident", pt.size = 0.5),
  FeatureScatter(alldata, "nCount_RNA", "percent_mito", group.by = "orig.ident", pt.size = 0.5),
  FeatureScatter(alldata, "nCount_RNA", "percent_ribo", group.by = "orig.ident", pt.size = 0.5),
  ncol=2)

# filter cells with low amount of reads as well as 
# genes that are present in at least a certain amount of cells. 
selected_c <- WhichCells(alldata, expression = nFeature_RNA > 200)
selected_f <- rownames(alldata)[Matrix::rowSums(alldata) > 3]
data.filt <- subset(alldata, features = selected_f, cells = selected_c)
dim(data.filt)
## [1] 17861  6000

See which genes captured the most reads

## Warning in melt(as.matrix(t(C[most_expressed, ]))): The melt generic in
## data.table has been passed a matrix and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(as.matrix(t(C[most_expressed, ]))). In the next
## version, this warning will become an error.

##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   9511286  508.0   14601642  779.9  14601642  779.9
## Vcells 136075337 1038.2  405273616 3092.0 506590951 3865.0

Filter out the mitochondrial genes

# So here, basically keep all cells having less than 15 % of mito genes content.
selected_mito <- WhichCells(data.filt, expression = percent_mito < 15)
# If rRNA genes would have been present in the gene model used for alignment
# we would have removed cells with high content of rRNA as well. In the current dataset, this is not a problem.
selected_ribo <- WhichCells(data.filt, expression = percent_ribo > 0.0001)

# and subset the object to only keep those cells
data.filt <- subset(data.filt, cells = selected_mito)
data.filt <- subset(data.filt, cells = selected_ribo)

dim(data.filt)
## [1] 17861  5369
table(data.filt$orig.ident)
## 
## cd69_1 cd69_2   wt_1   wt_2 
##   1193   1379   1436   1361
# We filtered out approx 50-350 cells per sample
# Plot filtered QC
# Lets plot the same QC-stats another time.
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo", "percent_hb")
VlnPlot(data.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 3) +
  NoLegend()

Assign sex values.

knitr::opts_knit$set(root.dir = '~/Projects/scrna_ex/')
genes_file = "./data/chromoGeneInfo.tab"
genes_table = read.table(genes_file, sep="\t", header=T)
genes_table <- genes_table[genes_table$V2 %in% rownames(alldata),]

chrY.gene = genes_table[genes_table$V4 == "chrY",]$V2
alldata$pct_chrY = colSums(alldata@assays$RNA@counts[chrY.gene, ])/colSums(alldata@assays$RNA@counts)
chrX.gene = genes_table[genes_table$V4 == "chrX",]$V2
alldata$pct_chrX = colSums(alldata@assays$RNA@counts[chrX.gene, ])/colSums(alldata@assays$RNA@counts)
FeatureScatter(alldata, feature1 = "pct_chrX", feature2 = "pct_chrY")

VlnPlot(alldata, features = c("pct_chrX", "pct_chrY"))

The X chromosomes gene counts look okay. However, it is unclear why Y chromosomes seem to be less present. It is not clear if this is statistically significant. Cd69 cells seem to have more variability in chrY gene counts. Would have to validate if this is statistically significant.

#Predict doublets
data.filt = NormalizeData(data.filt)
data.filt = FindVariableFeatures(data.filt, verbose = F)
data.filt = ScaleData(data.filt, vars.to.regress = c("nFeature_RNA", "percent_mito"),
                      verbose = F)
data.filt = RunPCA(data.filt, verbose = F, npcs = 20)
data.filt = RunUMAP(data.filt, dims = 1:10, verbose = F)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# define the expected number of doublet cellscells.
nExp <- round(ncol(data.filt) * 0.023)  # expect 4% doublets
data.filt <- doubletFinder_v3(data.filt, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
## Le chargement a nécessité le package : fields
## Le chargement a nécessité le package : spam
## Spam version 2.9-1 (2022-08-07) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attachement du package : 'spam'
## L'objet suivant est masqué depuis 'package:stats4':
## 
##     mle
## L'objet suivant est masqué depuis 'package:Matrix':
## 
##     det
## Les objets suivants sont masqués depuis 'package:base':
## 
##     backsolve, forwardsolve
## Le chargement a nécessité le package : viridis
## Le chargement a nécessité le package : viridisLite
## 
## Try help(fields) to get started.
## Le chargement a nécessité le package : KernSmooth
## KernSmooth 2.23 chargé
##  Copyright M. P. Wand 1997-2009
## [1] "Creating 1790 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## Centering and scaling data matrix
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(data.filt@meta.data)[grepl("DF.classification", colnames(data.filt@meta.data))]
cowplot::plot_grid(ncol = 2, DimPlot(data.filt, group.by = "orig.ident") + NoAxes(),
                   DimPlot(data.filt, group.by = DF.name) + NoAxes())

#We should expect that two cells have more detected genes than a single cell, lets check if our predicted doublets also have more detected genes in general.
VlnPlot(data.filt, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)

# remove doublets
data.filt = data.filt[, data.filt@meta.data[, DF.name] == "Singlet"]
dim(data.filt)
## [1] 17861  5246
# Save data
# Finally, lets save the QC-filtered data for further analysis. Create output directory results and save data to that folder.
dir.create("data/results", showWarnings = F)
saveRDS(data.filt, "./results/seurat_part2.rds")