Cell-type annotation of single cell transcriptomics data using marker based methods

Introduction

In the data processing protocols of scRNA-seq experiments, cell type identification is a vital step for subsequent analysis. Mainly, two types of strategies have been reported, e.g., cell-based and cluster-based annotation. The major challenge of cell-based strategy lies in the determination of cell types on each cluster as multiple cells with different types are present in one cluster.

Cluster-based methods use the pre-computed clusters to identify markers within each cluster. The markers are then use for cell type identification from reference databases which contain a plethora of manually curated marker to cell-type mappings.

Import functions

Please follow the instructions below

1. Check the appendix section below to install necessary packages
2. Refresh the kernel after installation is complete

suppressMessages(library(Seurat))
suppressMessages(library(anndata))
suppressMessages(library(dplyr))
suppressMessages(library(clustermole))

Defining custom functions to use

get_seurat_obj = function(adata) {

    # Get the expression matrix
    exprs <- t(as.matrix(adata$X))
    colnames(exprs) <- adata$obs_names
    rownames(exprs) <- adata$var_names
    # Create the Seurat object
    seurat <- CreateSeuratObject(exprs)
    # Set the expression assay
    seurat <- SetAssayData(seurat, "data", exprs)
    # Add observation metadata
    seurat <- AddMetaData(seurat, adata$obs)
    # Add embedding
    embedding_2 <- as.list(adata$obsm["X_tsne"])
    rownames(embedding_2$X_tsne) <- adata$obs_names
    colnames(embedding_2$X_tsne) <- c("tsne_1", "tsne_2")
    seurat[["tsne"]] <- CreateDimReducObject(embedding_2$X_tsne, key = "tsne_")

    return(seurat)

}

1. Querying Biomedical Molecular data

1.1 Install Polly Python

!sudo pip3 install polly-python --quiet

from polly.omixatlas import OmixAtlas
import pandas as pd
import os

LiverOmixAltasID = "1615965444377"

1.2 Connect to Polly OmixAtlas

Get Authentication Tokens

#REFRESH_TOKEN = "ENTER REFRESH TOKEN WITHIN THE DOUBLE QUOTES"
repo_client = OmixAtlas(REFRESH_TOKEN)

See a list of available Omix Atlases

pd.DataFrame.from_dict(repo_client.get_all_omixatlas()['data'])

	repo_name	repo_id	indexes	dataset_count	disease_count	diseases	organism_count	organisms	sources	datatypes	sample_count
0	liveromix_atlas	1615965444377	{'gct_metadata': 'liveromix_atlas_gct_metadata...	6760	761	[normal, carcinoma, hepatocellular, obesity, n...	22	[homo sapiens, mus musculus, rattus norvegicus...	[geo, lincs, tcga, metabolomics workbench, met...	[transcriptomics, mutation, metabolomics, sing...	1738079

1.3 Querying Single Cell data from Liver OmixAtlas

The publication provides the GEO dataset id that was used for the analysis. Using the GSE and GPL id the GEO datasets are queried on the Liver OmixAtlas.
The data is available in a structured anndata object in h5ad format

sc_normal_query = """select * FROM liveromix_atlas_files 
        WHERE dataset_id = 'GSE115469_GPL16791'
        AND kw_data_type = 'Single cell'"""

repo_client.query_metadata(sc_normal_query)

	dataset_id	dataset_source	disease	description	organism	tissue	file_type	authors	year	publication	kw_data_type	kw_cell_type	platform	kw_repo	kw_package	kw_key	kw_bucket	kw_filetype	kw_region	kw_location	kw_timestamp
0	GSE115469_GPL16791	GEO	Normal	Single cell RNA sequencing of human liver reve...	Homo sapiens	[liver]	h5ad	Sonya A. MacParland, Ian D McGilvray	2018	https://www.nature.com/articles/s41467-018-063...	Single cell	[]	10x	liveromix_atlas	liver_atlas/data	liver_atlas/data/SingleCell/GSE115469_GPL16791...	discover-prod-datalake-v1	h5ad	us-west-2	https://discover-prod-datalake-v1.s3-us-west-2...	1626622850659

dataset_id = list(repo_client.query_metadata(sc_normal_query)['dataset_id'])
dataset_id

['GSE115469_GPL16791']

Download data from liverOmix atlas

file_link = repo_client.download_data(LiverOmixAltasID, dataset_id[0])['data']
file_link

'https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537'

wget -O GSE115469_GPL16791.h5ad 'https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537'

--2021-07-22 07:12:54--  https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537
Resolving discover-prod-datalake-v1.s3.amazonaws.com (discover-prod-datalake-v1.s3.amazonaws.com)... 52.218.216.170
Connecting to discover-prod-datalake-v1.s3.amazonaws.com (discover-prod-datalake-v1.s3.amazonaws.com)|52.218.216.170|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 186223226 (178M) [binary/octet-stream]
Saving to: ‘GSE115469_GPL16791.h5ad’

GSE115469_GPL16791. 100%[===================>] 177.60M  30.7MB/s    in 6.3s

2021-07-22 07:13:01 (28.0 MB/s) - ‘GSE115469_GPL16791.h5ad’ saved [186223226/186223226]

2. Data

The single-cell dataset used is a map of the cellular landscape of the human liver using single-cell RNA sequencing. The dataset consists of transcriptional profiles of 8444 parenchymal and non-parenchymal cells obtained from the fractionation of fresh hepatic tissue from five human livers.

The original publication shows 12 distinct cell type populations identified using gene expression patterns, flow cytometry, and immunohistochemical examinations. However, the dataset deposited on Gene Expression Omnibus (GEO), did not contain the experimentally identified cell type annotations.

options(warn=-1)

adata.pred = read_h5ad('GSE115469_GPL16791.h5ad')

seurat = get_seurat_obj(adata.pred)

DimPlot(object = seurat, reduction = "tsne", group.by = "clusters", cols = DiscretePalette(20), label = TRUE)

3. Finding cluster biomarkers

A one vs all comparison is performed between clusters to identify marker genes.

# find markers for every cluster compared to all remaining cells, report only the positive ones
Idents(object = seurat) <- seurat@meta.data$clusters
liver.markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

head(liver.markers)

A data.frame: 6 × 7
	p_val	avg_log2FC	pct.1	pct.2	p_val_adj	cluster	gene
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<int>	<chr>
HSD11B1	0	0.7473840	0.896	0.207	0	1	HSD11B1
APOM	0	0.6909939	0.913	0.204	0	1	APOM
PON3	0	0.5102462	0.858	0.177	0	1	PON3
TTC36	0	0.4761022	0.827	0.193	0	1	TTC36
F10	0	0.4080488	0.583	0.104	0	1	F10
BCHE	0	0.3824872	0.749	0.156	0	1	BCHE

Plotting a heatmap showing the top 5 clusters containing largest number of cells

#Scaling data
seurat <- ScaleData(seurat)

Idents(object = seurat) <- seurat@meta.data$clusters

liver.markers.top <- liver.markers %>% group_by(cluster) %>%  top_n(n = 2 ,wt = avg_log2FC)
liver.markers.subset <- liver.markers %>% group_by(cluster) %>% filter(cluster == c(1,2,3,4,5)) %>% top_n(n = 2 ,wt = avg_log2FC)

seurat.subset <- subset(seurat, idents = c('1','2','3','4','5'))
DoHeatmap(seurat.subset, features = liver.markers.subset$gene)

Centering and scaling data matrix

4 . Automated cell type curation using biomarker genes

Marker gene database-based annotation takes advantage of cell type atlases. Literature- and scRNA-seq analysis-derived markers have been assembled into reference cell type hierarchies and marker lists. In this approach, basic scoring systems are used to ascribe cell types at the cluster level in the query dataset.

Type <- c()
clusters <- c()
for (i in seq(1,20, by=1)) {
    liver.markers.filt <- as.data.frame(liver.markers %>% filter(cluster == as.character(i)) %>% top_n(n = 25, wt = avg_log2FC))
    my_overlaps <- clustermole_overlaps(genes = liver.markers.filt$gene, species = "hs")
    type <- as.character(my_overlaps[1,5])
    Type <- c(Type, type)
    clusters <- c(clusters, i)
}


clust_map <- as.data.frame(cbind(clusters,Type))
head(clust_map)

A data.frame: 6 × 2
	clusters	Type
	<fct>	<fct>
1	1	Hepatocytes
2	2	DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_CD8_T_CELLS
3	3	Hepatocytes
4	4	AIZARANI_LIVER_C25_KUPFFER_CELLS_4
5	5	Hepatocytes
6	6	AIZARANI_LIVER_C11_HEPATOCYTES_1

Curating cell types from cluster based identifications

The cell type annotations predicted lacked harmonisation in cell names, since the algorithm used a marker-gene database search which queries markers from multiple databases each with database specific vocabulary. Therefore, in order for the cell types to cluster, markers corresponding to same cell type but different annotation must be curated to follow a common vocabulary

#The cell type names must be cleaned in order for them to be clustered together
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('NK_NKT_CELLS','NK Cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('B cell\\D+','B cell',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD71\\D+','Erythroid cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_NK_CELLS','NK Cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('KUPFFER_CELLS','Kupffer Cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('MVECS','Endothelial cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD8_T_CELLS','CD8 T cell',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD4_T_CELLS','CD4 T cell',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('AIZARANI_LIVER_C\\d+_','',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('_\\d+$','',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('HEPATOCYTES','Hepatocytes',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\bPlasma cell\\b','Plasma cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_PLASMA_CELLS','Plasma cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_PLASMA_CELL','Plasma cells',x))                               
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('EPCAM_POS_\\D+','EPCAM+',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_B_CELL','B cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('CUI_DEVELOPING_HEART_C3_FIBROBLAST_LIKE_CELL','Others',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('Erythrocytes_\\D+','Erythrocytes',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('FAN_EMBRYONIC_CTX_NSC','Others',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('Lymph_endothel_cell','Other endothelial cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_Epithelial_cells','Epithelial cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('HAY_BONE_MARROW_PRO_B','Others',x))


cell.type_clust <- aggregate(clusters ~ Type, clust_map, paste, collapse=",")
cell.type_clust

A data.frame: 12 × 2
Type	clusters
<chr>	<chr>
B cell	16
CD8 T cell	2
Cholangiocytes	17
Endothelial cells	11,13
Erythroid cells	19
Gamma delta T cells	18
Hepatic stellate cells	20
Hepatocytes	1,3,5,6,14,15
Kupffer Cells	4,10
LSECS	12
NK Cells	8,9
Plasma cells	7

rows <- rownames(seurat@meta.data)
seurat@meta.data = merge(seurat@meta.data, clust_map, by = 'clusters', all.x = TRUE)
rownames(seurat@meta.data) <- rows

DimPlot(object = seurat, reduction = "tsne", group.by = "Type", cols = DiscretePalette(12))

Appendix

Package installation

NOTE : For R packages, the cells must be run in the following order as shown below

!sudo R -e 'BiocManager::install(c("GSEABase", "GSVA", "singscore"))'

!sudo R -e 'install.packages(c("anndata", "clustermole"), repos = "https://cloud.r-project.org/")'