Skip to content

Cell-type annotation of single cell transcriptomics data using marker based methods

Introduction

In the data processing protocols of scRNA-seq experiments, cell type identification is a vital step for subsequent analysis. Mainly, two types of strategies have been reported, e.g., cell-based and cluster-based annotation. The major challenge of cell-based strategy lies in the determination of cell types on each cluster as multiple cells with different types are present in one cluster.

Cluster-based methods use the pre-computed clusters to identify markers within each cluster. The markers are then use for cell type identification from reference databases which contain a plethora of manually curated marker to cell-type mappings.

Import functions

Please follow the instructions below

1. Check the appendix section below to install necessary packages
2. Refresh the kernel after installation is complete

suppressMessages(library(Seurat))
suppressMessages(library(anndata))
suppressMessages(library(dplyr))
suppressMessages(library(clustermole))

Defining custom functions to use

get_seurat_obj = function(adata) {

    # Get the expression matrix
    exprs <- t(as.matrix(adata$X))
    colnames(exprs) <- adata$obs_names
    rownames(exprs) <- adata$var_names
    # Create the Seurat object
    seurat <- CreateSeuratObject(exprs)
    # Set the expression assay
    seurat <- SetAssayData(seurat, "data", exprs)
    # Add observation metadata
    seurat <- AddMetaData(seurat, adata$obs)
    # Add embedding
    embedding_2 <- as.list(adata$obsm["X_tsne"])
    rownames(embedding_2$X_tsne) <- adata$obs_names
    colnames(embedding_2$X_tsne) <- c("tsne_1", "tsne_2")
    seurat[["tsne"]] <- CreateDimReducObject(embedding_2$X_tsne, key = "tsne_")

    return(seurat)

}

1. Querying Biomedical Molecular data

1.1 Install Polly Python

!sudo pip3 install polly-python --quiet
from polly.omixatlas import OmixAtlas
import pandas as pd
import os

LiverOmixAltasID = "1615965444377"

1.2 Connect to Polly OmixAtlas

Get Authentication Tokens

#REFRESH_TOKEN = "ENTER REFRESH TOKEN WITHIN THE DOUBLE QUOTES"
repo_client = OmixAtlas(REFRESH_TOKEN)

See a list of available Omix Atlases

pd.DataFrame.from_dict(repo_client.get_all_omixatlas()['data'])
repo_name repo_id indexes dataset_count disease_count diseases organism_count organisms sources datatypes sample_count
0 liveromix_atlas 1615965444377 {'gct_metadata': 'liveromix_atlas_gct_metadata... 6760 761 [normal, carcinoma, hepatocellular, obesity, n... 22 [homo sapiens, mus musculus, rattus norvegicus... [geo, lincs, tcga, metabolomics workbench, met... [transcriptomics, mutation, metabolomics, sing... 1738079

1.3 Querying Single Cell data from Liver OmixAtlas

The publication provides the GEO dataset id that was used for the analysis. Using the GSE and GPL id the GEO datasets are queried on the Liver OmixAtlas.
The data is available in a structured anndata object in h5ad format

sc_normal_query = """select * FROM liveromix_atlas_files 
        WHERE dataset_id = 'GSE115469_GPL16791'
        AND kw_data_type = 'Single cell'"""

repo_client.query_metadata(sc_normal_query)
dataset_id dataset_source disease description organism tissue file_type authors year publication kw_data_type kw_cell_type platform kw_repo kw_package kw_key kw_bucket kw_filetype kw_region kw_location kw_timestamp
0 GSE115469_GPL16791 GEO Normal Single cell RNA sequencing of human liver reve... Homo sapiens [liver] h5ad Sonya A. MacParland, Ian D McGilvray 2018 https://www.nature.com/articles/s41467-018-063... Single cell [] 10x liveromix_atlas liver_atlas/data liver_atlas/data/SingleCell/GSE115469_GPL16791... discover-prod-datalake-v1 h5ad us-west-2 https://discover-prod-datalake-v1.s3-us-west-2... 1626622850659
dataset_id = list(repo_client.query_metadata(sc_normal_query)['dataset_id'])
dataset_id
['GSE115469_GPL16791']

Download data from liverOmix atlas

file_link = repo_client.download_data(LiverOmixAltasID, dataset_id[0])['data']
file_link
'https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537'
wget -O GSE115469_GPL16791.h5ad 'https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537'
--2021-07-22 07:12:54--  https://discover-prod-datalake-v1.s3.amazonaws.com/liver_atlas/data/SingleCell/GSE115469_GPL16791.h5ad?AWSAccessKeyId=ASIAVRYB5UBIA5FW2NFB&Signature=wQeC6muM1IPeU2RQGOWf7z2gaMU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjENf%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDQlEn4h5AZ3rKsV3RrcKVNwYuMCpB8CW4v0rYUyyCUUwIhALNparKajiikPLlUbV7tVAXgdOqRsCMy%2BEpBKeg8QtflKqoCCND%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQAhoMMzgxNzE5MjU3MTY4Igz90u%2BAalZnFYSf59wq%2FgErXZhA1mS24Pzmxiujn5CDBTadtH%2BpJ%2BC4gLUAjzk3eAFnQuKfZfMN6MNdTN81pvl3g3p9Cr1LLeiPF%2FxHNEdltiie79cRDd3dZF0JMUqbWUY7y6%2Fy9It2YluhF2JGAGf8PqPT9zadygDz%2FYRLzRryb6Xcka0t%2FD6ST6P%2BZ6UlOxplcslONUukwhWgaIdPhl4cwUSP7LCL41fJAxUCiEb8%2FugQMMPvLu62qFGfdQpG7bUFddfXqSuE3mST0EAlY5b64c46YjgbadhJd52e9uIowubqpMxVsCOt0VCpLFE54ITnrMnv9t0Qp4KsNJHBu%2F4%2Bq9%2BvfxX%2BFkitnHj7oTCvtOSHBjqZAVP5nfMsDhkbIXsOcKWxas6yjg5q9eZDOecxXzT%2BR0iRAcMLh6fVhYciG327iZOaq0NQ87LDxkgT3zY9f4g0EN4DSj3jE5O28lrSPyOzbbuB9rhzJuu1A3uaap00bTErXKquKUFM4cPgCpbADzIoQQLuzoX6JSOXWySZ%2BN%2FX0MMFa4eWiZAUZyDkoet9oY4%2B%2BcdkndKPQIrceg%3D%3D&Expires=1626941537
Resolving discover-prod-datalake-v1.s3.amazonaws.com (discover-prod-datalake-v1.s3.amazonaws.com)... 52.218.216.170
Connecting to discover-prod-datalake-v1.s3.amazonaws.com (discover-prod-datalake-v1.s3.amazonaws.com)|52.218.216.170|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 186223226 (178M) [binary/octet-stream]
Saving to: ‘GSE115469_GPL16791.h5ad’

GSE115469_GPL16791. 100%[===================>] 177.60M  30.7MB/s    in 6.3s

2021-07-22 07:13:01 (28.0 MB/s) - ‘GSE115469_GPL16791.h5ad’ saved [186223226/186223226]

2. Data

The single-cell dataset used is a map of the cellular landscape of the human liver using single-cell RNA sequencing. The dataset consists of transcriptional profiles of 8444 parenchymal and non-parenchymal cells obtained from the fractionation of fresh hepatic tissue from five human livers.

The original publication shows 12 distinct cell type populations identified using gene expression patterns, flow cytometry, and immunohistochemical examinations. However, the dataset deposited on Gene Expression Omnibus (GEO), did not contain the experimentally identified cell type annotations.

options(warn=-1)

adata.pred = read_h5ad('GSE115469_GPL16791.h5ad')

seurat = get_seurat_obj(adata.pred)

DimPlot(object = seurat, reduction = "tsne", group.by = "clusters", cols = DiscretePalette(20), label = TRUE)

3. Finding cluster biomarkers

A one vs all comparison is performed between clusters to identify marker genes.

# find markers for every cluster compared to all remaining cells, report only the positive ones
Idents(object = seurat) <- seurat@meta.data$clusters
liver.markers <- FindAllMarkers(seurat, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
head(liver.markers)
A data.frame: 6 × 7
p_valavg_log2FCpct.1pct.2p_val_adjclustergene
<dbl><dbl><dbl><dbl><dbl><int><chr>
HSD11B100.74738400.8960.20701HSD11B1
APOM00.69099390.9130.20401APOM
PON300.51024620.8580.17701PON3
TTC3600.47610220.8270.19301TTC36
F1000.40804880.5830.10401F10
BCHE00.38248720.7490.15601BCHE

Plotting a heatmap showing the top 5 clusters containing largest number of cells

#Scaling data
seurat <- ScaleData(seurat)

Idents(object = seurat) <- seurat@meta.data$clusters

liver.markers.top <- liver.markers %>% group_by(cluster) %>%  top_n(n = 2 ,wt = avg_log2FC)
liver.markers.subset <- liver.markers %>% group_by(cluster) %>% filter(cluster == c(1,2,3,4,5)) %>% top_n(n = 2 ,wt = avg_log2FC)

seurat.subset <- subset(seurat, idents = c('1','2','3','4','5'))
DoHeatmap(seurat.subset, features = liver.markers.subset$gene)
Centering and scaling data matrix

4 . Automated cell type curation using biomarker genes

Marker gene database-based annotation takes advantage of cell type atlases. Literature- and scRNA-seq analysis-derived markers have been assembled into reference cell type hierarchies and marker lists. In this approach, basic scoring systems are used to ascribe cell types at the cluster level in the query dataset.

Type <- c()
clusters <- c()
for (i in seq(1,20, by=1)) {
    liver.markers.filt <- as.data.frame(liver.markers %>% filter(cluster == as.character(i)) %>% top_n(n = 25, wt = avg_log2FC))
    my_overlaps <- clustermole_overlaps(genes = liver.markers.filt$gene, species = "hs")
    type <- as.character(my_overlaps[1,5])
    Type <- c(Type, type)
    clusters <- c(clusters, i)
}


clust_map <- as.data.frame(cbind(clusters,Type))
head(clust_map)
A data.frame: 6 × 2
clustersType
<fct><fct>
11Hepatocytes
22DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_CD8_T_CELLS
33Hepatocytes
44AIZARANI_LIVER_C25_KUPFFER_CELLS_4
55Hepatocytes
66AIZARANI_LIVER_C11_HEPATOCYTES_1

Curating cell types from cluster based identifications

The cell type annotations predicted lacked harmonisation in cell names, since the algorithm used a marker-gene database search which queries markers from multiple databases each with database specific vocabulary. Therefore, in order for the cell types to cluster, markers corresponding to same cell type but different annotation must be curated to follow a common vocabulary

#The cell type names must be cleaned in order for them to be clustered together
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('NK_NKT_CELLS','NK Cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('B cell\\D+','B cell',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD71\\D+','Erythroid cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_NK_CELLS','NK Cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('KUPFFER_CELLS','Kupffer Cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('MVECS','Endothelial cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD8_T_CELLS','CD8 T cell',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_CD4_T_CELLS','CD4 T cell',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('AIZARANI_LIVER_C\\d+_','',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('_\\d+$','',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('HEPATOCYTES','Hepatocytes',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\bPlasma cell\\b','Plasma cells',x)) 
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_PLASMA_CELLS','Plasma cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_PLASMA_CELL','Plasma cells',x))                               
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('EPCAM_POS_\\D+','EPCAM+',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_B_CELL','B cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('CUI_DEVELOPING_HEART_C3_FIBROBLAST_LIKE_CELL','Others',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('Erythrocytes_\\D+','Erythrocytes',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('FAN_EMBRYONIC_CTX_NSC','Others',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('Lymph_endothel_cell','Other endothelial cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('\\D+_Epithelial_cells','Epithelial cells',x))
clust_map$Type <- sapply(clust_map$Type, function(x) gsub('HAY_BONE_MARROW_PRO_B','Others',x))


cell.type_clust <- aggregate(clusters ~ Type, clust_map, paste, collapse=",")
cell.type_clust
A data.frame: 12 × 2
Typeclusters
<chr><chr>
B cell 16
CD8 T cell 2
Cholangiocytes 17
Endothelial cells 11,13
Erythroid cells 19
Gamma delta T cells 18
Hepatic stellate cells20
Hepatocytes 1,3,5,6,14,15
Kupffer Cells 4,10
LSECS 12
NK Cells 8,9
Plasma cells 7
rows <- rownames(seurat@meta.data)
seurat@meta.data = merge(seurat@meta.data, clust_map, by = 'clusters', all.x = TRUE)
rownames(seurat@meta.data) <- rows
DimPlot(object = seurat, reduction = "tsne", group.by = "Type", cols = DiscretePalette(12))

Appendix

Package installation

NOTE : For R packages, the cells must be run in the following order as shown below

!sudo R -e 'BiocManager::install(c("GSEABase", "GSVA", "singscore"))'
!sudo R -e 'install.packages(c("anndata", "clustermole"), repos = "https://cloud.r-project.org/")'