In this notebook, we will
# get default parameters the papermill way.
input_file = "../results/04_annotate_cell_types/adata.h5ad"
output_file = "tmp/data.h5ad"
results_dir = "tmp"
table_dir = "../tables"
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"
cpus = "1"
results_dir = "."
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import sys
import os
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import fix_logging, display
from operator import or_
from functools import reduce
fix_logging(sc.settings)
markers = pd.read_csv(os.path.join(table_dir, "cell_type_markers.csv"))
cell_types = markers["cell_type"].unique()
adata = sc.read_h5ad(input_file)
sc.pl.umap(adata, color="cell_type")
# PCA turned out not to be entirely reproducible on different CPU architechtures.
# For the sake of reproducibility of these notebooks, we load a pre-computed result
# from the repository. If it doesn't exist, we compute it from scratch.
def pca_precomputed(adata, key, **kwargs):
pca_file = os.path.join(table_dir, f"adata_pca_{key}.pkl.gz")
try:
# CSV does not preserver floating point numbers perfectly.
adata.obsm["X_pca"] = pd.read_pickle(pca_file).values
except IOError:
assert False, "should use pre-computed version. "
sc.tl.pca(adata, svd_solver="arpack", **kwargs)
pd.DataFrame(adata.obsm["X_pca"]).to_pickle(pca_file)
adata = adata[adata.obs["cell_type"].isin(["T cell", "NK cell"]), :].copy()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
As in a preliminary analysis, they turned out to be dominant DE genes when comparing clusters. In the transcriptomics analysis, we are not interested in the effects of T-cell receptor genes.
prefixes = ["TRAV", "TRBV", "TRBJ", "TRAJ", "TRBD"]
exclude_genes = reduce(or_, [adata.var_names.str.startswith(x) for x in prefixes])
adata.var_names[exclude_genes]
Index(['TRBV1', 'TRBV2', 'TRBV3-1', 'TRBV4-1', 'TRBV5-1', 'TRBV6-1', 'TRBV4-2', 'TRBV6-2', 'TRBV7-2', 'TRBV6-4', 'TRBV7-3', 'TRBV5-3', 'TRBV9', 'TRBV10-1', 'TRBV11-1', 'TRBV10-2', 'TRBV11-2', 'TRBV6-5', 'TRBV7-4', 'TRBV5-4', 'TRBV6-6', 'TRBV5-5', 'TRBV7-6', 'TRBV5-6', 'TRBV7-7', 'TRBV7-9', 'TRBV13', 'TRBV10-3', 'TRBV11-3', 'TRBV12-3', 'TRBV12-4', 'TRBV12-5', 'TRBV14', 'TRBV15', 'TRBV16', 'TRBV18', 'TRBV19', 'TRBV20-1', 'TRBV21-1', 'TRBV23-1', 'TRBV24-1', 'TRBV25-1', 'TRBV27', 'TRBV28', 'TRBV29-1', 'TRBV30', 'TRAV1-1', 'TRAV1-2', 'TRAV2', 'TRAV3', 'TRAV4', 'TRAV5', 'TRAV6', 'TRAV8-1', 'TRAV10', 'TRAV12-1', 'TRAV8-2', 'TRAV8-3', 'TRAV13-1', 'TRAV12-2', 'TRAV8-4', 'TRAV13-2', 'TRAV14DV4', 'TRAV9-2', 'TRAV12-3', 'TRAV8-6', 'TRAV16', 'TRAV17', 'TRAV18', 'TRAV19', 'TRAV20', 'TRAV21', 'TRAV22', 'TRAV23DV6', 'TRAV24', 'TRAV25', 'TRAV26-1', 'TRAV27', 'TRAV29DV5', 'TRAV30', 'TRAV26-2', 'TRAV34', 'TRAV35', 'TRAV36DV7', 'TRAV38-1', 'TRAV38-2DV8', 'TRAV39', 'TRAV40', 'TRAV41'], dtype='object')
adata.shape
(17062, 16818)
adata = adata[:, ~exclude_genes].copy()
adata.shape
(17062, 16729)
np.sum(adata.var_names.str.startswith("TRAV"))
0
random_state = 1
sc.pp.neighbors(adata, random_state=random_state)
sc.tl.umap(adata, random_state=random_state)
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished
sc.tl.leiden(adata, resolution=3, random_state=42)
running Leiden clustering finished
fig, ax = plt.subplots(figsize=(7, 5))
sc.pl.umap(
adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)
for ct in cell_types:
if not "T cell" in ct and ct != "NK cell":
continue
marker_genes = markers.loc[markers["cell_type"] == ct, "gene_identifier"]
sc.pl.umap(
adata,
color=marker_genes,
title=["{}: {}".format(ct, g) for g in marker_genes],
size=15,
)
fig, ax = plt.subplots(figsize=(7, 5))
sc.pl.umap(
adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)
annotation_t = {
"T CD4+": [],
# 27 is NK, but some of the cells are CD8
"T CD8+": [8, 0, 6, 20, 11, 29, 21, 22, 14, 27],
"T reg.": [13, 12],
"T other": [31],
"ambiguous": [24, 25, 26, 18],
}
annot_dict = {str(c): ct for ct, clusters in annotation_t.items() for c in clusters}
adata.obs["cell_type2"] = [
annot_dict.get(c, "T CD4+") if ct != "NK cell" else ct
for c, ct in zip(adata.obs["leiden"], adata.obs["cell_type"])
]
sc.pl.umap(adata, color="cell_type2")
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'cell_type2' as categorical
adata_ambiguous = adata[adata.obs["cell_type2"] == "ambiguous", :].copy()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
sc.__version__
'1.6.0'
pca_precomputed(adata_ambiguous, "subcluster_ambiguous")
sc.pp.neighbors(adata_ambiguous, n_neighbors=5)
sc.tl.umap(adata_ambiguous)
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished
sc.tl.leiden(adata_ambiguous, resolution=2)
running Leiden clustering finished
sc.pl.umap(
adata_ambiguous, color=["leiden"], legend_loc="on data", legend_fontoutline=4
)
sc.pl.umap(adata_ambiguous, color=["CD8A", "CD8B", "FOXP3", "CD4", "n_counts"], ncols=3)
annotation_ambiguous = {
"T CD4+": [21, 14, 12, 2, 23, 11, 13, 8, 9, 17, 24, 0, 22, 10],
"T CD8+": [4, 18, 15, 26],
"T reg.": [6, 1, 5, 19, 25, 3, 16, 7],
"T other": [20],
}
annot_dict = {
str(c): ct for ct, clusters in annotation_ambiguous.items() for c in clusters
}
adata_ambiguous.obs["cell_type3"] = [
annot_dict[c]
for c, ct in zip(adata_ambiguous.obs["leiden"], adata_ambiguous.obs["cell_type"])
]
sc.pl.umap(adata_ambiguous, color="cell_type3")
... storing 'cell_type3' as categorical
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key])
# convert categorical to string
adata.obs["cell_type2"] = [str(x) for x in adata.obs["cell_type2"]]
adata.obs.loc[adata_ambiguous.obs_names, "cell_type2"] = adata_ambiguous.obs[
"cell_type3"
]
adata.obs["cell_type"] = adata.obs["cell_type2"]
del adata.obs["cell_type2"]
sc.pl.umap(
adata,
color=[
"samples",
"n_genes",
"mt_frac",
"has_ir",
"hpv_status",
"ir_status",
"chain_pairing",
"phase",
"leiden",
"CD4",
"FOXP3",
"CD8A",
"CD19",
"is_doublet",
"cell_type",
],
ncols=3,
)
... storing 'cell_type' as categorical
adatas = dict()
resolutions = {"T CD4+": 1, "T CD8+": 1, "NK cell": 0.5, "T reg.": 0.5, "T other": 0.5}
for ct in adata.obs["cell_type"].unique():
tmp_adata = adata[adata.obs["cell_type"] == ct, :].copy()
pca_precomputed(
tmp_adata,
f'subcluster_{ct.lower().replace(" ", "_").replace("+", "").replace(".", "")}',
)
sc.pp.neighbors(tmp_adata)
sc.tl.umap(tmp_adata)
sc.tl.leiden(tmp_adata, resolution=resolutions[ct])
sc.pl.umap(tmp_adata, color="leiden", title=ct)
adatas[ct] = tmp_adata
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished running Leiden clustering finished
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished running Leiden clustering finished
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished running Leiden clustering finished
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished running Leiden clustering finished
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
computing neighbors using 'X_pca' with n_pcs = 50 finished computing UMAP finished running Leiden clustering finished
for ct, tmp_adata in adatas.items():
adata.obs.loc[tmp_adata.obs.index, "cluster"] = [
"{} {}".format(ct, x) for x in tmp_adata.obs["leiden"]
]
sc.pl.umap(adata, color=["cluster", "cell_type"])
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'cluster' as categorical
adata.write_h5ad(output_file, compression="lzf")