# get default parameters the papermill way.
input_file = "../results/04_annotate_cell_types/adata.h5ad"
output_file = "tmp/data.h5ad"
results_dir = "tmp"
table_dir = "../tables"


# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"
cpus = "1"
results_dir = "."


import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import sys
import os

sys.path.append("lib")
sys.path.append("../lib")

from jupytertools import fix_logging, display

from operator import or_
from functools import reduce

fix_logging(sc.settings)


markers = pd.read_csv(os.path.join(table_dir, "cell_type_markers.csv"))


cell_types = markers["cell_type"].unique()


adata = sc.read_h5ad(input_file)


sc.pl.umap(adata, color="cell_type")


# PCA turned out not to be entirely reproducible on different CPU architechtures.
# For the sake of reproducibility of these notebooks, we load a pre-computed result
# from the repository. If it doesn't exist, we compute it from scratch.
def pca_precomputed(adata, key, **kwargs):
    pca_file = os.path.join(table_dir, f"adata_pca_{key}.pkl.gz")
    try:
        # CSV does not preserver floating point numbers perfectly. 
        adata.obsm["X_pca"] = pd.read_pickle(pca_file).values
    except IOError:
        assert False, "should use pre-computed version. "
        sc.tl.pca(adata, svd_solver="arpack", **kwargs)
        pd.DataFrame(adata.obsm["X_pca"]).to_pickle(pca_file)


adata = adata[adata.obs["cell_type"].isin(["T cell", "NK cell"]), :].copy()

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


prefixes = ["TRAV", "TRBV", "TRBJ", "TRAJ", "TRBD"]
exclude_genes = reduce(or_, [adata.var_names.str.startswith(x) for x in prefixes])


adata.var_names[exclude_genes]

Index(['TRBV1', 'TRBV2', 'TRBV3-1', 'TRBV4-1', 'TRBV5-1', 'TRBV6-1', 'TRBV4-2',
       'TRBV6-2', 'TRBV7-2', 'TRBV6-4', 'TRBV7-3', 'TRBV5-3', 'TRBV9',
       'TRBV10-1', 'TRBV11-1', 'TRBV10-2', 'TRBV11-2', 'TRBV6-5', 'TRBV7-4',
       'TRBV5-4', 'TRBV6-6', 'TRBV5-5', 'TRBV7-6', 'TRBV5-6', 'TRBV7-7',
       'TRBV7-9', 'TRBV13', 'TRBV10-3', 'TRBV11-3', 'TRBV12-3', 'TRBV12-4',
       'TRBV12-5', 'TRBV14', 'TRBV15', 'TRBV16', 'TRBV18', 'TRBV19',
       'TRBV20-1', 'TRBV21-1', 'TRBV23-1', 'TRBV24-1', 'TRBV25-1', 'TRBV27',
       'TRBV28', 'TRBV29-1', 'TRBV30', 'TRAV1-1', 'TRAV1-2', 'TRAV2', 'TRAV3',
       'TRAV4', 'TRAV5', 'TRAV6', 'TRAV8-1', 'TRAV10', 'TRAV12-1', 'TRAV8-2',
       'TRAV8-3', 'TRAV13-1', 'TRAV12-2', 'TRAV8-4', 'TRAV13-2', 'TRAV14DV4',
       'TRAV9-2', 'TRAV12-3', 'TRAV8-6', 'TRAV16', 'TRAV17', 'TRAV18',
       'TRAV19', 'TRAV20', 'TRAV21', 'TRAV22', 'TRAV23DV6', 'TRAV24', 'TRAV25',
       'TRAV26-1', 'TRAV27', 'TRAV29DV5', 'TRAV30', 'TRAV26-2', 'TRAV34',
       'TRAV35', 'TRAV36DV7', 'TRAV38-1', 'TRAV38-2DV8', 'TRAV39', 'TRAV40',
       'TRAV41'],
      dtype='object')


adata.shape

(17062, 16818)


adata = adata[:, ~exclude_genes].copy()


adata.shape

(17062, 16729)


np.sum(adata.var_names.str.startswith("TRAV"))

0


random_state = 1
sc.pp.neighbors(adata, random_state=random_state)
sc.tl.umap(adata, random_state=random_state)

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished


sc.tl.leiden(adata, resolution=3, random_state=42)

running Leiden clustering
    finished


fig, ax = plt.subplots(figsize=(7, 5))
sc.pl.umap(
    adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)


for ct in cell_types:
    if not "T cell" in ct and ct != "NK cell":
        continue
    marker_genes = markers.loc[markers["cell_type"] == ct, "gene_identifier"]
    sc.pl.umap(
        adata,
        color=marker_genes,
        title=["{}: {}".format(ct, g) for g in marker_genes],
        size=15,
    )


fig, ax = plt.subplots(figsize=(7, 5))
sc.pl.umap(
    adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)


annotation_t = {
    "T CD4+": [],
    # 27 is NK, but some of the cells are CD8
    "T CD8+": [8, 0, 6, 20, 11, 29, 21, 22, 14, 27],
    "T reg.": [13, 12],
    "T other": [31],
    "ambiguous": [24, 25, 26, 18],
}


annot_dict = {str(c): ct for ct, clusters in annotation_t.items() for c in clusters}
adata.obs["cell_type2"] = [
    annot_dict.get(c, "T CD4+") if ct != "NK cell" else ct
    for c, ct in zip(adata.obs["leiden"], adata.obs["cell_type"])
]


sc.pl.umap(adata, color="cell_type2")

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])

... storing 'cell_type2' as categorical


adata_ambiguous = adata[adata.obs["cell_type2"] == "ambiguous", :].copy()

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


sc.__version__

'1.6.0'


pca_precomputed(adata_ambiguous, "subcluster_ambiguous")
sc.pp.neighbors(adata_ambiguous, n_neighbors=5)
sc.tl.umap(adata_ambiguous)

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished


sc.tl.leiden(adata_ambiguous, resolution=2)

running Leiden clustering
    finished


sc.pl.umap(
    adata_ambiguous, color=["leiden"], legend_loc="on data", legend_fontoutline=4
)
sc.pl.umap(adata_ambiguous, color=["CD8A", "CD8B", "FOXP3", "CD4", "n_counts"], ncols=3)


annotation_ambiguous = {
    "T CD4+": [21, 14, 12, 2, 23, 11, 13, 8, 9, 17, 24, 0, 22, 10],
    "T CD8+": [4, 18, 15, 26],
    "T reg.": [6, 1, 5, 19, 25, 3, 16, 7],
    "T other": [20],
}


annot_dict = {
    str(c): ct for ct, clusters in annotation_ambiguous.items() for c in clusters
}
adata_ambiguous.obs["cell_type3"] = [
    annot_dict[c]
    for c, ct in zip(adata_ambiguous.obs["leiden"], adata_ambiguous.obs["cell_type"])
]


sc.pl.umap(adata_ambiguous, color="cell_type3")

... storing 'cell_type3' as categorical

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])


# convert categorical to string
adata.obs["cell_type2"] = [str(x) for x in adata.obs["cell_type2"]]
adata.obs.loc[adata_ambiguous.obs_names, "cell_type2"] = adata_ambiguous.obs[
    "cell_type3"
]


adata.obs["cell_type"] = adata.obs["cell_type2"]
del adata.obs["cell_type2"]


sc.pl.umap(
    adata,
    color=[
        "samples",
        "n_genes",
        "mt_frac",
        "has_ir",
        "hpv_status",
        "ir_status",
        "chain_pairing",
        "phase",
        "leiden",
        "CD4",
        "FOXP3",
        "CD8A",
        "CD19",
        "is_doublet",
        "cell_type",
    ],
    ncols=3,
)

... storing 'cell_type' as categorical


adatas = dict()
resolutions = {"T CD4+": 1, "T CD8+": 1, "NK cell": 0.5, "T reg.": 0.5, "T other": 0.5}
for ct in adata.obs["cell_type"].unique():
    tmp_adata = adata[adata.obs["cell_type"] == ct, :].copy()
    pca_precomputed(
        tmp_adata,
        f'subcluster_{ct.lower().replace(" ", "_").replace("+", "").replace(".", "")}',
    )
    sc.pp.neighbors(tmp_adata)
    sc.tl.umap(tmp_adata)
    sc.tl.leiden(tmp_adata, resolution=resolutions[ct])
    sc.pl.umap(tmp_adata, color="leiden", title=ct)
    adatas[ct] = tmp_adata

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
running Leiden clustering
    finished

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
running Leiden clustering
    finished

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
running Leiden clustering
    finished

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
running Leiden clustering
    finished

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
running Leiden clustering
    finished


for ct, tmp_adata in adatas.items():
    adata.obs.loc[tmp_adata.obs.index, "cluster"] = [
        "{} {}".format(ct, x) for x in tmp_adata.obs["leiden"]
    ]


sc.pl.umap(adata, color=["cluster", "cell_type"])

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])

... storing 'cluster' as categorical


adata.write_h5ad(output_file, compression="lzf")

NK/T-cell compartment¶

Input data and configuration¶

Extract NK and T-cells¶

Exclude TCR genes¶

Cell-type annotation¶

re-annotate ambiguous clusters¶

integrate annotation into main object¶

Perform sub-clustering for all cell types individually¶

Store results¶