input_file = "../results/03_correct_data/adata.h5ad"
output_file = "tmp/adata_cell_types.h5ad"
table_dir = "../tables"


# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"


import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import sys
import os

sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import fix_logging, display

fix_logging(sc.settings)
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats


markers = pd.read_csv(os.path.join(table_dir, "cell_type_markers.csv"))


adata = sc.read_h5ad(input_file)


random_state = 42
sc.pp.neighbors(adata, n_pcs=20, random_state=random_state)
sc.tl.umap(adata, random_state=random_state)
sc.tl.leiden(adata, resolution=2, random_state=random_state)

computing neighbors
    using 'X_pca' with n_pcs = 20
    finished
computing UMAP
    finished
running Leiden clustering
    finished


fig, ax = plt.subplots(figsize=(14, 10))
sc.pl.umap(
    adata, color="leiden", ax=ax, legend_loc="on data", size=20, legend_fontoutline=3
)


cell_types = np.unique(markers["cell_type"])


for ct in cell_types:
    marker_genes = markers.loc[markers["cell_type"] == ct, "gene_identifier"]
    sc.pl.umap(
        adata, color=marker_genes, title=["{}: {}".format(ct, g) for g in marker_genes]
    )


fig, ax = plt.subplots(figsize=(14, 10))
sc.pl.umap(
    adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)


annotation = {
    "B cell": [17, 4, 1, 28, 6, 7, 19, 8],
    "CAF": [27],
    "Endothelial cell": [21],
    "Mast cell": [32],
    "NK cell": [0, 18, 31, 26],
    "T cell": [2, 9, 20, 14, 24, 3, 10, 16, 12, 11, 15, 30, 5, 13, 25],
    "myeloid": [22],
    "pDC": [33],
}


annot_dict = {str(c): ct for ct, clusters in annotation.items() for c in clusters}


adata.obs["cell_type"] = [annot_dict.get(c, "unknown") for c in adata.obs["leiden"]]
adata.obs["cell_type_unknown"] = [
    "known" if ct != "unknown" else ct for ct in adata.obs["cell_type"]
]


sc.pl.umap(adata, color=["cell_type_unknown", "cell_type"])

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead

... storing 'cell_type' as categorical
... storing 'cell_type_unknown' as categorical


display(
    adata.obs.groupby("cell_type")[["samples"]].count().sort_values("samples"), n=50
)


# fractions by sample
type_per_sample = (
    adata.obs.groupby(["cell_type", "samples"])
    .size()
    .reset_index(name="n_cells")
    .merge(adata.obs.groupby("samples").size().reset_index(name="n_total_cells"))
    .assign(frac_cells=lambda x: x["n_cells"] / x["n_total_cells"])
)
type_per_sample


(
    ggplot(type_per_sample, aes(x="samples", y="frac_cells", fill="cell_type"))
    + n.geom_bar(stat="identity")
    + n.scale_fill_brewer(type="qual", palette="Paired")
    + n.theme(
        subplots_adjust={"right": 0.4},
        axis_text_x=n.element_text(angle=90, vjust=1, hjust=0.5),
    )
)

<ggplot: (2966145521427)>


# because of https://github.com/pandas-dev/pandas/issues/27519
def t_cell_frac(x):
    return np.sum(x == "T cell") / len(x)


def nk_cell_frac(x):
    return np.sum(x == "NK cell") / len(x)


cell_type_fractions = (
    adata.obs.groupby(["samples", "facs_purity_cd3", "facs_purity_cd56"])
    .agg(
        frac_t_cell=("cell_type", t_cell_frac), frac_nk_cell=("cell_type", nk_cell_frac)
    )
    .dropna()
    .reset_index()
)


display(cell_type_fractions, n=50)


x = cell_type_fractions["facs_purity_cd3"]
y = cell_type_fractions["frac_t_cell"]
r, r_p = stats.pearsonr(x, y)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
fig, ax = plt.subplots()
ax.plot(x, y, "o")
ax.plot(np.array([0, 1]), slope * np.array([0, 1]) + intercept, color="black")
ax.text(x=0, y=1, s="r={:.2f}, p={:.3f}".format(r, r_p))
ax.set_title("T cells: FACS vs. single cell")
ax.set_xlabel("%CD3")
ax.set_ylabel("%T cells")

Text(0, 0.5, '%T cells')


x = cell_type_fractions["facs_purity_cd56"]
y = cell_type_fractions["frac_nk_cell"]
r, r_p = stats.pearsonr(x, y)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
fig, ax = plt.subplots()
ax.plot(x, y, "o")
ax.plot(np.array([0, 0.6]), slope * np.array([0, 0.6]) + intercept, color="black")
ax.text(x=0, y=0.4, s="r={:.2f}, p={:.3f}".format(r, r_p))
ax.set_title("NK cells: FACS vs. single cell")
ax.set_xlabel("%CD56")
ax.set_ylabel("%NK cells")

Text(0, 0.5, '%NK cells')


adata.write(output_file, compression="lzf")

	cell_type	samples	n_cells	n_total_cells	frac_cells
0	B cell	H68	46	2801	0.016423
1	CAF	H68	9	2801	0.003213
2	Endothelial cell	H68	0	2801	0.000000
...	...	...	...	...	...
114	myeloid	H211	33	2930	0.011263
115	pDC	H211	5	2930	0.001706
116	unknown	H211	96	2930	0.032765

	samples	facs_purity_cd3	facs_purity_cd56	frac_t_cell	frac_nk_cell
0	H68	0.797	0.138	0.843984	0.129240
1	H141	0.288	0.025	0.358058	0.024275
2	H143	0.653	0.008	0.695455	0.002841
3	H149	0.644	0.033	0.750663	0.053935
4	H160	0.342	0.067	0.282983	0.035725
5	H176	0.558	0.108	0.621974	0.125388
6	H182	0.303	0.109	0.391644	0.105428
7	H185	0.493	0.163	0.655858	0.195607
8	H188	0.657	0.087	0.797382	0.073679
9	H197	0.271	0.171	0.416143	0.212788
10	H205	0.485	0.028	0.253820	0.040747
11	H208	0.336	0.323	0.487741	0.295972
12	H211	0.382	0.029	0.248805	0.026621

Coarse annotation of cell-types¶

Input data¶

Leiden clustering¶

Visualize cell-type markers¶

Assign cell types¶

Results¶

Cell-type distribution per sample¶

Compare annotations with FACS markers¶

Save output¶

	samples
cell_type
pDC	71
Mast cell	86
CAF	226
myeloid	523
Endothelial cell	534
unknown	662
NK cell	2820
B cell	9132
T cell	14242