In this notebook, we will
input_file = "../results/03_correct_data/adata.h5ad"
output_file = "tmp/adata_cell_types.h5ad"
table_dir = "../tables"
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import sys
import os
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import fix_logging, display
fix_logging(sc.settings)
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats
markers = pd.read_csv(os.path.join(table_dir, "cell_type_markers.csv"))
adata = sc.read_h5ad(input_file)
random_state = 42
sc.pp.neighbors(adata, n_pcs=20, random_state=random_state)
sc.tl.umap(adata, random_state=random_state)
sc.tl.leiden(adata, resolution=2, random_state=random_state)
computing neighbors using 'X_pca' with n_pcs = 20 finished computing UMAP finished running Leiden clustering finished
fig, ax = plt.subplots(figsize=(14, 10))
sc.pl.umap(
adata, color="leiden", ax=ax, legend_loc="on data", size=20, legend_fontoutline=3
)
cell_types = np.unique(markers["cell_type"])
for ct in cell_types:
marker_genes = markers.loc[markers["cell_type"] == ct, "gene_identifier"]
sc.pl.umap(
adata, color=marker_genes, title=["{}: {}".format(ct, g) for g in marker_genes]
)
fig, ax = plt.subplots(figsize=(14, 10))
sc.pl.umap(
adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
)
Assign clusters to cell types using the following mapping:
annotation = {
"B cell": [17, 4, 1, 28, 6, 7, 19, 8],
"CAF": [27],
"Endothelial cell": [21],
"Mast cell": [32],
"NK cell": [0, 18, 31, 26],
"T cell": [2, 9, 20, 14, 24, 3, 10, 16, 12, 11, 15, 30, 5, 13, 25],
"myeloid": [22],
"pDC": [33],
}
annot_dict = {str(c): ct for ct, clusters in annotation.items() for c in clusters}
adata.obs["cell_type"] = [annot_dict.get(c, "unknown") for c in adata.obs["leiden"]]
adata.obs["cell_type_unknown"] = [
"known" if ct != "unknown" else ct for ct in adata.obs["cell_type"]
]
sc.pl.umap(adata, color=["cell_type_unknown", "cell_type"])
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead
... storing 'cell_type' as categorical ... storing 'cell_type_unknown' as categorical
display(
adata.obs.groupby("cell_type")[["samples"]].count().sort_values("samples"), n=50
)
samples | |
---|---|
cell_type | |
pDC | 71 |
Mast cell | 86 |
CAF | 226 |
myeloid | 523 |
Endothelial cell | 534 |
unknown | 662 |
NK cell | 2820 |
B cell | 9132 |
T cell | 14242 |
# fractions by sample
type_per_sample = (
adata.obs.groupby(["cell_type", "samples"])
.size()
.reset_index(name="n_cells")
.merge(adata.obs.groupby("samples").size().reset_index(name="n_total_cells"))
.assign(frac_cells=lambda x: x["n_cells"] / x["n_total_cells"])
)
type_per_sample
cell_type | samples | n_cells | n_total_cells | frac_cells | |
---|---|---|---|---|---|
0 | B cell | H68 | 46 | 2801 | 0.016423 |
1 | CAF | H68 | 9 | 2801 | 0.003213 |
2 | Endothelial cell | H68 | 0 | 2801 | 0.000000 |
... | ... | ... | ... | ... | ... |
114 | myeloid | H211 | 33 | 2930 | 0.011263 |
115 | pDC | H211 | 5 | 2930 | 0.001706 |
116 | unknown | H211 | 96 | 2930 | 0.032765 |
117 rows × 5 columns
(
ggplot(type_per_sample, aes(x="samples", y="frac_cells", fill="cell_type"))
+ n.geom_bar(stat="identity")
+ n.scale_fill_brewer(type="qual", palette="Paired")
+ n.theme(
subplots_adjust={"right": 0.4},
axis_text_x=n.element_text(angle=90, vjust=1, hjust=0.5),
)
)
<ggplot: (2966145521427)>
# because of https://github.com/pandas-dev/pandas/issues/27519
def t_cell_frac(x):
return np.sum(x == "T cell") / len(x)
def nk_cell_frac(x):
return np.sum(x == "NK cell") / len(x)
cell_type_fractions = (
adata.obs.groupby(["samples", "facs_purity_cd3", "facs_purity_cd56"])
.agg(
frac_t_cell=("cell_type", t_cell_frac), frac_nk_cell=("cell_type", nk_cell_frac)
)
.dropna()
.reset_index()
)
display(cell_type_fractions, n=50)
samples | facs_purity_cd3 | facs_purity_cd56 | frac_t_cell | frac_nk_cell | |
---|---|---|---|---|---|
0 | H68 | 0.797 | 0.138 | 0.843984 | 0.129240 |
1 | H141 | 0.288 | 0.025 | 0.358058 | 0.024275 |
2 | H143 | 0.653 | 0.008 | 0.695455 | 0.002841 |
3 | H149 | 0.644 | 0.033 | 0.750663 | 0.053935 |
4 | H160 | 0.342 | 0.067 | 0.282983 | 0.035725 |
5 | H176 | 0.558 | 0.108 | 0.621974 | 0.125388 |
6 | H182 | 0.303 | 0.109 | 0.391644 | 0.105428 |
7 | H185 | 0.493 | 0.163 | 0.655858 | 0.195607 |
8 | H188 | 0.657 | 0.087 | 0.797382 | 0.073679 |
9 | H197 | 0.271 | 0.171 | 0.416143 | 0.212788 |
10 | H205 | 0.485 | 0.028 | 0.253820 | 0.040747 |
11 | H208 | 0.336 | 0.323 | 0.487741 | 0.295972 |
12 | H211 | 0.382 | 0.029 | 0.248805 | 0.026621 |
x = cell_type_fractions["facs_purity_cd3"]
y = cell_type_fractions["frac_t_cell"]
r, r_p = stats.pearsonr(x, y)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
fig, ax = plt.subplots()
ax.plot(x, y, "o")
ax.plot(np.array([0, 1]), slope * np.array([0, 1]) + intercept, color="black")
ax.text(x=0, y=1, s="r={:.2f}, p={:.3f}".format(r, r_p))
ax.set_title("T cells: FACS vs. single cell")
ax.set_xlabel("%CD3")
ax.set_ylabel("%T cells")
Text(0, 0.5, '%T cells')
x = cell_type_fractions["facs_purity_cd56"]
y = cell_type_fractions["frac_nk_cell"]
r, r_p = stats.pearsonr(x, y)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
fig, ax = plt.subplots()
ax.plot(x, y, "o")
ax.plot(np.array([0, 0.6]), slope * np.array([0, 0.6]) + intercept, color="black")
ax.text(x=0, y=0.4, s="r={:.2f}, p={:.3f}".format(r, r_p))
ax.set_title("NK cells: FACS vs. single cell")
ax.set_xlabel("%CD56")
ax.set_ylabel("%NK cells")
Text(0, 0.5, '%NK cells')
adata.write(output_file, compression="lzf")