input_file = "../results/01_process_data/adata.h5ad"
output_file = "tmp/adata.h5ad"
table_dir = "../tables"
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"
%load_ext autoreload
%autoreload 2
import numpy as np
import scanpy as sc
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
import scirpy as ir
import sys
sys.path.extend(("lib", "../lib"))
from jupytertools import *
fix_logging(sc.settings)
mito_genes = pd.read_csv(os.path.join(table_dir, "mitochondrial_genes.tsv"), sep="\t")[
"Gene name"
].values
biomart = pd.read_csv(os.path.join(table_dir, "biomart.tsv"), sep="\t")
ribo_genes = pd.read_csv(
os.path.join(table_dir, "ribosomal_genes.tsv"), sep="\t", comment="#"
)["Approved symbol"].values
adata = sc.read_h5ad(input_file)
print(adata)
AnnData object with n_obs × n_vars = 71401 × 33514 obs: 'IR_VJ_1_locus', 'IR_VJ_2_locus', 'IR_VDJ_1_locus', 'IR_VDJ_2_locus', 'IR_VJ_1_cdr3', 'IR_VJ_2_cdr3', 'IR_VDJ_1_cdr3', 'IR_VDJ_2_cdr3', 'IR_VJ_1_cdr3_nt', 'IR_VJ_2_cdr3_nt', 'IR_VDJ_1_cdr3_nt', 'IR_VDJ_2_cdr3_nt', 'IR_VJ_1_expr', 'IR_VJ_2_expr', 'IR_VDJ_1_expr', 'IR_VDJ_2_expr', 'IR_VJ_1_expr_raw', 'IR_VJ_2_expr_raw', 'IR_VDJ_1_expr_raw', 'IR_VDJ_2_expr_raw', 'IR_VJ_1_v_gene', 'IR_VJ_2_v_gene', 'IR_VDJ_1_v_gene', 'IR_VDJ_2_v_gene', 'IR_VJ_1_d_gene', 'IR_VJ_2_d_gene', 'IR_VDJ_1_d_gene', 'IR_VDJ_2_d_gene', 'IR_VJ_1_j_gene', 'IR_VJ_2_j_gene', 'IR_VDJ_1_j_gene', 'IR_VDJ_2_j_gene', 'IR_VJ_1_c_gene', 'IR_VJ_2_c_gene', 'IR_VDJ_1_c_gene', 'IR_VDJ_2_c_gene', 'IR_VJ_1_junction_ins', 'IR_VJ_2_junction_ins', 'IR_VDJ_1_junction_ins', 'IR_VDJ_2_junction_ins', 'has_ir', 'multi_chain', 'samples', 'n_genes', 'patient', 'origin', 'replicate', 'dataset', 'tumor_type', 'platform', 'age', 'sex', 'hpv_status', 'ir_status', 'facs_purity_cd3', 'facs_purity_cd56'
def compute_quality_metrics(adata):
tmp_mito = [g for g in mito_genes if g in adata.var_names]
adata.obs["mt_frac"] = np.sum(adata[:, tmp_mito].X, axis=1) / np.sum(
adata.X, axis=1
)
adata.obs["n_counts"] = adata.X.sum(axis=1)
adata.obs["n_genes"] = (adata.X != 0).sum(axis=1)
adata.obs["rk_n_counts"] = adata.obs["n_counts"].rank(
ascending=False, method="first"
)
adata.obs["rk_n_genes"] = adata.obs["n_genes"].rank(ascending=False, method="first")
adata.obs["rk_mt_frac"] = adata.obs["mt_frac"].rank(ascending=True, method="first")
Quality control follows the new "Best practice" tutorial for single cell analysis (Luecken & Theis 2019) and the accompanying case study notebook.
MIN_COUNTS = 2000
MIN_GENES = 700
MAX_MITO = 0.11
MIN_CELLS = 50
DOUBLET_THRES = 0.4
assert adata.var_names.is_unique
Coarse filtering to reduce amount of data:
sc.pp.filter_cells(adata, min_genes=100)
compute_quality_metrics(adata)
print_dim(adata)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
Current dimensions of adata: 71401 cells x 33514 genes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sc.pl.violin(adata, "n_counts", groupby="samples", log=True, cut=0, ax=ax1, show=False)
sc.pl.violin(adata, "mt_frac", groupby="samples", ax=ax2, show=False)
ax1.set_title("Count depth (counts per barcode)")
ax2.set_title("Mitochondrial fraction per barcode")
fig.show()
os.makedirs("./tmp", exist_ok=True)
fig.savefig("./tmp/qc_dist_per_sample_before.png")
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
The sample quality looks sufficiently consistent, so that we apply global filtering cut-offs instead of per-sample filtering
Filter genes that occur in less than MIN_CELLS of cells. Relates to the minimal expected cluster size. Given the number of cells, I expect the smallest cluster of interest to contain at least 50 cells.
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
print_dim(adata)
filtered out 16522 genes that are detected in less than 50 cells
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
Current dimensions of adata: 71401 cells x 16992 genes.
ax1 = sc.pl.scatter(
adata,
x="n_counts",
y="n_genes",
color="mt_frac",
title="detected genes vs. count depth",
show=False,
)
ax1.hlines(y=MIN_GENES, xmin=0, xmax=ax1.get_xlim()[1], linewidth=1, color="red")
ax1.vlines(x=MIN_COUNTS, ymin=0, ymax=ax1.get_ylim()[1], linewidth=1, color="red")
plt.show()
ax2 = sc.pl.scatter(
adata[adata.obs["n_counts"] < 10000],
x="n_counts",
y="n_genes",
color="mt_frac",
title="detected genes vs. count depth (zoomed in)",
show=False,
)
ax2.hlines(y=MIN_GENES, xmin=0, xmax=ax2.get_xlim()[1], linewidth=1, color="red")
ax2.vlines(x=MIN_COUNTS, ymin=0, ymax=ax2.get_ylim()[1], linewidth=1, color="red")
plt.show()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
ax1 = sns.distplot(adata.obs["n_genes"], kde=False, bins=60)
ax1.set_title("Distribution: detected genes")
ax1.vlines(x=MIN_GENES, color="red", linewidth=1, ymin=0, ymax=ax1.get_ylim()[1])
plt.show()
ax2 = sc.pl.scatter(
adata,
x="rk_n_genes",
y="n_genes",
color="mt_frac",
legend_loc="none",
title="Distribution: detected genes",
show=False,
)
ax2.hlines(y=MIN_GENES, color="red", linewidth=1, xmin=0, xmax=ax2.get_xlim()[1])
plt.show()
ax3 = sc.pl.scatter(
adata,
x="rk_n_counts",
y="n_counts",
color="mt_frac",
legend_loc="none",
title="Distribution: read counts",
show=False,
)
ax3.hlines(y=MIN_COUNTS, color="red", linewidth=1, xmin=0, xmax=ax3.get_xlim()[1])
ax3.set_yscale("log")
plt.show()
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
ax4 = sns.distplot(adata.obs["mt_frac"], kde=False, bins=60)
ax4.set_title("Distribution: fraction mito reads")
ax4.vlines(x=MAX_MITO, color="red", linewidth=1, ymin=0, ymax=ax4.get_ylim()[1])
plt.show()
ax5 = sc.pl.scatter(
adata,
x="rk_mt_frac",
y="mt_frac",
color="mt_frac",
show=False,
legend_loc="none",
title="Distribution: fraction mito reads",
)
ax5.hlines(y=MAX_MITO, color="red", linewidth=1, xmin=0, xmax=ax5.get_xlim()[1])
plt.show()
print_dim(adata)
Current dimensions of adata: 71401 cells x 16992 genes.
Apply MIN_GENES
threshold:
sc.pp.filter_cells(adata, min_genes=MIN_GENES)
print_dim(adata)
filtered out 30847 cells that have less than 700 genes expressed Current dimensions of adata: 40554 cells x 16992 genes.
Apply MIN_COUNTS
threshold:
sc.pp.filter_cells(adata, min_counts=MIN_COUNTS)
print_dim(adata)
filtered out 3793 cells that have less than 2000 counts Current dimensions of adata: 36761 cells x 16992 genes.
Apply MAX_MITO
threshold:
adata = adata[adata.obs["mt_frac"] < MAX_MITO, :]
print_dim(adata)
Current dimensions of adata: 35052 cells x 16992 genes.
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
Ribosomal genes were downloaded from https://www.genenames.org/data/genegroup/#!/group/1054
adata = adata[:, ~adata.var_names.isin(np.append(mito_genes, ribo_genes))]
print_dim(adata)
Current dimensions of adata: 35052 cells x 16818 genes.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sc.pl.violin(adata, "n_counts", groupby="samples", log=True, cut=0, ax=ax1, show=False)
sc.pl.violin(adata, "mt_frac", groupby="samples", ax=ax2, show=False)
ax1.set_title("Count depth (counts per barcode)")
ax2.set_title("Mitochondrial fraction per barcode")
fig.show()
fig.savefig("./tmp/qc_dist_per_sample_after.png")
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
ax1 = sc.pl.scatter(
adata,
x="n_counts",
y="n_genes",
color="mt_frac",
title="detected genes vs. count depth",
show=False,
)
ax1.hlines(y=MIN_GENES, xmin=0, xmax=ax1.get_xlim()[1], linewidth=1, color="red")
ax1.vlines(x=MIN_COUNTS, ymin=0, ymax=ax1.get_ylim()[1], linewidth=1, color="red")
plt.show()
ax2 = sc.pl.scatter(
adata[adata.obs["n_counts"] < 10000],
x="n_counts",
y="n_genes",
color="mt_frac",
title="detected genes vs. count depth (zoomed in)",
show=False,
)
ax2.hlines(y=MIN_GENES, xmin=0, xmax=ax2.get_xlim()[1], linewidth=1, color="red")
ax2.vlines(x=MIN_COUNTS, ymin=0, ymax=ax2.get_ylim()[1], linewidth=1, color="red")
plt.show()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
We leverate TCR receptor sequencing data to call putative doublets. Cells with more than one TCR-beta or more than two TCR-alpha chains are removed.
ir.tl.chain_qc(adata)
Trying to set attribute `.obs` of view, copying.
ir.pl.group_abundance(adata, groupby="receptor_subtype", target_col="samples")
... storing 'receptor_type' as categorical ... storing 'receptor_subtype' as categorical ... storing 'chain_pairing' as categorical
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key])
<AxesSubplot:title={'center':'Number of cells in receptor_subtype by samples'}, xlabel='receptor_subtype', ylabel='Number of cells'>
ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="samples")
<AxesSubplot:title={'center':'Number of cells in chain_pairing by samples'}, xlabel='chain_pairing', ylabel='Number of cells'>
Remove multichain cells and extra VDJ (=beta) cells. extra VJ (=alpha) is ok, as T-cells can have two alpha chains.
tcr_filter = adata.obs["chain_pairing"].isin(
["multichain", "extra VDJ", "two full chains"]
)
print_dim(adata)
adata = adata[~tcr_filter, :].copy()
print_dim(adata)
Current dimensions of adata: 35052 cells x 16818 genes. Current dimensions of adata: 29843 cells x 16818 genes.
ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="samples")
<AxesSubplot:title={'center':'Number of cells in chain_pairing by samples'}, xlabel='chain_pairing', ylabel='Number of cells'>
## Normalize (for visualization only, we will save the raw counts!)
# sc.pp.filter_genes(adata, min_cells=1)
adata_vis = adata.copy()
sc.pp.normalize_per_cell(adata_vis, counts_per_cell_after=1e4)
sc.pp.log1p(adata_vis)
sc.pp.filter_genes(adata_vis, min_cells=1)
normalizing by total count per cell finished ({time_passed}): normalized adata.X and added 'n_counts', counts per cell before normalization (adata.obs)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
sc.tl.pca(adata_vis, svd_solver="arpack")
sc.pp.neighbors(adata_vis, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata_vis)
computing PCA with n_comps=50 finished computing neighbors using 'X_pca' with n_pcs = 40 finished computing UMAP finished
sc.tl.leiden(adata_vis)
running Leiden clustering finished
sc.pl.umap(
adata_vis,
color=[
"n_genes",
"n_counts",
"mt_frac",
"samples",
"origin",
"leiden",
"chain_pairing",
"has_ir",
],
ncols=2,
)
adata.write(output_file, compression="lzf")