input_file = "../results/01_process_data/adata.h5ad"
output_file = "tmp/adata.h5ad"
table_dir = "../tables"


# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
table_dir = "tables"


%load_ext autoreload
%autoreload 2

import numpy as np
import scanpy as sc
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
import scirpy as ir

import sys

sys.path.extend(("lib", "../lib"))

from jupytertools import *

fix_logging(sc.settings)


mito_genes = pd.read_csv(os.path.join(table_dir, "mitochondrial_genes.tsv"), sep="\t")[
    "Gene name"
].values
biomart = pd.read_csv(os.path.join(table_dir, "biomart.tsv"), sep="\t")
ribo_genes = pd.read_csv(
    os.path.join(table_dir, "ribosomal_genes.tsv"), sep="\t", comment="#"
)["Approved symbol"].values


adata = sc.read_h5ad(input_file)


print(adata)

AnnData object with n_obs × n_vars = 71401 × 33514
    obs: 'IR_VJ_1_locus', 'IR_VJ_2_locus', 'IR_VDJ_1_locus', 'IR_VDJ_2_locus', 'IR_VJ_1_cdr3', 'IR_VJ_2_cdr3', 'IR_VDJ_1_cdr3', 'IR_VDJ_2_cdr3', 'IR_VJ_1_cdr3_nt', 'IR_VJ_2_cdr3_nt', 'IR_VDJ_1_cdr3_nt', 'IR_VDJ_2_cdr3_nt', 'IR_VJ_1_expr', 'IR_VJ_2_expr', 'IR_VDJ_1_expr', 'IR_VDJ_2_expr', 'IR_VJ_1_expr_raw', 'IR_VJ_2_expr_raw', 'IR_VDJ_1_expr_raw', 'IR_VDJ_2_expr_raw', 'IR_VJ_1_v_gene', 'IR_VJ_2_v_gene', 'IR_VDJ_1_v_gene', 'IR_VDJ_2_v_gene', 'IR_VJ_1_d_gene', 'IR_VJ_2_d_gene', 'IR_VDJ_1_d_gene', 'IR_VDJ_2_d_gene', 'IR_VJ_1_j_gene', 'IR_VJ_2_j_gene', 'IR_VDJ_1_j_gene', 'IR_VDJ_2_j_gene', 'IR_VJ_1_c_gene', 'IR_VJ_2_c_gene', 'IR_VDJ_1_c_gene', 'IR_VDJ_2_c_gene', 'IR_VJ_1_junction_ins', 'IR_VJ_2_junction_ins', 'IR_VDJ_1_junction_ins', 'IR_VDJ_2_junction_ins', 'has_ir', 'multi_chain', 'samples', 'n_genes', 'patient', 'origin', 'replicate', 'dataset', 'tumor_type', 'platform', 'age', 'sex', 'hpv_status', 'ir_status', 'facs_purity_cd3', 'facs_purity_cd56'


def compute_quality_metrics(adata):
    tmp_mito = [g for g in mito_genes if g in adata.var_names]
    adata.obs["mt_frac"] = np.sum(adata[:, tmp_mito].X, axis=1) / np.sum(
        adata.X, axis=1
    )
    adata.obs["n_counts"] = adata.X.sum(axis=1)
    adata.obs["n_genes"] = (adata.X != 0).sum(axis=1)
    adata.obs["rk_n_counts"] = adata.obs["n_counts"].rank(
        ascending=False, method="first"
    )
    adata.obs["rk_n_genes"] = adata.obs["n_genes"].rank(ascending=False, method="first")
    adata.obs["rk_mt_frac"] = adata.obs["mt_frac"].rank(ascending=True, method="first")


MIN_COUNTS = 2000
MIN_GENES = 700
MAX_MITO = 0.11
MIN_CELLS = 50
DOUBLET_THRES = 0.4


assert adata.var_names.is_unique


sc.pp.filter_cells(adata, min_genes=100)
compute_quality_metrics(adata)
print_dim(adata)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

Current dimensions of adata: 71401 cells x 33514 genes.


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sc.pl.violin(adata, "n_counts", groupby="samples", log=True, cut=0, ax=ax1, show=False)
sc.pl.violin(adata, "mt_frac", groupby="samples", ax=ax2, show=False)
ax1.set_title("Count depth (counts per barcode)")
ax2.set_title("Mitochondrial fraction per barcode")
fig.show()
os.makedirs("./tmp", exist_ok=True)
fig.savefig("./tmp/qc_dist_per_sample_before.png")

/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
print_dim(adata)

filtered out 16522 genes that are detected in less than 50 cells

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

Current dimensions of adata: 71401 cells x 16992 genes.


ax1 = sc.pl.scatter(
    adata,
    x="n_counts",
    y="n_genes",
    color="mt_frac",
    title="detected genes vs. count depth",
    show=False,
)
ax1.hlines(y=MIN_GENES, xmin=0, xmax=ax1.get_xlim()[1], linewidth=1, color="red")
ax1.vlines(x=MIN_COUNTS, ymin=0, ymax=ax1.get_ylim()[1], linewidth=1, color="red")
plt.show()

ax2 = sc.pl.scatter(
    adata[adata.obs["n_counts"] < 10000],
    x="n_counts",
    y="n_genes",
    color="mt_frac",
    title="detected genes vs. count depth (zoomed in)",
    show=False,
)
ax2.hlines(y=MIN_GENES, xmin=0, xmax=ax2.get_xlim()[1], linewidth=1, color="red")
ax2.vlines(x=MIN_COUNTS, ymin=0, ymax=ax2.get_ylim()[1], linewidth=1, color="red")
plt.show()

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


ax1 = sns.distplot(adata.obs["n_genes"], kde=False, bins=60)
ax1.set_title("Distribution: detected genes")
ax1.vlines(x=MIN_GENES, color="red", linewidth=1, ymin=0, ymax=ax1.get_ylim()[1])
plt.show()

ax2 = sc.pl.scatter(
    adata,
    x="rk_n_genes",
    y="n_genes",
    color="mt_frac",
    legend_loc="none",
    title="Distribution: detected genes",
    show=False,
)
ax2.hlines(y=MIN_GENES, color="red", linewidth=1, xmin=0, xmax=ax2.get_xlim()[1])
plt.show()

ax3 = sc.pl.scatter(
    adata,
    x="rk_n_counts",
    y="n_counts",
    color="mt_frac",
    legend_loc="none",
    title="Distribution: read counts",
    show=False,
)
ax3.hlines(y=MIN_COUNTS, color="red", linewidth=1, xmin=0, xmax=ax3.get_xlim()[1])
ax3.set_yscale("log")
plt.show()

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)


ax4 = sns.distplot(adata.obs["mt_frac"], kde=False, bins=60)
ax4.set_title("Distribution: fraction mito reads")
ax4.vlines(x=MAX_MITO, color="red", linewidth=1, ymin=0, ymax=ax4.get_ylim()[1])
plt.show()

ax5 = sc.pl.scatter(
    adata,
    x="rk_mt_frac",
    y="mt_frac",
    color="mt_frac",
    show=False,
    legend_loc="none",
    title="Distribution: fraction mito reads",
)
ax5.hlines(y=MAX_MITO, color="red", linewidth=1, xmin=0, xmax=ax5.get_xlim()[1])
plt.show()


print_dim(adata)

Current dimensions of adata: 71401 cells x 16992 genes.


sc.pp.filter_cells(adata, min_genes=MIN_GENES)
print_dim(adata)

filtered out 30847 cells that have less than 700 genes expressed
Current dimensions of adata: 40554 cells x 16992 genes.


sc.pp.filter_cells(adata, min_counts=MIN_COUNTS)
print_dim(adata)

filtered out 3793 cells that have less than 2000 counts
Current dimensions of adata: 36761 cells x 16992 genes.


adata = adata[adata.obs["mt_frac"] < MAX_MITO, :]
print_dim(adata)

Current dimensions of adata: 35052 cells x 16992 genes.

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


adata = adata[:, ~adata.var_names.isin(np.append(mito_genes, ribo_genes))]
print_dim(adata)

Current dimensions of adata: 35052 cells x 16818 genes.


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sc.pl.violin(adata, "n_counts", groupby="samples", log=True, cut=0, ax=ax1, show=False)
sc.pl.violin(adata, "mt_frac", groupby="samples", ax=ax2, show=False)
ax1.set_title("Count depth (counts per barcode)")
ax2.set_title("Mitochondrial fraction per barcode")
fig.show()
fig.savefig("./tmp/qc_dist_per_sample_after.png")

/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/opt/conda/lib/python3.8/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


ax1 = sc.pl.scatter(
    adata,
    x="n_counts",
    y="n_genes",
    color="mt_frac",
    title="detected genes vs. count depth",
    show=False,
)
ax1.hlines(y=MIN_GENES, xmin=0, xmax=ax1.get_xlim()[1], linewidth=1, color="red")
ax1.vlines(x=MIN_COUNTS, ymin=0, ymax=ax1.get_ylim()[1], linewidth=1, color="red")
plt.show()

ax2 = sc.pl.scatter(
    adata[adata.obs["n_counts"] < 10000],
    x="n_counts",
    y="n_genes",
    color="mt_frac",
    title="detected genes vs. count depth (zoomed in)",
    show=False,
)
ax2.hlines(y=MIN_GENES, xmin=0, xmax=ax2.get_xlim()[1], linewidth=1, color="red")
ax2.vlines(x=MIN_COUNTS, ymin=0, ymax=ax2.get_ylim()[1], linewidth=1, color="red")
plt.show()

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


ir.tl.chain_qc(adata)

Trying to set attribute `.obs` of view, copying.


ir.pl.group_abundance(adata, groupby="receptor_subtype", target_col="samples")

... storing 'receptor_type' as categorical
... storing 'receptor_subtype' as categorical
... storing 'chain_pairing' as categorical

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])

<AxesSubplot:title={'center':'Number of cells in receptor_subtype by samples'}, xlabel='receptor_subtype', ylabel='Number of cells'>


ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="samples")

<AxesSubplot:title={'center':'Number of cells in chain_pairing by samples'}, xlabel='chain_pairing', ylabel='Number of cells'>


tcr_filter = adata.obs["chain_pairing"].isin(
    ["multichain", "extra VDJ", "two full chains"]
)
print_dim(adata)
adata = adata[~tcr_filter, :].copy()
print_dim(adata)

Current dimensions of adata: 35052 cells x 16818 genes.
Current dimensions of adata: 29843 cells x 16818 genes.


ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="samples")

<AxesSubplot:title={'center':'Number of cells in chain_pairing by samples'}, xlabel='chain_pairing', ylabel='Number of cells'>


## Normalize (for visualization only, we will save the raw counts!)
# sc.pp.filter_genes(adata, min_cells=1)
adata_vis = adata.copy()
sc.pp.normalize_per_cell(adata_vis, counts_per_cell_after=1e4)
sc.pp.log1p(adata_vis)
sc.pp.filter_genes(adata_vis, min_cells=1)

normalizing by total count per cell
    finished ({time_passed}): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


sc.tl.pca(adata_vis, svd_solver="arpack")
sc.pp.neighbors(adata_vis, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata_vis)

computing PCA
    with n_comps=50
    finished
computing neighbors
    using 'X_pca' with n_pcs = 40
    finished
computing UMAP
    finished


sc.tl.leiden(adata_vis)

running Leiden clustering
    finished


sc.pl.umap(
    adata_vis,
    color=[
        "n_genes",
        "n_counts",
        "mt_frac",
        "samples",
        "origin",
        "leiden",
        "chain_pairing",
        "has_ir",
    ],
    ncols=2,
)


adata.write(output_file, compression="lzf")

Quality control (QC) and filtering¶

Input data and configuration¶

Quality Metrics¶

Gene level filtering¶

Ratio of counts to number of mitochondrial genes¶

Count depth and detected genes¶

Mitochondrial reads¶

Apply filtering by quality metrics¶

exclude ribosomal and mitochondrial genes¶

QC plots after filtering¶

Immune-receptor-based filtering¶

UMAP plot by covariates¶

save result¶