In this notebook, we remove doublets detected by solo, normalize and log-transform the data, and compute per-sample quality statistics.
input_file = "../results/02_filter_data/adata.h5ad"
adata_unfiltered_file = "../results/01_process_data/adata.h5ad"
tables_dir = "../tables"
output_file = "tmp/adata.h5ad"
output_file_stats = "tmp/quality_stats.csv"
doublet_file = f"{tables_dir}/is_doublet.npy"
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
tables_dir = "tables"
doublet_file = "is_doublet.npy"
adata_unfiltered_file = "adata_unfiltered.h5ad"
output_file_stats = "quality_stats.csv"
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import warnings
from numba import NumbaWarning
import sys
import os
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import fix_logging, print_dim
from scpp import norm_log
fix_logging(sc.settings)
warnings.filterwarnings("ignore", category=NumbaWarning)
cell_cycle_regev = pd.read_csv(
os.path.join(tables_dir, "cell_cycle_regev.tsv"), sep="\t"
)
cell_cycle_regev = cell_cycle_regev[["hgnc_symbol", "phase"]].drop_duplicates()
pca_file = os.path.join(tables_dir, "adata_pca.pkl.gz")
adata = sc.read_h5ad(input_file)
adata_unfiltered = sc.read_h5ad(adata_unfiltered_file)
We don't run solo as part of the pipeline, as its results are not reproducible across different systems. Instead, we load pre-computed results from the repository. How solo was originally run is described in main.nf.
is_doublet = np.load(doublet_file)
adata.obs["is_doublet"] = is_doublet
The raw attribute of the AnnData object will contain normalized, log-transformed values for visualization. The original raw (UMI) counts are stored in adata.obsm["raw_counts"].
We use the straightforward normalization by library size as implemented in scanpy.
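For orientation, a helper like norm_log could be implemented as a thin wrapper around scanpy. This is a minimal sketch under that assumption (the function name and exact behavior are illustrative; the actual implementation lives in the scpp module):

def norm_log_sketch(adata):
    """Sketch of library-size normalization + log-transform (assumed behavior of norm_log)."""
    adata.obsm["raw_counts"] = adata.X.copy()  # preserve raw UMI counts, as described above
    sc.pp.normalize_per_cell(adata)            # library-size normalization; adds 'n_counts' to adata.obs
    sc.pp.log1p(adata)                         # log(1 + x) transform
    adata.raw = adata                          # normalized values kept for visualization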
norm_log(adata)
sc.pp.pca(adata, svd_solver="arpack")
normalizing by total count per cell finished: normalized adata.X and added 'n_counts', counts per cell before normalization (adata.obs)
computing PCA with n_comps=50 finished
sc.pl.pca_variance_ratio(adata)
sc.pp.neighbors(adata, n_pcs=30)
sc.tl.umap(adata)
computing neighbors using 'X_pca' with n_pcs = 30 finished
computing UMAP finished
sc.tl.score_genes_cell_cycle(
adata,
s_genes=cell_cycle_regev.loc[
cell_cycle_regev["phase"] == "S", "hgnc_symbol"
].values,
g2m_genes=cell_cycle_regev.loc[
cell_cycle_regev["phase"] == "G2M", "hgnc_symbol"
].values,
)
calculating cell cycle phase
computing score 'S_score'
genes are not in var_names and ignored: ['MLF1IP']
finished
computing score 'G2M_score'
genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished
'phase', cell cycle phase (adata.obs)
sc.pl.umap(
adata,
color=["samples", "n_genes", "n_counts", "is_doublet", "chain_pairing"],
ncols=3,
)
... storing 'phase' as categorical
print_dim(adata)
adata = adata[~adata.obs["is_doublet"], :].copy()
print_dim(adata)
Current dimensions of adata: 29843 cells x 16818 genes.
Current dimensions of adata: 28296 cells x 16818 genes.
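The doublet filter thus removes 29843 - 28296 = 1547 cells, about 5% of the dataset.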
Generate a summary table with per-sample quality metrics.
The first two metrics are based on the raw files (FASTQ and BAM), i.e. before UMI deduplication, and are read in from precomputed tables. The other columns are based on the counts produced by cellranger and are computed on the AnnData object.
sequenced_reads = pd.read_csv(
f"{tables_dir}/summary_fastq_counts.txt", names=["samples", "total_sequenced_reads"]
)
sequenced_reads["samples"] = [f"H{x}" for x in sequenced_reads["samples"]]
sequenced_reads.set_index("samples", inplace=True)
uniquely_mapped_reads = pd.read_csv(
f"{tables_dir}/summary_uniquely_mapped_reads.txt",
names=["samples", "total_uniquely_mapped_reads"],
)
uniquely_mapped_reads["samples"] = [f"H{x}" for x in uniquely_mapped_reads["samples"]]
uniquely_mapped_reads.set_index("samples", inplace=True)
called_cells = (
adata.obs.groupby("samples")
.size()
.reset_index(name="total_called_cells")
.set_index("samples")
)
We need to revert to the unfiltered AnnData object to get statistics on ribosomal genes, as we removed those genes earlier. The statistics need to be computed on "called cells" (after doublet filtering), so we can't compute them in the earlier notebook either.
ribo_genes = pd.read_csv(
os.path.join(tables_dir, "ribosomal_genes.tsv"), sep="\t", comment="#"
)["Approved symbol"].values
adata.shape
(28296, 16818)
# only keep 'called cells'
adata_unfiltered = adata_unfiltered[adata.obs_names, :].copy()
adata_unfiltered.shape
(28296, 33514)
adata_unfiltered.var["is_ribo"] = adata_unfiltered.var_names.isin(ribo_genes)
adata.obs["ribo_frac"] = np.sum(
adata_unfiltered[:, adata_unfiltered.var["is_ribo"]].X, axis=1
) / np.sum(adata_unfiltered.X, axis=1)
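Note that if .X is stored as a sparse matrix, summing along an axis yields a column matrix rather than a flat array; a defensive variant (illustrative only, with hypothetical variable names) flattens explicitly:

# Equivalent computation that is robust to sparse .X (illustrative):
ribo_counts = np.asarray(adata_unfiltered[:, adata_unfiltered.var["is_ribo"]].X.sum(axis=1)).ravel()
total_counts = np.asarray(adata_unfiltered.X.sum(axis=1)).ravel()
adata.obs["ribo_frac"] = ribo_counts / total_counts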
gene_stats = adata.obs.groupby("samples").agg(
median_genes=pd.NamedAgg(column="n_genes", aggfunc="median"),
min_genes=pd.NamedAgg(column="n_genes", aggfunc="min"),
max_genes=pd.NamedAgg(column="n_genes", aggfunc="max"),
median_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="median"),
min_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="min"),
max_uniquely_mapped_reads=pd.NamedAgg(column="n_counts", aggfunc="max"),
median_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="median"),
min_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="min"),
max_ribosomal_read_fraction=pd.NamedAgg(column="ribo_frac", aggfunc="max"),
)
quality_table = sequenced_reads.join(uniquely_mapped_reads).join(called_cells)
quality_table["median_uniquely_mapped_reads_per_cell"] = [
f'{gene_stats.loc[s, "median_uniquely_mapped_reads"]:.0f} '
f'({gene_stats.loc[s, "min_uniquely_mapped_reads"]:.0f} - '
f'{gene_stats.loc[s, "max_uniquely_mapped_reads"]:.0f})'
for s in quality_table.index
]
quality_table["median_rrna_rate_per_cell"] = [
f'{gene_stats.loc[s, "median_ribosomal_read_fraction"]:.2f} '
f'({gene_stats.loc[s, "min_ribosomal_read_fraction"]:.2f} - '
f'{gene_stats.loc[s, "max_ribosomal_read_fraction"]:.2f})'
for s in quality_table.index
]
quality_table["median_detected_genes_per_cell"] = [
f'{gene_stats.loc[s, "median_genes"]:.0f} '
f'({gene_stats.loc[s, "min_genes"]:.0f} - '
f'{gene_stats.loc[s, "max_genes"]:.0f})'
for s in quality_table.index
]
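The three list comprehensions above repeat the same "median (min - max)" pattern; a small helper (hypothetical, not part of the original code) would remove the duplication:

def fmt_range(stats, metric, digits=0):
    """Format 'median (min - max)' strings for one metric across samples (illustrative)."""
    return [
        f'{stats.loc[s, f"median_{metric}"]:.{digits}f} '
        f'({stats.loc[s, f"min_{metric}"]:.{digits}f} - '
        f'{stats.loc[s, f"max_{metric}"]:.{digits}f})'
        for s in stats.index
    ]

# e.g.: quality_table["median_detected_genes_per_cell"] = fmt_range(gene_stats.loc[quality_table.index], "genes")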
quality_table
samples | total_sequenced_reads | total_uniquely_mapped_reads | total_called_cells | median_uniquely_mapped_reads_per_cell | median_rrna_rate_per_cell | median_detected_genes_per_cell
---|---|---|---|---|---|---
H68 | 145639312 | 129201630 | 2801 | 2846 (1237 - 45037) | 0.24 (0.01 - 0.45) | 1490 (740 - 6241)
H188 | 228929000 | 190873981 | 2063 | 2654 (1206 - 38909) | 0.25 (0.01 - 0.43) | 1421 (727 - 4787)
H208 | 465578498 | 388028467 | 2284 | 3498 (1154 - 64179) | 0.21 (0.02 - 0.45) | 1753 (766 - 6586)
... | ... | ... | ... | ... | ... | ...
H182 | 286349621 | 252008749 | 2561 | 2587 (1165 - 43770) | 0.24 (0.01 - 0.43) | 1408 (727 - 6566)
H205 | 385387851 | 341106543 | 1178 | 2940 (1158 - 63142) | 0.18 (0.02 - 0.41) | 1521 (702 - 7983)
H143 | 281522477 | 243022789 | 1760 | 2560 (1184 - 55511) | 0.23 (0.01 - 0.38) | 1450 (737 - 7160)

13 rows × 6 columns (output truncated)
quality_table.to_csv(output_file_stats)
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=6000)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes finished
added 'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
# PCA turned out not to be entirely reproducible on different CPU architectures.
# For the sake of reproducibility of these notebooks, we load a pre-computed result
# from the repository. If it doesn't exist, we compute it from scratch and cache it.
try:
    adata.obsm["X_pca"] = pd.read_pickle(pca_file).values
except IOError:
    sc.tl.pca(adata, svd_solver="arpack")
    pd.DataFrame(adata.obsm["X_pca"]).to_pickle(pca_file)
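With the cached embedding loaded verbatim, the neighbors graph, UMAP, and Leiden clustering below start from identical input on every machine, which should keep the downstream results reproducible (given identical library versions).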
sc.pp.neighbors(adata, n_pcs=30)
sc.tl.umap(adata)
sc.tl.leiden(adata)
computing neighbors using 'X_pca' with n_pcs = 30 finished
computing UMAP finished
running Leiden clustering finished
sc.pl.umap(
adata,
color=["samples", "n_genes", "n_counts", "chain_pairing"],
)
adata.write(output_file, compression="lzf")