In this notebook, we will normalize and log-transform the data, remove doublets detected with solo, score cell-cycle phases, and compute PCA/UMAP embeddings and Leiden clusters.
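# Default parameters, overridden at runtime by the injected '# Parameters' cell below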
input_file = "../results/02_filter_data/adata.h5ad"
tables_dir = "../tables"
output_file = "tmp/adata.h5ad"
doublet_file = f"{tables_dir}/is_doublet.npy"
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
tables_dir = "tables"
doublet_file = "is_doublet.npy"
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
import warnings
from numba import NumbaWarning
import sys
import os
sys.path.append("lib")
sys.path.append("../lib")
from jupytertools import fix_logging, print_dim
from scpp import norm_log
fix_logging(sc.settings)
warnings.filterwarnings("ignore", category=NumbaWarning)
cell_cycle_regev = pd.read_csv(
os.path.join(tables_dir, "cell_cycle_regev.tsv"), sep="\t"
)
cell_cycle_regev = cell_cycle_regev[["hgnc_symbol", "phase"]].drop_duplicates()
pca_file = os.path.join(tables_dir, "adata_pca.pkl.gz")
adata = sc.read_h5ad(input_file)
We don't run solo as part of the pipeline because its results are not reproducible across different systems. Instead, we load pre-computed results from the repository. How solo was run initially is described in main.nf.
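For illustration only, here is a minimal sketch of how comparable doublet calls could be produced with the SOLO implementation in scvi-tools; this is an assumed equivalent workflow, not the project's actual invocation (which is documented in main.nf).

import scvi
from scvi.external import SOLO

# Hypothetical sketch: fit an SCVI model on the raw counts, then train SOLO on top of it
scvi.model.SCVI.setup_anndata(adata)
vae = scvi.model.SCVI(adata)
vae.train()
solo_model = SOLO.from_scvi_model(vae)
solo_model.train()
# predict() returns per-cell doublet/singlet probabilities; the 0.5 cutoff is an arbitrary choice
probs = solo_model.predict()
np.save(doublet_file, (probs["doublet"] > 0.5).to_numpy())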
is_doublet = np.load(doublet_file)
adata.obs["is_doublet"] = is_doublet
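As a quick sanity check (an optional addition, not part of the original pipeline), we can inspect how many cells were flagged:

# is_doublet is a boolean array, so sum/mean give the count and fraction of flagged cells
print(f"Flagged as doublets: {is_doublet.sum()} of {is_doublet.size} cells ({is_doublet.mean():.1%})")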
The raw data object (adata.raw) will contain normalized, log-transformed values for visualization. The original raw (UMI) counts are stored in adata.obsm["raw_counts"].
We use straightforward normalization by library size, as implemented in scanpy.
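The norm_log helper comes from the project's scpp module. As a rough sketch, assuming the layout described above, it presumably does something along these lines (the actual implementation may differ):

def norm_log_sketch(adata):
    # keep the original UMI counts around, as described above
    adata.obsm["raw_counts"] = adata.X.copy()
    # scale each cell to the same total count, then log-transform
    sc.pp.normalize_total(adata)
    sc.pp.log1p(adata)
    # freeze the normalized, log-transformed values for plotting via adata.raw
    adata.raw = adata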
norm_log(adata)
sc.pp.pca(adata, svd_solver="arpack")
sc.pl.pca_variance_ratio(adata)
sc.pp.neighbors(adata, n_pcs=30)
sc.tl.umap(adata)
sc.tl.score_genes_cell_cycle(
adata,
s_genes=cell_cycle_regev.loc[
cell_cycle_regev["phase"] == "S", "hgnc_symbol"
].values,
g2m_genes=cell_cycle_regev.loc[
cell_cycle_regev["phase"] == "G2M", "hgnc_symbol"
].values,
)
calculating cell cycle phase
computing score 'S_score'; genes not in var_names and ignored: ['MLF1IP']
computing score 'G2M_score'; genes not in var_names and ignored: ['FAM64A', 'HN1']
added 'phase', cell cycle phase (adata.obs)
sc.pl.umap(
adata,
color=["samples", "n_genes", "n_counts", "is_doublet", "chain_pairing"],
ncols=3,
)
print_dim(adata)
adata = adata[~adata.obs["is_doublet"], :].copy()
print_dim(adata)
Current dimensions of adata: 29843 cells x 16818 genes.
Current dimensions of adata: 28296 cells x 16818 genes.
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=6000)
# PCA turned out not to be entirely reproducible on different CPU architectures.
# For the sake of reproducibility of these notebooks, we load a pre-computed result
# from the repository. If it doesn't exist, we compute it from scratch.
try:
    adata.obsm["X_pca"] = pd.read_pickle(pca_file).values
except IOError:
    sc.tl.pca(adata, svd_solver="arpack")
    pd.DataFrame(adata.obsm["X_pca"]).to_pickle(pca_file)
sc.pp.neighbors(adata, n_pcs=30)
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(
adata,
color=["samples", "n_genes", "n_counts", "chain_pairing"],
)
adata.write(output_file, compression="lzf")