import scanpy as sc
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np


crc_counts = "../data/external/Zhang_Zhang_2019_CRC/smartseq2_pipeline/resultCOUNT.txt"
nsclc_counts = (
    "../data/external/Guo_Zhang_2018_NSCLC/smartseq2_pipeline/resultCOUNT.txt"
)
hcc_counts = "../data/external/Zheng_Zhang_2017_HCC/smartseq2_pipeline/resultCOUNT.txt"
crc_patient_info = "../data/external/Zhang_Zhang_2019_CRC/scripts/patient_data.tsv"
nsclc_patient_info = "../data/external/Guo_Zhang_2018_NSCLC/scripts/patient_table.tsv"
hcc_patient_info = "../data/external/Zheng_Zhang_2017_HCC//scripts/patient_table.tsv"


# Parameters
crc_counts = "CRC/resultCOUNT.txt"
crc_patient_info = "CRC/patient_data.tsv"
hcc_counts = "HCC/resultCOUNT.txt"
hcc_patient_info = "HCC/patient_table.tsv"
nsclc_counts = "NSCLC/resultCOUNT.txt"
nsclc_patient_info = "NSCLC/patient_table.tsv"


counts = {
    "crc": pd.read_csv(crc_counts, sep="\t", index_col=0),
    "nsclc": pd.read_csv(nsclc_counts, sep="\t", index_col=0),
    "hcc": pd.read_csv(hcc_counts, sep="\t", index_col=0),
}


patient_info = {
    "crc": pd.read_csv(
        crc_patient_info,
        sep="\t",
        index_col="Cell name",
    ).rename({"Cell type": "sampleType"}, axis="columns"),
    "nsclc": pd.read_csv(nsclc_patient_info, sep="\t", index_col="UniqueCell_ID"),
    "hcc": pd.read_csv(hcc_patient_info, sep="\t", index_col="UniqueCell_ID"),
}


for counts_df in counts.values():
    counts_df.columns = ["_".join(x.split("_")[2:]) for x in counts_df.columns]


for counts_df in counts.values():
    counts_df.columns = counts_df.columns.str.replace("ZZM_", "")


patient_info["crc"].index = patient_info["crc"].index.str.replace("P0701", "0701")


adatas = {}
for key in ["crc", "nsclc", "hcc"]:
    counts_df = counts[key]
    patient_df = patient_info[key]
    cell_ids = list(set(counts_df.columns) & set(patient_df.index.values))
    adata = sc.AnnData(X=counts_df.loc[:, cell_ids].T, obs=patient_df.loc[cell_ids, :])
    adatas[key] = adata


for key, adata in adatas.items():
    print(key, adata.shape)

crc (10803, 60642)
nsclc (11477, 60642)
hcc (5050, 60642)


# os.makedirs("./tmp", exist_ok=True)
# for key, adata in adatas.items():
#     adata.write_h5ad(f"./tmp/adata_{key}.h5ad", compression='lzf')


for key, adata in adatas.items():
    adata.var["mito"] = adata.var_names.str.startswith("MT-")
    sc.pp.calculate_qc_metrics(adata, qc_vars=("mito",), inplace=True)
    ax = sns.distplot(adata.obs["pct_counts_mito"].dropna())
    ax.set_title(key)
    plt.show()
    ax = sns.distplot(adata.obs["n_genes_by_counts"].dropna())
    ax.set_title(key)
    plt.show()

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)


MIN_CELLS = 10
MAX_MITO = 16
MIN_GENES = 2000
MAX_GENES = 8000


for key, adata in adatas.items():
    sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
    sc.pp.filter_cells(adata, min_genes=MIN_GENES)
    sc.pp.filter_cells(adata, max_genes=MAX_GENES)
    adatas[key] = adata[adata.obs["pct_counts_mito"] < MAX_MITO, :]
    print(key, adatas[key].shape)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

crc (10591, 41214)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

nsclc (9605, 38675)
hcc (4939, 30925)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


# adatas_bk = {k: adata.copy() for k, adata in adatas.items()}


# adatas = {k: adata.copy() for k, adata in adatas_bk.items()}


for key, adata in adatas.items():
    ct_map = {"C": "CD8", "H": "CD4", "R": "Treg", "Y": "CD4", "7": "CD4"}
    adata.obs["cell_type"] = [ct_map[x[2]] for x in adata.obs["sampleType"]]
    adatas[key] = adata[
        adata.obs["sampleType"].str.startswith("T")
        | adata.obs["sampleType"].str.startswith("N"),
        :,
    ].copy()

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


for key, adata in adatas.items():
    print(key)
    sc.pp.normalize_total(adata, target_sum=1000)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="cell_ranger")
    adata.raw = adata
    sc.pp.combat(adata, key="Patient")

crc

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Patient' as categorical
... storing 'sampleType' as categorical
... storing 'Subtype' as categorical
... storing 'Total counts' as categorical
... storing 'Number of expressed gene' as categorical
... storing 'Cluster' as categorical
... storing 'Invariant TCR' as categorical
... storing 'Clone ID' as categorical
... storing 'Clone status' as categorical
... storing 'Identifier (Alpha1)' as categorical
... storing 'CDR3 (Alpha1)' as categorical
... storing 'VDJ (Alpha1)' as categorical
... storing 'Identifier (Alpha2)' as categorical
... storing 'CDR3 (Alpha2)' as categorical
... storing 'VDJ (Alpha2)' as categorical
... storing 'Identifier (Beta1)' as categorical
... storing 'CDR3 (Beta1)' as categorical
... storing 'VDJ (Beta1)' as categorical
... storing 'Identifier (Beta2)' as categorical
... storing 'CDR3 (Beta2)' as categorical
... storing 'VDJ (Beta2)' as categorical
... storing 'cell_type' as categorical

Found 1 genes with zero variance.

/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: invalid value encountered in true_divide
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in true_divide
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()

nsclc

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Patient' as categorical
... storing 'majorCluster' as categorical
... storing 'sampleType' as categorical
... storing 'cell_type' as categorical

Found 14 genes with zero variance.

/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: invalid value encountered in true_divide
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in true_divide
  (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()

hcc

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'Patient' as categorical
... storing 'majorCluster' as categorical
... storing 'sampleType' as categorical
... storing 'cell_type' as categorical


for key, adata in adatas.items():
    sc.tl.pca(adata, svd_solver="arpack")
    sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
    sc.tl.umap(adata)

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):


for key, adata in adatas.items():
    print(key)
    sc.pl.umap(adata, color=["Patient", "sampleType", "cell_type"])

crc

nsclc

hcc


colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20))
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)


def make_plots(key, **kwargs):
    adata = adatas[key]
    sc.pl.umap(
        adata, color=["cell_type", "CD8A", "CD4", "FOXP3", "KLRB1"], ncols=3, **kwargs, cmap=mymap
    )


make_plots("nsclc", size=40)


make_plots("crc",  size=40)


make_plots("hcc", size=60)

CD161 expression in external datasets¶

Cell-types in the datasets¶

Results¶

NSCLC¶

CRC¶

HCC¶

Conclusion¶