import scanpy as sc
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
crc_counts = "../data/external/Zhang_Zhang_2019_CRC/smartseq2_pipeline/resultCOUNT.txt"
nsclc_counts = (
"../data/external/Guo_Zhang_2018_NSCLC/smartseq2_pipeline/resultCOUNT.txt"
)
hcc_counts = "../data/external/Zheng_Zhang_2017_HCC/smartseq2_pipeline/resultCOUNT.txt"
crc_patient_info = "../data/external/Zhang_Zhang_2019_CRC/scripts/patient_data.tsv"
nsclc_patient_info = "../data/external/Guo_Zhang_2018_NSCLC/scripts/patient_table.tsv"
hcc_patient_info = "../data/external/Zheng_Zhang_2017_HCC//scripts/patient_table.tsv"
# Parameters
crc_counts = "CRC/resultCOUNT.txt"
crc_patient_info = "CRC/patient_data.tsv"
hcc_counts = "HCC/resultCOUNT.txt"
hcc_patient_info = "HCC/patient_table.tsv"
nsclc_counts = "NSCLC/resultCOUNT.txt"
nsclc_patient_info = "NSCLC/patient_table.tsv"
To make sure that the distribution of CD161 expression across various subtypes of CD4+ T-cells is not arising from issues of the 10x single-cell datasets, we check three publicly available Smart-seq2 datasets (CRC, NSCLC and HCC) in this notebook.
Raw data have been obtained from EGA under controlled access and processed using nf-core smartseq2 pipeline.
counts = {
"crc": pd.read_csv(crc_counts, sep="\t", index_col=0),
"nsclc": pd.read_csv(nsclc_counts, sep="\t", index_col=0),
"hcc": pd.read_csv(hcc_counts, sep="\t", index_col=0),
}
patient_info = {
"crc": pd.read_csv(
crc_patient_info,
sep="\t",
index_col="Cell name",
).rename({"Cell type": "sampleType"}, axis="columns"),
"nsclc": pd.read_csv(nsclc_patient_info, sep="\t", index_col="UniqueCell_ID"),
"hcc": pd.read_csv(hcc_patient_info, sep="\t", index_col="UniqueCell_ID"),
}
for counts_df in counts.values():
counts_df.columns = ["_".join(x.split("_")[2:]) for x in counts_df.columns]
for counts_df in counts.values():
counts_df.columns = counts_df.columns.str.replace("ZZM_", "")
patient_info["crc"].index = patient_info["crc"].index.str.replace("P0701", "0701")
adatas = {}
for key in ["crc", "nsclc", "hcc"]:
counts_df = counts[key]
patient_df = patient_info[key]
cell_ids = list(set(counts_df.columns) & set(patient_df.index.values))
adata = sc.AnnData(X=counts_df.loc[:, cell_ids].T, obs=patient_df.loc[cell_ids, :])
adatas[key] = adata
for key, adata in adatas.items():
print(key, adata.shape)
crc (10803, 60642) nsclc (11477, 60642) hcc (5050, 60642)
# os.makedirs("./tmp", exist_ok=True)
# for key, adata in adatas.items():
# adata.write_h5ad(f"./tmp/adata_{key}.h5ad", compression='lzf')
for key, adata in adatas.items():
adata.var["mito"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=("mito",), inplace=True)
ax = sns.distplot(adata.obs["pct_counts_mito"].dropna())
ax.set_title(key)
plt.show()
ax = sns.distplot(adata.obs["n_genes_by_counts"].dropna())
ax.set_title(key)
plt.show()
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2551: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
MIN_CELLS = 10
MAX_MITO = 16
MIN_GENES = 2000
MAX_GENES = 8000
for key, adata in adatas.items():
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
sc.pp.filter_cells(adata, min_genes=MIN_GENES)
sc.pp.filter_cells(adata, max_genes=MAX_GENES)
adatas[key] = adata[adata.obs["pct_counts_mito"] < MAX_MITO, :]
print(key, adatas[key].shape)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
crc (10591, 41214)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
nsclc (9605, 38675) hcc (4939, 30925)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
We are only interested in cells from tumor and adjacent normal tissue.
# adatas_bk = {k: adata.copy() for k, adata in adatas.items()}
# adatas = {k: adata.copy() for k, adata in adatas_bk.items()}
for key, adata in adatas.items():
ct_map = {"C": "CD8", "H": "CD4", "R": "Treg", "Y": "CD4", "7": "CD4"}
adata.obs["cell_type"] = [ct_map[x[2]] for x in adata.obs["sampleType"]]
adatas[key] = adata[
adata.obs["sampleType"].str.startswith("T")
| adata.obs["sampleType"].str.startswith("N"),
:,
].copy()
Trying to set attribute `.obs` of view, copying. Trying to set attribute `.obs` of view, copying. Trying to set attribute `.obs` of view, copying.
for key, adata in adatas.items():
print(key)
sc.pp.normalize_total(adata, target_sum=1000)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="cell_ranger")
adata.raw = adata
sc.pp.combat(adata, key="Patient")
crc
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'Patient' as categorical ... storing 'sampleType' as categorical ... storing 'Subtype' as categorical ... storing 'Total counts' as categorical ... storing 'Number of expressed gene' as categorical ... storing 'Cluster' as categorical ... storing 'Invariant TCR' as categorical ... storing 'Clone ID' as categorical ... storing 'Clone status' as categorical ... storing 'Identifier (Alpha1)' as categorical ... storing 'CDR3 (Alpha1)' as categorical ... storing 'VDJ (Alpha1)' as categorical ... storing 'Identifier (Alpha2)' as categorical ... storing 'CDR3 (Alpha2)' as categorical ... storing 'VDJ (Alpha2)' as categorical ... storing 'Identifier (Beta1)' as categorical ... storing 'CDR3 (Beta1)' as categorical ... storing 'VDJ (Beta1)' as categorical ... storing 'Identifier (Beta2)' as categorical ... storing 'CDR3 (Beta2)' as categorical ... storing 'VDJ (Beta2)' as categorical ... storing 'cell_type' as categorical
Found 1 genes with zero variance.
/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: invalid value encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max() /opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
nsclc
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'Patient' as categorical ... storing 'majorCluster' as categorical ... storing 'sampleType' as categorical ... storing 'cell_type' as categorical
Found 14 genes with zero variance.
/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: invalid value encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max() /opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
hcc
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'Patient' as categorical ... storing 'majorCluster' as categorical ... storing 'sampleType' as categorical ... storing 'cell_type' as categorical
Found 3 genes with zero variance.
/opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: invalid value encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max() /opt/conda/lib/python3.8/site-packages/scanpy/preprocessing/_combat.py:340: RuntimeWarning: divide by zero encountered in true_divide (abs(g_new - g_old) / g_old).max(), (abs(d_new - d_old) / d_old).max()
for key, adata in adatas.items():
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
for key, adata in adatas.items():
print(key)
sc.pl.umap(adata, color=["Patient", "sampleType", "cell_type"])
crc
nsclc
hcc
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20))
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)
def make_plots(key, **kwargs):
adata = adatas[key]
sc.pl.umap(
adata, color=["cell_type", "CD8A", "CD4", "FOXP3", "KLRB1"], ncols=3, **kwargs, cmap=mymap
)
make_plots("nsclc", size=40)
make_plots("crc", size=40)
make_plots("hcc", size=60)
For all three datasets, we find that KLRB1 expression is not confined to a certain subset of CD4+ T-cells.