Load the cellranger outputs and store them as h5ad.
import scanpy as sc
import scirpy as ir
import anndata
import pandas as pd
import os
import sys
from multiprocessing import Pool
import itertools
# Default parameter values (relative paths).
sample_sheet = "../tables/vanderburg_01_samples.csv"
output_file = "../results/01_process_data/adata.h5ad"
data_dir = "../data"
n_cpus = 16
# Parameters
# NOTE(review): these assignments override the defaults above; this looks like
# a parameter cell injected by a notebook runner — confirm.
# n_cpus is a string here; it is converted with int() where the worker pool is
# created.
sample_sheet = "sample_sheet.csv"
output_file = "adata.h5ad"
data_dir = "data"
n_cpus = "16"
# Read the sample sheet; the "samples" column holds the sample ids.
obs = pd.read_csv(sample_sheet)
# Display only: set_index returns a new frame that is not assigned, so
# "samples" remains a regular column of obs for the code further down.
obs.set_index("samples")
patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
samples | ||||||||||||
H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 |
H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 |
H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 |
H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 |
H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 |
H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 |
H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 |
H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 |
H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 |
H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 |
H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 |
H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 |
H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 |
# Sorted array of sample ids. copy=True requests an independent array, so the
# in-place sort below does not touch the column stored in obs.
dataset_samples = obs["samples"].to_numpy(copy=True)
dataset_samples.sort()
# Display the sorted ids.
dataset_samples
array(['H141', 'H143', 'H149', 'H160', 'H176', 'H182', 'H185', 'H188', 'H197', 'H205', 'H208', 'H211', 'H68'], dtype=object)
def load_sample(sample_id, data_dir):
    """Load the gene expression and TCR data of one sample.

    The cellranger output directories are looked up below
    ``<data_dir>/cellranger`` and are named after the sample id without its
    first character, followed by ``_GEX`` or ``_TCR``.

    Parameters
    ----------
    sample_id
        Sample identifier. It is used as barcode prefix and is stored in
        ``adata.obs["samples"]``.
    data_dir
        Directory that contains the ``cellranger`` folder.

    Returns
    -------
    AnnData object of the sample with the TCR annotation merged in and
    duplicated gene symbols removed.
    """
    gex_path = os.path.join(
        data_dir, f"cellranger/{sample_id[1:]}_GEX/outs/raw_feature_bc_matrix.h5"
    )
    tcr_path = os.path.join(
        data_dir, f"cellranger/{sample_id[1:]}_TCR/outs/filtered_contig_annotations.csv"
    )

    adata = sc.read_10x_h5(gex_path, genome="GRCh38")
    adata_tcr = ir.io.read_10x_vdj(tcr_path)

    # Prefix every barcode with the sample id, for both modalities alike, so
    # that the two objects can still be matched on their obs names.
    for modality in (adata, adata_tcr):
        modality.obs_names = [
            "{}_{}".format(sample_id, barcode) for barcode in modality.obs_names
        ]

    # Boolean mask of repeated gene symbols; with the pandas default the first
    # occurrence of each symbol is not flagged and is therefore kept.
    is_repeated = adata.var_names.duplicated()
    print(
        "Removing {} gene symbols because they are duplicated".format(sum(is_repeated))
    )
    adata = adata[:, ~is_repeated].copy()

    # The return value is not used: the TCR annotation is added to adata itself.
    ir.pp.merge_with_ir(adata, adata_tcr)
    adata.obs["samples"] = sample_id
    return adata
# Load all samples in parallel, one task per sample id. starmap returns the
# results in the order of dataset_samples.
with Pool(int(n_cpus)) as p:
    adatas = p.starmap(
        load_sample, [(sample_id, data_dir) for sample_id in dataset_samples]
    )
Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. 
Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... 
storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... 
storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'multi_chain' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'multi_chain' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical
# Combine the per-sample objects returned by the worker pool into one AnnData.
adata = anndata.concat(adatas)
# coarse filtering, proper QC is done later
# Cells below the min_genes threshold are removed from adata in place.
sc.pp.filter_cells(adata, min_genes=100)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
adata.obs["samples"]
H141_AAACCTGAGATAGCAT-1 H141 H141_AAACCTGAGATCCCAT-1 H141 H141_AAACCTGAGCGGCTTC-1 H141 H141_AAACCTGAGCTGTCTA-1 H141 H141_AAACCTGAGGCATTGG-1 H141 ... H68_TTTGTCAGTTGTTTGG-1 H68 H68_TTTGTCATCAGAGACG-1 H68 H68_TTTGTCATCAGGTAAA-1 H68 H68_TTTGTCATCGTCTGAA-1 H68 H68_TTTGTCATCTCCGGTT-1 H68 Name: samples, Length: 71401, dtype: object
obs
samples | patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 |
1 | H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 |
2 | H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 |
3 | H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 |
4 | H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 |
5 | H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 |
6 | H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 |
7 | H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 |
8 | H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 |
9 | H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 |
10 | H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 |
11 | H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 |
12 | H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 |
obs.set_index("samples")
patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
samples | ||||||||||||
H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 |
H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 |
H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 |
H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 |
H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 |
H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 |
H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 |
H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 |
H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 |
H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 |
H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 |
H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 |
H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 |
# Annotate every cell with the metadata of its sample from the sample sheet.
# The join must not reorder the rows: the frame assigned to adata.obs has to
# list the cells in the same order as the expression matrix. A left join
# without sort=True keeps the row order of adata.obs, whereas sort=True orders
# the result by sample id and only matches the matrix when the samples happen
# to be concatenated in sorted order.
tmp_obs = adata.obs.join(obs.set_index("samples"), on="samples", how="left")
# Guard against a silently misaligned annotation, e.g. when a sample id occurs
# more than once in the sample sheet and the join multiplies the cells.
if not tmp_obs.index.equals(adata.obs.index):
    raise ValueError(
        "Joining the sample sheet changed the order or number of cells; "
        "check the sample sheet for duplicated sample ids."
    )
adata.obs = tmp_obs
adata.obs.loc[:, ["samples", "patient", "facs_purity_cd56"]].drop_duplicates()
samples | patient | facs_purity_cd56 | |
---|---|---|---|
H141_AAACCTGAGATAGCAT-1 | H141 | H141 | 0.025 |
H143_AAACCTGAGCCGATTT-1 | H143 | H143 | 0.008 |
H149_AAACCTGAGACTGTAA-1 | H149 | H149 | 0.033 |
H160_AAACCTGAGAAACCAT-1 | H160 | H160 | 0.067 |
H176_AAACCTGAGAAGGTTT-1 | H176 | H176 | 0.108 |
H182_AAACCTGAGCCCAACC-1 | H182 | H182 | 0.109 |
H185_AAACCTGAGACGCACA-1 | H185 | H185 | 0.163 |
H188_AAACCTGAGAGACTAT-1 | H188 | H188 | 0.087 |
H197_AAACCTGAGGCTCTTA-1 | H197 | H197 | 0.171 |
H205_AAACCTGAGTTTAGGA-1 | H205 | H205 | 0.028 |
H208_AAACCTGAGAACTGTA-1 | H208 | H208 | 0.323 |
H211_AAACCTGAGGATATAC-1 | H211 | H211 | 0.029 |
H68_AAACCTGCACATTCGA-1 | H68 | H68 | 0.138 |
adata.shape
(71401, 33514)
# Store the combined, annotated object at output_file using lzf compression.
adata.write(output_file, compression="lzf")
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_v_gene' as categorical ... storing 'IR_VJ_2_v_gene' as categorical ... storing 'IR_VDJ_1_v_gene' as categorical ... storing 'IR_VDJ_2_v_gene' as categorical ... storing 'IR_VJ_1_j_gene' as categorical ... storing 'IR_VJ_2_j_gene' as categorical ... storing 'IR_VJ_1_c_gene' as categorical ... storing 'IR_VJ_2_c_gene' as categorical ... storing 'IR_VDJ_1_c_gene' as categorical ... storing 'IR_VDJ_2_c_gene' as categorical ... storing 'samples' as categorical ... storing 'patient' as categorical ... storing 'origin' as categorical ... storing 'dataset' as categorical ... storing 'tumor_type' as categorical ... storing 'platform' as categorical ... storing 'hpv_status' as categorical ... storing 'ir_status' as categorical