Load the cellranger outputs and store them as h5ad.
import scanpy as sc
import scirpy as ir
import anndata
import pandas as pd
import os
import sys
from multiprocessing import Pool
import itertools
# Default parameter values, given as relative paths.
sample_sheet = "../tables/vanderburg_01_samples.csv"
output_file = "../results/01_process_data/adata.h5ad"
data_dir = "../data"
n_cpus = 16
# Parameters
# NOTE: the assignments below rebind the defaults above (presumably injected
# by a notebook parameterization tool -- confirm). `n_cpus` is a string here;
# it is converted with int() where the worker pool is created.
sample_sheet = "sample_sheet.csv"
output_file = "adata.h5ad"
data_dir = "data"
n_cpus = "16"
# Read the sample sheet (sample-level metadata) into a data frame.
obs = pd.read_csv(sample_sheet)
# Display only: the re-indexed frame is not assigned, so `obs` itself keeps
# "samples" as a regular column.
obs.set_index("samples")
| patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| samples | ||||||||||||
| H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 | 
| H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 | 
| H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 | 
| H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 | 
| H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 | 
| H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 | 
| H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 | 
| H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 | 
| H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 | 
| H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 | 
| H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 | 
| H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 | 
| H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 | 
# Sample IDs as a sorted array. `copy=True` requests an independent array so
# that the in-place sort below does not touch the data backing `obs`.
dataset_samples = obs["samples"].to_numpy(copy=True)
dataset_samples.sort()
# Display the sorted sample IDs.
dataset_samples
array(['H141', 'H143', 'H149', 'H160', 'H176', 'H182', 'H185', 'H188',
       'H197', 'H205', 'H208', 'H211', 'H68'], dtype=object)
def load_sample(sample_id, data_dir):
    """Load gene expression and TCR data of one sample into a single AnnData.

    Parameters
    ----------
    sample_id
        Sample identifier. Its first character is dropped to build the
        cellranger directory names (``<sample_id[1:]>_GEX`` and
        ``<sample_id[1:]>_TCR``).
    data_dir
        Directory that contains the ``cellranger`` folder.

    Returns
    -------
    The gene expression AnnData with cell names prefixed by ``<sample_id>_``,
    repeated gene symbols dropped (the first occurrence is kept), the TCR
    data merged in via ``ir.pp.merge_with_ir`` and the sample ID stored in
    ``.obs["samples"]``.
    """

    def _with_sample_prefix(barcodes):
        # Presumably keeps cell names distinguishable once samples are combined.
        return ["{}_{}".format(sample_id, barcode) for barcode in barcodes]

    gex_path = os.path.join(
        data_dir, f"cellranger/{sample_id[1:]}_GEX/outs/raw_feature_bc_matrix.h5"
    )
    tcr_path = os.path.join(
        data_dir, f"cellranger/{sample_id[1:]}_TCR/outs/filtered_contig_annotations.csv"
    )

    adata = sc.read_10x_h5(gex_path, genome="GRCh38")
    adata_tcr = ir.io.read_10x_vdj(tcr_path)

    # Apply the same cell naming scheme to both modalities before merging.
    adata.obs_names = _with_sample_prefix(adata.obs_names)
    adata_tcr.obs_names = _with_sample_prefix(adata_tcr.obs_names)

    # Boolean mask marking every repeated occurrence of a gene symbol.
    is_duplicate = adata.var_names.duplicated()
    n_duplicates = sum(is_duplicate)
    print(
        "Removing {} gene symbols because they are duplicated".format(n_duplicates)
    )
    adata = adata[:, ~is_duplicate].copy()

    ir.pp.merge_with_ir(adata, adata_tcr)
    adata.obs["samples"] = sample_id
    return adata
# Load all samples in parallel; every task receives (sample_id, data_dir) and
# the results come back in the order of `dataset_samples`.
with Pool(processes=int(n_cpus)) as p:
    adatas = p.starmap(
        load_sample, [(sample_id, data_dir) for sample_id in dataset_samples]
    )
Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. 
Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]): Variable names are not unique. To make them unique, call `.var_names_make_unique`. /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... 
storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... 
storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical /opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated Removing 24 gene symbols because they are duplicated
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_junction_ins' as categorical ... storing 'IR_VJ_2_junction_ins' as categorical ... storing 'IR_VDJ_1_junction_ins' as categorical ... storing 'IR_VDJ_2_junction_ins' as categorical
Removing 24 gene symbols because they are duplicated
... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'multi_chain' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical ... storing 'genome' as categorical ... storing 'multi_chain' as categorical ... storing 'feature_types' as categorical ... storing 'multi_chain' as categorical ... storing 'genome' as categorical ... storing 'feature_types' as categorical ... storing 'genome' as categorical
# Combine the per-sample objects into one AnnData, using the default
# settings of `anndata.concat`.
adata = anndata.concat(adatas)
# coarse filtering, proper QC is done later
# (cells with fewer than 100 detected genes are removed in place)
sc.pp.filter_cells(adata, min_genes=100)
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if not is_categorical(df_full[k]):
# Display the per-cell sample labels.
adata.obs["samples"]
H141_AAACCTGAGATAGCAT-1    H141
H141_AAACCTGAGATCCCAT-1    H141
H141_AAACCTGAGCGGCTTC-1    H141
H141_AAACCTGAGCTGTCTA-1    H141
H141_AAACCTGAGGCATTGG-1    H141
                           ... 
H68_TTTGTCAGTTGTTTGG-1      H68
H68_TTTGTCATCAGAGACG-1      H68
H68_TTTGTCATCAGGTAAA-1      H68
H68_TTTGTCATCGTCTGAA-1      H68
H68_TTTGTCATCTCCGGTT-1      H68
Name: samples, Length: 71401, dtype: object
# Display the sample sheet as read from the CSV file.
obs
| samples | patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 | 
| 1 | H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 | 
| 2 | H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 | 
| 3 | H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 | 
| 4 | H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 | 
| 5 | H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 | 
| 6 | H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 | 
| 7 | H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 | 
| 8 | H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 | 
| 9 | H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 | 
| 10 | H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 | 
| 11 | H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 | 
| 12 | H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 | 
# Display the sample sheet indexed by sample ID (the result is not assigned).
obs.set_index("samples")
| patient | origin | replicate | dataset | tumor_type | platform | age | sex | hpv_status | ir_status | facs_purity_cd3 | facs_purity_cd56 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| samples | ||||||||||||
| H143 | H143 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.653 | 0.008 | 
| H149 | H149 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.644 | 0.033 | 
| H160 | H160 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.342 | 0.067 | 
| H176 | H176 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.558 | 0.108 | 
| H182 | H182 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.303 | 0.109 | 
| H185 | H185 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.493 | 0.163 | 
| H188 | H188 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.657 | 0.087 | 
| H205 | H205 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.485 | 0.028 | 
| H211 | H211 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.382 | 0.029 | 
| H197 | H197 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV- | NaN | 0.271 | 0.171 | 
| H208 | H208 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR- | 0.336 | 0.323 | 
| H68 | H68 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.797 | 0.138 | 
| H141 | H141 | tumor_primary | 1 | vanderburg_01 | HNSC | 10x_5p | NaN | NaN | HPV16+ | IR+ | 0.288 | 0.025 | 
# Attach the sample-level metadata to every cell by matching the "samples"
# column of `adata.obs` against the sample sheet indexed by sample ID.
#
# `sort` is deliberately left at its default (False): a left join then keeps
# the cells in their current order. Sorting by the join key could reorder the
# rows, and the assignment to `adata.obs` below replaces the annotation as-is
# without re-matching rows to the data matrix.
tmp_obs = adata.obs.join(obs.set_index("samples"), on="samples", how="left")
# Guard against silent misalignment; this also fails if the sample sheet lists
# a sample more than once, because the join would then duplicate cells.
if not tmp_obs.index.equals(adata.obs_names):
    raise ValueError(
        "Joining the sample sheet changed the order or the number of cells."
    )
adata.obs = tmp_obs
# Display one row per distinct (sample, patient, purity) combination as a
# sanity check of the join.
adata.obs.loc[:, ["samples", "patient", "facs_purity_cd56"]].drop_duplicates()
| samples | patient | facs_purity_cd56 | |
|---|---|---|---|
| H141_AAACCTGAGATAGCAT-1 | H141 | H141 | 0.025 | 
| H143_AAACCTGAGCCGATTT-1 | H143 | H143 | 0.008 | 
| H149_AAACCTGAGACTGTAA-1 | H149 | H149 | 0.033 | 
| H160_AAACCTGAGAAACCAT-1 | H160 | H160 | 0.067 | 
| H176_AAACCTGAGAAGGTTT-1 | H176 | H176 | 0.108 | 
| H182_AAACCTGAGCCCAACC-1 | H182 | H182 | 0.109 | 
| H185_AAACCTGAGACGCACA-1 | H185 | H185 | 0.163 | 
| H188_AAACCTGAGAGACTAT-1 | H188 | H188 | 0.087 | 
| H197_AAACCTGAGGCTCTTA-1 | H197 | H197 | 0.171 | 
| H205_AAACCTGAGTTTAGGA-1 | H205 | H205 | 0.028 | 
| H208_AAACCTGAGAACTGTA-1 | H208 | H208 | 0.323 | 
| H211_AAACCTGAGGATATAC-1 | H211 | H211 | 0.029 | 
| H68_AAACCTGCACATTCGA-1 | H68 | H68 | 0.138 | 
# Display the dimensions of the combined object.
adata.shape
(71401, 33514)
# Write the combined object to `output_file` using LZF compression.
adata.write(output_file, compression="lzf")
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead if is_string_dtype(df[key]) and not is_categorical(df[key]) ... storing 'IR_VJ_1_cdr3' as categorical ... storing 'IR_VJ_2_cdr3' as categorical ... storing 'IR_VDJ_1_cdr3' as categorical ... storing 'IR_VDJ_2_cdr3' as categorical ... storing 'IR_VJ_1_cdr3_nt' as categorical ... storing 'IR_VJ_2_cdr3_nt' as categorical ... storing 'IR_VDJ_1_cdr3_nt' as categorical ... storing 'IR_VDJ_2_cdr3_nt' as categorical ... storing 'IR_VJ_1_v_gene' as categorical ... storing 'IR_VJ_2_v_gene' as categorical ... storing 'IR_VDJ_1_v_gene' as categorical ... storing 'IR_VDJ_2_v_gene' as categorical ... storing 'IR_VJ_1_j_gene' as categorical ... storing 'IR_VJ_2_j_gene' as categorical ... storing 'IR_VJ_1_c_gene' as categorical ... storing 'IR_VJ_2_c_gene' as categorical ... storing 'IR_VDJ_1_c_gene' as categorical ... storing 'IR_VDJ_2_c_gene' as categorical ... storing 'samples' as categorical ... storing 'patient' as categorical ... storing 'origin' as categorical ... storing 'dataset' as categorical ... storing 'tumor_type' as categorical ... storing 'platform' as categorical ... storing 'hpv_status' as categorical ... storing 'ir_status' as categorical