Python script for generating simulated PBMC datasets¶

To benchmark RareQ against existing rare cell detection methods, we generated 150 simulated datasets using peripheral blood mononuclear cells (GEO accession code: GSE194122). Each simulated dataset contained 5000 cells and 5-15 cell types (Sim-PBMC 1, 2, 3).

Sim-PBMC 1¶

This scenario contains 50 replicates, each with 4 major cell types and 1 rare cell type

Sim-PBMC 2¶

This scenario contains 50 replicates, each with 9 major cell types and 1 rare cell type

Sim-PBMC 3¶

This scenario contains 50 replicates, each with 10 major cell types and 5 rare cell types

In [ ]:
# Import packages
from collections import Counter
import matplotlib.pyplot as plt
from pysankey2 import Sankey
from scipy import sparse
from scipy.sparse import hstack, vstack, coo_matrix
from scipy.io import mmread
from sklearn import metrics
import math
import time
import anndata
import torch
import numpy as np
import pandas as pd
import scanpy as sc
import random
import os
from warnings import filterwarnings
import scipy.sparse as sp
from operator import itemgetter
filterwarnings("ignore")
import json, os
import math, copy, time
import numpy as np
from collections import defaultdict
import pyHGT2
from pyHGT2.data import *
from pyHGT2.utils import *
from pyHGT2.model1 import *
from pyHGT2.conv import *
from sklearn import metrics

from collections import Counter
import matplotlib.pyplot as plt
from pysankey2 import Sankey
from scipy.sparse import hstack, vstack, coo_matrix
from scipy.io import mmread
from sklearn import metrics
import math
import time
import anndata
import torch
import numpy as np
import pandas as pd
import scanpy as sc
import random
import os
from scipy import sparse
from scipy.io import mmread
from scipy.sparse import hstack, vstack, coo_matrix

Set seeds for reproducibility¶

In [ ]:
# Seed every RNG the pipeline touches (Python, NumPy, PyTorch CPU/GPU)
# so each simulation run is reproducible. The original cell called
# torch.manual_seed / torch.cuda.manual_seed twice; the duplicates are
# removed here since re-seeding with the same value is a no-op.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # covers multi-GPU setups
# Fix the hash seed so hash-based ordering is stable across runs.
# NOTE(review): setting PYTHONHASHSEED after interpreter start does not
# change the current process's string hashing; export it before launching
# Python for full effect.
os.environ['PYTHONHASHSEED'] = str(seed)

Prepare full PBMC data for subsampling¶

All related data can be downloaded from Zenodo.

GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad: The full data contains annotated cell types

Gene_Cell.mtx: Gene expression matrix data

Peak_Cell.mtx: Gene accessibility (peak count) matrix data

Gene_Peak.mtx: Regulatory score matrix based on MAESTRO. In this matrix, X_ij signifies the regulatory potential of peak j relative to gene i

Gene_names.tsv: Gene names for the expression data

Cell_names.tsv: Cell names for both expression and accessibility data

Peak_names.tsv: Peak names for the accessibility data

Then, the cell count and proportion of each cell type are computed and sorted. Rare cell types will be randomly chosen from among the smallest cell types, while the major cell types will be selected from the most abundant cell types.

In [ ]:
## Read data
##
## Loads the annotated PBMC multiome data plus the pre-computed matrices,
## attaches row/column names, and builds an abundance-sorted lookup of cell
## indices per cell type for the subsampling steps below.

os.chdir('./simulation/')

pbmc = sc.read_h5ad('GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad')
cell_type = pbmc.obs  # per-cell metadata, including the 'cell_type' annotation

## Gene_Peak.mtx, Gene_Cell.mtx, Peak_Cell.mtx, Gene_names.tsv, Cell_names.tsv, Peak_names.tsv
gene_peak = anndata.read_mtx('Gene_Peak.mtx')
gene_cell = anndata.read_mtx('Gene_Cell.mtx')
peak_cell = anndata.read_mtx('Peak_Cell.mtx')
gene_names = pd.read_csv('Gene_names.tsv', sep='\t', header=None)
cell_names = pd.read_csv('Cell_names.tsv', sep='\t', header=None)
peak_names = pd.read_csv('Peak_names.tsv', sep='\t', header=None)
peak_cell.obs_names = peak_names[0]
peak_cell.var_names = cell_names[0]
gene_cell.obs_names = gene_names[0]
gene_cell.var_names = cell_names[0]
gene_peak.obs_names = gene_names[0]
gene_peak.var_names = peak_names[0]

#Gene_Cell, Peak_Cell, Gene_Peak = gene_cell,peak_cell,gene_peak
# Align the metadata rows to the column (cell) order of the matrices.
true_label = cell_type.loc[cell_names[0].tolist(), :]
cell_type['number'] = 1
cell_count = cell_type.groupby('cell_type').count()
cell_count = cell_count.sort_values(by='number', ascending=True, inplace=False)

cell_count['percentage'] = (cell_count['number'] / cell_count['number'].sum())
cell_count

# Sanity check: matrix column order matches the metadata index.
all(true_label.index == cell_names[0])

# Reset index
cell_df = true_label.reset_index(drop=True)

# Group by cell type to get the corresponding index list
cell_type_dict = cell_df.groupby('cell_type').apply(lambda x: x.index.tolist()).to_dict()

# Sort the dictionary based on the number of cells in each type (rarest first)
sorted_cell_type_dict = {k: v for k, v in sorted(cell_type_dict.items(), key=lambda item: len(item[1]))}

# Get the lengths of each cell type's indices
lengths = [len(indices) for indices in sorted_cell_type_dict.values()]

# Calculate the total count
total = sum(lengths)

# Print the proportion of each cell type.
# The loop variable is named ct_name (not cell_type) so it does not shadow
# the cell_type DataFrame above — the original loop rebound cell_type to a
# string, which would break the groupby on a notebook re-run of this cell.
for ct_name, indices in sorted_cell_type_dict.items():
    proportion = len(indices) / total
    print(f"Cell type: {ct_name}, Proportion: {proportion:.3f}")

# Uncomment the lines below to print the keys and lengths of the sorted dictionary
# print(sorted_cell_type_dict.keys())
# for ct_name, indices in sorted_cell_type_dict.items():
#     print(f"Cell type: {ct_name}, Length: {len(indices)}")

Sim-PBMC 1 generation¶

The full PBMC data contains 22 cell types, among which the 4 most abundant cell types are selected as major cell types. The one rare cell type is randomly selected from the 2 smallest cell types. The four selected major cell types are further subsampled according to their proportions in the original data to generate a smaller dataset comprising around 4950 cells, and the rare cell type is subsampled to 50 cells.

The output subsampled datasets are stored in the Rare1_Ordinary4 folder, with 50 subfolders (0-49) for replicates.

In [ ]:
# Sim-PBMC 1
import numpy as np
import random

def sample_cells(cell_type_dict, total_samples=5000):
    """Draw a Sim-PBMC 1 sample: 4 abundant (major) types + 1 rare type.

    Parameters
    ----------
    cell_type_dict : dict[str, list[int]]
        Cell-type name -> list of cell indices, ordered from the rarest
        type to the most abundant (as built upstream).
    total_samples : int
        Target number of cells in the simulated dataset; 50 of these are
        reserved for the rare type.

    Returns
    -------
    list
        Indices of the sampled cells (major types first, rare type last).
    """
    # Extract all cell types
    cell_types = list(cell_type_dict.keys())

    # The 4 most abundant types are always included as major types.
    fixed_types = cell_types[-4:]

    # The single rare type is drawn at random from the 2 smallest types.
    random_type = random.choice(cell_types[:2])

    # Combine these 5 types
    sample_types = fixed_types + [random_type]

    # Pool sizes of the selected types, in dictionary (abundance) order.
    type_counts = {ct: len(cells) for ct, cells in cell_type_dict.items() if ct in sample_types}
    # NOTE: random_type is always one of sample_types, so the original
    # "... - 50 if random_type in type_counts else 0" conditional was dead
    # code (and, due to precedence, would have set total_count to 0 and
    # caused a ZeroDivisionError had it ever fired). Only the live
    # subtraction is kept: 50 cells are reserved for the rare type.
    total_count = sum(type_counts.values()) - 50
    type_ratio = {ct: cnt / total_count for ct, cnt in type_counts.items() if ct != random_type}

    # Scale the major-type proportions to the target size (rounded per type).
    sample_counts = {ct: int(np.round(ratio * total_samples)) for ct, ratio in type_ratio.items()}

    # Rounding slack plus the 50 reserved rare cells are absorbed by the
    # largest major type so the final total is exactly total_samples.
    diff = total_samples - sum(sample_counts.values()) - 50
    max_type = max(sample_counts, key=sample_counts.get)
    sample_counts[max_type] += diff

    # Sample the major types without replacement; a type with fewer cells
    # than requested contributes all of its cells.
    sample_indices = []
    for ct, count in sample_counts.items():
        cells = cell_type_dict[ct]
        if len(cells) > count:
            cells = np.random.choice(cells, size=count, replace=False)
        sample_indices.extend(cells)

    # Finally take the 50 cells of the rare type.
    cells = cell_type_dict[random_type]
    if len(cells) > 50:
        cells = np.random.choice(cells, size=50, replace=False)
    sample_indices.extend(cells)

    return sample_indices

import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io


ip_file = './'
filetype_name_list = ['Rare1_Ordinary4']

# For each replicate 0-49: fix the seeds to the replicate number, draw a
# Sim-PBMC 1 sample, slice the expression/accessibility matrices to the
# sampled cells, and write matrices + labels + indices under
# <scenario>/<replicate>/R_input/.
for filetype_name in filetype_name_list:
    for file_num in tqdm(range(0, 50)):  # Process the first 50 folders
        path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
        # Create the output directory; io.mmwrite/open below would raise
        # FileNotFoundError otherwise (the other scenarios carried this
        # call commented out).
        os.makedirs(path2, exist_ok=True)

        # Seeding with the replicate number makes each replicate
        # individually reproducible.
        np.random.seed(file_num)
        random.seed(file_num)

        # Execute sampling function
        sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)

        # Slice the columns (cells) of the gene x cell and peak x cell
        # matrices down to the sampled cells, and pull the matching labels.
        sub_gene_cell = gene_cell[:, sample_indices]
        sub_peak_cell = peak_cell[:, sample_indices]
        gene_cell_matrix = sub_gene_cell.X
        peak_cell_matrix = sub_peak_cell.X

        sub_label = list(true_label.iloc[sample_indices]['cell_type'])

        print('sub_gene_cell.shape:', end=' ')
        print(sub_gene_cell.shape)
        print('sub_peak_cell.shape:', end=' ')
        print(sub_peak_cell.shape)

        # Save files
        io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
        io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
        with open(os.path.join(path2, "sub_label.txt"), "w") as file:
            for label in sub_label:
                file.write(f"{label}\n")
        with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
            for index in sample_indices:
                file.write(f"{index}\n")

Sim-PBMC 2 generation¶

The full PBMC data contains 22 cell types, among which the 9 most abundant cell types are selected as major cell types. The one rare cell type is randomly selected from the 2 smallest cell types. The 9 selected major cell types are further subsampled according to their proportions in the original data to generate a smaller dataset comprising 4950 cells, and the rare cell type is subsampled to 50 cells.

The output subsampled datasets are stored in the Rare1_Ordinary9 folder, with 50 subfolders (0-49) for replicates.

In [ ]:
# Sim-PBMC 2

import numpy as np
import random

def sample_cells(cell_type_dict, total_samples=5000):
    """Draw a Sim-PBMC 2 sample: 9 abundant (major) types + 1 rare type.

    Parameters
    ----------
    cell_type_dict : dict[str, list[int]]
        Cell-type name -> list of cell indices, ordered from the rarest
        type to the most abundant (as built upstream).
    total_samples : int
        Target number of cells in the simulated dataset; 50 of these are
        reserved for the rare type.

    Returns
    -------
    list
        Indices of the sampled cells (major types first, rare type last).
    """
    # Extract all cell types
    cell_types = list(cell_type_dict.keys())

    # The 9 most abundant types are always included as major types.
    fixed_types = cell_types[-9:]

    # The single rare type is drawn at random from the 2 smallest types.
    random_type = random.choice(cell_types[:2])

    # Combine these 10 types
    sample_types = fixed_types + [random_type]

    # Pool sizes of the selected types, in dictionary (abundance) order.
    type_counts = {ct: len(cells) for ct, cells in cell_type_dict.items() if ct in sample_types}
    # NOTE: random_type is always one of sample_types, so the original
    # "... - 50 if random_type in type_counts else 0" conditional was dead
    # code (and, due to precedence, would have set total_count to 0 and
    # caused a ZeroDivisionError had it ever fired). Only the live
    # subtraction is kept: 50 cells are reserved for the rare type.
    total_count = sum(type_counts.values()) - 50
    type_ratio = {ct: cnt / total_count for ct, cnt in type_counts.items() if ct != random_type}

    # Scale the major-type proportions to the target size (rounded per type).
    sample_counts = {ct: int(np.round(ratio * total_samples)) for ct, ratio in type_ratio.items()}

    # Rounding slack plus the 50 reserved rare cells are absorbed by the
    # largest major type so the final total is exactly total_samples.
    diff = total_samples - sum(sample_counts.values()) - 50
    max_type = max(sample_counts, key=sample_counts.get)
    sample_counts[max_type] += diff

    # Sample the major types without replacement; a type with fewer cells
    # than requested contributes all of its cells.
    sample_indices = []
    for ct, count in sample_counts.items():
        cells = cell_type_dict[ct]
        if len(cells) > count:
            cells = np.random.choice(cells, size=count, replace=False)
        sample_indices.extend(cells)

    # Finally take the 50 cells of the rare type.
    cells = cell_type_dict[random_type]
    if len(cells) > 50:
        cells = np.random.choice(cells, size=50, replace=False)
    sample_indices.extend(cells)

    return sample_indices

import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io

ip_file = './'
filetype_name_list = ['Rare1_Ordinary9']

# For each replicate 0-49: fix the seeds to the replicate number, draw a
# Sim-PBMC 2 sample, slice the expression/accessibility matrices to the
# sampled cells, and write matrices + labels + indices under
# <scenario>/<replicate>/R_input/.
for filetype_name in filetype_name_list:
    for file_num in tqdm(range(0, 50)):  # Process the first 50 folders
        path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
        # Create the output directory (the original left this commented
        # out); io.mmwrite/open below would raise FileNotFoundError
        # otherwise.
        os.makedirs(path2, exist_ok=True)

        # Seeding with the replicate number makes each replicate
        # individually reproducible.
        np.random.seed(file_num)
        random.seed(file_num)

        # Execute sampling function
        sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)

        # Slice the columns (cells) of the gene x cell and peak x cell
        # matrices down to the sampled cells, and pull the matching labels.
        sub_gene_cell = gene_cell[:, sample_indices]
        sub_peak_cell = peak_cell[:, sample_indices]
        gene_cell_matrix = sub_gene_cell.X
        peak_cell_matrix = sub_peak_cell.X

        sub_label = list(true_label.iloc[sample_indices]['cell_type'])

        print('sub_gene_cell.shape:', end=' ')
        print(sub_gene_cell.shape)
        print('sub_peak_cell.shape:', end=' ')
        print(sub_peak_cell.shape)

        # Save files
        io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
        io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
        with open(os.path.join(path2, "sub_label.txt"), "w") as file:
            for label in sub_label:
                file.write(f"{label}\n")
        with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
            for index in sample_indices:
                file.write(f"{index}\n")
Sim-PBMC 3 generation¶

The full PBMC data contains 22 cell types, among which the 10 most abundant cell types are selected as major cell types. The 5 rare cell types are randomly selected from the 8 smallest cell types. The 15 selected cell types are further subsampled according to their proportions in the original data to generate a smaller dataset comprising 5000 cells.

The output subsampled datasets are stored in the Rare5_Ordinary10 folder, with 50 subfolders (0-49) for replicates.

In [ ]:
# Sim-PBMC 3
import numpy as np
import random

def sample_cells(cell_type_dict, total_samples=5000):
    """Draw a Sim-PBMC 3 sample: 10 abundant types + 5 randomly chosen rare types.

    `cell_type_dict` maps cell-type name -> list of cell indices, ordered
    from the rarest type to the most abundant. All 15 selected types are
    subsampled in proportion to their original abundance so the result
    contains exactly `total_samples` cells (rounding slack is absorbed by
    the type with the largest allocation). Returns the sampled indices.
    """
    names = list(cell_type_dict)

    # Majors: the ten most abundant types. Rares: five of the eight smallest.
    chosen = names[-10:] + random.sample(names[:8], 5)

    # Pool size of each selected type, in dictionary (abundance) order.
    pool_sizes = {name: len(cell_type_dict[name]) for name in cell_type_dict if name in chosen}
    pooled = sum(pool_sizes.values())

    # Allocate the target total across the 15 types proportionally.
    quota = {name: int(np.round((size / pooled) * total_samples)) for name, size in pool_sizes.items()}

    # Per-type rounding can leave the grand total off by a few cells;
    # hand the slack to the type with the largest quota.
    biggest = max(quota, key=quota.get)
    quota[biggest] += total_samples - sum(quota.values())

    # Draw without replacement; a type smaller than its quota contributes
    # all of its cells.
    picked = []
    for name, k in quota.items():
        pool = cell_type_dict[name]
        if len(pool) > k:
            pool = np.random.choice(pool, size=k, replace=False)
        picked.extend(pool)

    return picked
  
import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io

ip_file = './'
filetype_name_list = ['Rare5_Ordinary10']

# For each replicate 0-49: fix the seeds to the replicate number, draw a
# Sim-PBMC 3 sample, slice the expression/accessibility matrices to the
# sampled cells, and write matrices + labels + indices under
# <scenario>/<replicate>/R_input/.
for filetype_name in filetype_name_list:
    for file_num in tqdm(range(0, 50)):  # Process 50 folders
        path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
        # Create the output directory (the original left this commented
        # out); io.mmwrite/open below would raise FileNotFoundError
        # otherwise.
        os.makedirs(path2, exist_ok=True)

        # Seeding with the replicate number makes each replicate
        # individually reproducible.
        np.random.seed(file_num)
        random.seed(file_num)

        # Execute sampling function
        sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)

        # Slice the columns (cells) of the gene x cell and peak x cell
        # matrices down to the sampled cells, and pull the matching labels.
        sub_gene_cell = gene_cell[:, sample_indices]
        sub_peak_cell = peak_cell[:, sample_indices]
        gene_cell_matrix = sub_gene_cell.X
        peak_cell_matrix = sub_peak_cell.X

        sub_label = list(true_label.iloc[sample_indices]['cell_type'])

        print('sub_gene_cell.shape:', end=' ')
        print(sub_gene_cell.shape)
        print('sub_peak_cell.shape:', end=' ')
        print(sub_peak_cell.shape)

        # Save files
        io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
        io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
        with open(os.path.join(path2, "sub_label.txt"), "w") as file:
            for label in sub_label:
                file.write(f"{label}\n")
        with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
            for index in sample_indices:
                file.write(f"{index}\n")
In [ ]:
 
In [ ]: