Python script for generating PBMC simulating datasets¶
To benchmark RareQ against existing rare cell detection methods, We generated 150 simulated datasets using peripheral blood mononuclear cells (GEO assession code: GSE194122). Each simulation dataset contained 5000 cells and 5-15 cell types (Sim-PBMC 1, 2, 3)
Sim-PBMC 1¶
This scenario contains 50 replicates, each with 4 major cell types and 1 rare cell types
Sim-PBMC 2¶
This scenario contains 50 replicates, each with 9 major cell types and 1 rare cell types
Sim-PBMC 3¶
This scenario contains 50 replicates, each with 10 major cell types and 5 rare cell types
# Import packages
from collections import Counter
import matplotlib.pyplot as plt
from pysankey2 import Sankey
from scipy import sparse
from scipy.sparse import hstack, vstack, coo_matrix
from scipy.io import mmread
from sklearn import metrics
import math
import time
import anndata
import torch
import numpy as np
import pandas as pd
import scanpy as sc
import random
import os
from warnings import filterwarnings
import scipy.sparse as sp
from operator import itemgetter
filterwarnings("ignore")
import json, os
import math, copy, time
import numpy as np
from collections import defaultdict
import pyHGT2
from pyHGT2.data import *
from pyHGT2.utils import *
from pyHGT2.model1 import *
from pyHGT2.conv import *
from sklearn import metrics
from collections import Counter
import matplotlib.pyplot as plt
from pysankey2 import Sankey
from scipy.sparse import hstack, vstack, coo_matrix
from scipy.io import mmread
from sklearn import metrics
import math
import time
import anndata
import torch
import numpy as np
import pandas as pd
import scanpy as sc
import random
import os
from scipy import sparse
from scipy.io import mmread
from scipy.sparse import hstack, vstack, coo_matrix
Set seed for reproduction¶
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
Prepare full PBMC data for subsampling¶
All related data can be download from zenodo.
GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad: The full data contains annotated cell types
Gene_Cell.mtx: Gene expression matrix data
Peak_Cell.mtx: Gene accessibility (peak count) matrix data
Gene_Peak.mtx: Regulatory score matrix based on MAESTRO55. In this matrix, Xij signifies the regulatory potential of peak j relative to gene i
Gene_names.tsv: Gene names for the expression data
Cell_names.tsv: Cell names for both expression and accessibility data
Peak_names.tsv: Peaks names for accessibility data
Then, the cell count and proportion of each cell type are computed and sorted. Rare cell types will be randomly chosen from the top smallest cell types, while the majors cell types will be selected from the most abundant cell types.
## Read data
os.chdir('./simulation/')
pbmc = sc.read_h5ad('GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad')
cell_type = pbmc.obs
## Gene_Peak.mtx, Gene_Cell.mtx, Peak_Cell.mtx, Gene_names.tsv, Cell_names.tsv, Peak_names.tsv
gene_peak = anndata.read_mtx('Gene_Peak.mtx')
gene_cell = anndata.read_mtx('Gene_Cell.mtx')
peak_cell = anndata.read_mtx('Peak_Cell.mtx')
gene_names = pd.read_csv('Gene_names.tsv', sep='\t', header=None)
cell_names = pd.read_csv('Cell_names.tsv', sep='\t', header=None)
peak_names = pd.read_csv('Peak_names.tsv', sep='\t', header=None)
peak_cell.obs_names = peak_names[0]
peak_cell.var_names = cell_names[0]
gene_cell.obs_names = gene_names[0]
gene_cell.var_names = cell_names[0]
gene_peak.obs_names = gene_names[0]
gene_peak.var_names = peak_names[0]
#Gene_Cell, Peak_Cell, Gene_Peak = gene_cell,peak_cell,gene_peak
true_label = cell_type.loc[cell_names[0].tolist(), :]
cell_type['number'] = 1
cell_count = cell_type.groupby('cell_type').count()
cell_count =cell_count.sort_values(by='number',ascending=True, inplace=False)
cell_count['percentage'] = (cell_count['number'] / cell_count['number'].sum())
cell_count
all(true_label.index == cell_names[0])
# Reset index
cell_df = true_label.reset_index(drop=True)
# Group by cell type to get the corresponding index list
cell_type_dict = cell_df.groupby('cell_type').apply(lambda x: x.index.tolist()).to_dict()
# Sort the dictionary based on the number of cells in each type
sorted_cell_type_dict = {k: v for k, v in sorted(cell_type_dict.items(), key=lambda item: len(item[1]))}
# Get the lengths of each cell type's indices
lengths = [len(indices) for indices in sorted_cell_type_dict.values()]
# Calculate the total count
total = sum(lengths)
# Print the proportion of each cell type
for cell_type, indices in sorted_cell_type_dict.items():
proportion = len(indices) / total
print(f"Cell type: {cell_type}, Proportion: {proportion:.3f}")
# Uncomment the lines below to print the keys and lengths of the sorted dictionary
# print(sorted_cell_type_dict.keys())
# for cell_type, indices in sorted_cell_type_dict.items():
# print(f"Cell type: {cell_type}, Length: {len(indices)}")
Sim-PBMC 1 generation¶
The full PBMC data contains 22 cell types, among which the 4 most abundant cell types are selected as major cell types. The one rare cell type is randomly selected from the 2 most smallest cell types. The four selected major cell types are further subsampled according to the proportion of the original data to generate a smaller dataset comprising around 4950 cells, and the rare cell type is subsampled to 50 cells.
The output subsampled datasets are stored in the Rare1_Ordinary4 folder, with 50 subfolders (0-49) for replicates.
# Sim-PBMC 1
import numpy as np
import random
def sample_cells(cell_type_dict, total_samples=5000):
# Extract all cell types
cell_types = list(cell_type_dict.keys())
# The last 4 types must be sampled
fixed_types = cell_types[-4:]
# Randomly select 1 cell type from the first two types
random_type = random.choice(cell_types[:2])
# Combine these 5 types
sample_types = fixed_types + [random_type]
# Calculate the proportion of each cell type within the selected types
type_counts = {cell_type: len(cells) for cell_type, cells in cell_type_dict.items() if cell_type in sample_types}
total_count = sum(type_counts.values()) - 50 if random_type in type_counts else 0
type_ratio = {cell_type: count / total_count for cell_type, count in type_counts.items() if cell_type != random_type}
# Sample these cell types proportionally to a total of 5000
sample_counts = {cell_type: int(np.round(ratio * total_samples)) for cell_type, ratio in type_ratio.items()}
# Find the difference between the total number of samples and 5000 (including the fixed 50), then allocate this difference to the cell type with the largest sample count
diff = total_samples - sum(sample_counts.values()) - 50
max_type = max(sample_counts, key=sample_counts.get)
sample_counts[max_type] += diff
# Perform sampling
sample_indices = []
for cell_type, count in sample_counts.items():
cells = cell_type_dict[cell_type]
if len(cells) > count:
cells = np.random.choice(cells, size=count, replace=False)
sample_indices.extend(cells)
# Additionally sample 50 samples from the selected random type
cells = cell_type_dict[random_type]
if len(cells) > 50:
cells = np.random.choice(cells, size=50, replace=False)
sample_indices.extend(cells)
return sample_indices
import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io
ip_file = './'
filetype_name_list = ['Rare1_Ordinary4']
for filetype_name in filetype_name_list:
for file_num in tqdm(range(0, 50)): # Process the first 50 folders
path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
# Set random seed
np.random.seed(file_num)
random.seed(file_num)
# Execute sampling function
sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)
# Sample corresponding gene_cell, peak_cell matrices, and true_label labels
sub_gene_cell = gene_cell[:, sample_indices]
sub_peak_cell = peak_cell[:, sample_indices]
gene_cell_matrix = sub_gene_cell.X
peak_cell_matrix = sub_peak_cell.X
sub_label = list(true_label.iloc[sample_indices]['cell_type'])
print('sub_gene_cell.shape:', end=' ')
print(sub_gene_cell.shape)
print('sub_peak_cell.shape:', end=' ')
print(sub_peak_cell.shape)
# Save files
io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
with open(os.path.join(path2, "sub_label.txt"), "w") as file:
for label in sub_label:
file.write(f"{label}\n")
with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
for index in sample_indices:
file.write(f"{index}\n")
Sim-PBMC 2 generation¶
The full PBMC data contains 22 cell types, among which the 9 most abundant cell types are selected as major cell types. The one rare cell type is randomly selected from the 2 most smallest cell types. The 9 selected major cell types are further subsampled according to the proportion of the original data to generate a smaller dataset comprising 4950 cells, and the rare cell type is subsampled to 50 cells.
The output subsampled datasets are stored in the Rare1_Ordinary9 folder, with 50 subfolders (0-49) for replicates.
# Sim-PBMC 2
import numpy as np
import random
def sample_cells(cell_type_dict, total_samples=5000):
# Extract all cell types
cell_types = list(cell_type_dict.keys())
# The last 9 types must be sampled
fixed_types = cell_types[-9:]
# Randomly select 1 cell type from the first two types
random_type = random.choice(cell_types[:2])
# Combine these 10 types
sample_types = fixed_types + [random_type]
# Calculate the proportion of each cell type within the selected types
type_counts = {cell_type: len(cells) for cell_type, cells in cell_type_dict.items() if cell_type in sample_types}
total_count = sum(type_counts.values()) - 50 if random_type in type_counts else 0
type_ratio = {cell_type: count / total_count for cell_type, count in type_counts.items() if cell_type != random_type}
# Sample these cell types proportionally to a total of 5000
sample_counts = {cell_type: int(np.round(ratio * total_samples)) for cell_type, ratio in type_ratio.items()}
# Find the difference between the total number of samples and 5000 (including the fixed 50), then allocate this difference to the cell type with the largest sample count
diff = total_samples - sum(sample_counts.values()) - 50
max_type = max(sample_counts, key=sample_counts.get)
sample_counts[max_type] += diff
# Perform sampling
sample_indices = []
for cell_type, count in sample_counts.items():
cells = cell_type_dict[cell_type]
if len(cells) > count:
cells = np.random.choice(cells, size=count, replace=False)
sample_indices.extend(cells)
# Additionally sample 50 samples from the selected random type
cells = cell_type_dict[random_type]
if len(cells) > 50:
cells = np.random.choice(cells, size=50, replace=False)
sample_indices.extend(cells)
return sample_indices
import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io
ip_file = './'
filetype_name_list = ['Rare1_Ordinary9']
for filetype_name in filetype_name_list:
for file_num in tqdm(range(0, 50)): # Process the first 50 folders
path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
# os.makedirs(path2)
# Set random seed
np.random.seed(file_num)
random.seed(file_num)
# Execute sampling function
sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)
# Sample corresponding gene_cell, peak_cell matrices, and true_label labels
sub_gene_cell = gene_cell[:, sample_indices]
sub_peak_cell = peak_cell[:, sample_indices]
gene_cell_matrix = sub_gene_cell.X
peak_cell_matrix = sub_peak_cell.X
sub_label = list(true_label.iloc[sample_indices]['cell_type'])
print('sub_gene_cell.shape:', end=' ')
print(sub_gene_cell.shape)
print('sub_peak_cell.shape:', end=' ')
print(sub_peak_cell.shape)
# Save files
io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
with open(os.path.join(path2, "sub_label.txt"), "w") as file:
for label in sub_label:
file.write(f"{label}\n")
with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
for index in sample_indices:
file.write(f"{index}\n")
Sim-PBMC 3 generation¶
The full PBMC data contains 22 cell types, among which the 10 most abundant cell types are selected as major cell types. The 5 rare cell types are randomly selected from the 8 most smallest cell types. The 15 selected cell types are further subsampled according to the proportion of the original data to generate a smaller dataset comprising 5000 cells.
The output subsampled datasets are stored in the Rare5_Ordinary10 folder, with 50 subfolders (0-49) for replicates.
# Sim-PBMC 3
import numpy as np
import random
def sample_cells(cell_type_dict, total_samples=5000):
# Extract all cell types
cell_types = list(cell_type_dict.keys())
# The last ten types must be sampled
fixed_types = cell_types[-10:]
# Randomly select 5 cell types from the first eight types
random_types = random.sample(cell_types[:8], 5)
# Combine these 15 types
sample_types = fixed_types + random_types
# Calculate the proportion of each cell type within these 15 types
type_counts = {cell_type: len(cells) for cell_type, cells in cell_type_dict.items() if cell_type in sample_types}
total_count = sum(type_counts.values())
type_ratio = {cell_type: count / total_count for cell_type, count in type_counts.items()}
# Sample these cell types proportionally to a total of 5000
sample_counts = {cell_type: int(np.round(ratio * total_samples)) for cell_type, ratio in type_ratio.items()}
# Find the difference between the total number of samples and 5000, then allocate this difference to the cell type with the largest sample count
diff = total_samples - sum(sample_counts.values())
max_type = max(sample_counts, key=sample_counts.get)
sample_counts[max_type] += diff
# Perform sampling
sample_indices = []
for cell_type, count in sample_counts.items():
cells = cell_type_dict[cell_type]
if len(cells) > count:
cells = np.random.choice(cells, size=count, replace=False)
sample_indices.extend(cells)
return sample_indices
import os
import numpy as np
import scipy.sparse
from tqdm import tqdm
from scipy import io
ip_file = './'
filetype_name_list = ['Rare5_Ordinary10']
for filetype_name in filetype_name_list:
for file_num in tqdm(range(0, 50)): # Process 50 folders
path2 = os.path.join(ip_file, filetype_name, str(file_num), 'R_input')
# os.makedirs(path2)
# Set random seed
np.random.seed(file_num)
random.seed(file_num)
# Execute sampling function
sample_indices = sample_cells(sorted_cell_type_dict, total_samples=5000)
# Sample corresponding gene_cell, peak_cell matrices, and true_label labels
sub_gene_cell = gene_cell[:, sample_indices]
sub_peak_cell = peak_cell[:, sample_indices]
gene_cell_matrix = sub_gene_cell.X
peak_cell_matrix = sub_peak_cell.X
sub_label = list(true_label.iloc[sample_indices]['cell_type'])
print('sub_gene_cell.shape:', end=' ')
print(sub_gene_cell.shape)
print('sub_peak_cell.shape:', end=' ')
print(sub_peak_cell.shape)
# Save files
io.mmwrite(os.path.join(path2, "sub_gene_cell.mtx"), gene_cell_matrix)
io.mmwrite(os.path.join(path2, "sub_peak_cell.mtx"), peak_cell_matrix)
with open(os.path.join(path2, "sub_label.txt"), "w") as file:
for label in sub_label:
file.write(f"{label}\n")
with open(os.path.join(path2, "sample_indices.txt"), "w") as file:
for index in sample_indices:
file.write(f"{index}\n")