Generation of datasets using Simulation Strategy 2 - MERFISH Lung Cancer

# Restart the runtime after every run (presumably because the clone and the
# relative %cd further down cannot be re-run in the same session - confirm).
# Clone the spDDB repository; it ships the binning.py helper imported later.
!git clone https://github.com/Zafar-Lab/spDDB.git
Cloning into 'spDDB'...
remote: Enumerating objects: 239, done.
remote: Counting objects: 100% (239/239), done.
remote: Compressing objects: 100% (191/191), done.
remote: Total 239 (delta 71), reused 203 (delta 47), pack-reused 0 (from 0)
Receiving objects: 100% (239/239), 28.38 MiB | 19.13 MiB/s, done.
Resolving deltas: 100% (71/71), done.
# Work from the directory that contains binning.py so that the later
# `from binning import *` resolves, then list the directory contents.
%cd spDDB/Experiments/_Binning_code_for_Gold_standard_datasets/
!ls
/content/spDDB/Experiments/_Binning_code_for_Gold_standard_datasets
Binning_in_python_brain.ipynb	       Binning_in_python_lung_cancer.ipynb
Binning_in_python_breast_cancer.ipynb  binning.py
Binning_in_python_ileum_100um.ipynb

Mounting Google Drive to access the input data

# Colab-only: mount Google Drive at /content/drive. The input and output
# paths configured later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Installing Libraries

# Install the third-party packages imported below (scanpy and diptest).
# NOTE: in the recorded run, installing scanpy replaced pandas 2.2.2 with
# 2.3.3, which pip reports as conflicting with google-colab's pandas==2.2.2 pin.
!pip install scanpy
!pip install diptest
Collecting scanpy
  Downloading scanpy-1.12.1-py3-none-any.whl.metadata (8.4 kB)
Collecting anndata>=0.10.8 (from scanpy)
  Downloading anndata-0.12.16-py3-none-any.whl.metadata (9.9 kB)
Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from scanpy) (2026.4.22)
Collecting fast-array-utils>=1.4 (from fast-array-utils[accel,sparse]>=1.4->scanpy)
  Downloading fast_array_utils-1.4.1-py3-none-any.whl.metadata (2.7 kB)
Requirement already satisfied: h5py>=3.11 in /usr/local/lib/python3.12/dist-packages (from scanpy) (3.16.0)
Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from scanpy) (1.5.3)
Collecting legacy-api-wrap>=1.5 (from scanpy)
  Downloading legacy_api_wrap-1.5-py3-none-any.whl.metadata (2.2 kB)
Requirement already satisfied: matplotlib>=3.9 in /usr/local/lib/python3.12/dist-packages (from scanpy) (3.10.0)
Requirement already satisfied: natsort in /usr/local/lib/python3.12/dist-packages (from scanpy) (8.4.0)
Requirement already satisfied: networkx>=2.8.8 in /usr/local/lib/python3.12/dist-packages (from scanpy) (3.6.1)
Requirement already satisfied: numba>=0.60 in /usr/local/lib/python3.12/dist-packages (from scanpy) (0.60.0)
Requirement already satisfied: numpy>=2 in /usr/local/lib/python3.12/dist-packages (from scanpy) (2.0.2)
Requirement already satisfied: packaging>=25 in /usr/local/lib/python3.12/dist-packages (from scanpy) (26.1)
Collecting pandas>=2.3 (from scanpy)
  Downloading pandas-3.0.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.5/79.5 kB 4.6 MB/s eta 0:00:00
?25hRequirement already satisfied: patsy in /usr/local/lib/python3.12/dist-packages (from scanpy) (1.0.2)
Requirement already satisfied: pynndescent>=0.5.13 in /usr/local/lib/python3.12/dist-packages (from scanpy) (0.6.0)
Requirement already satisfied: scikit-learn>=1.6 in /usr/local/lib/python3.12/dist-packages (from scanpy) (1.6.1)
Requirement already satisfied: scipy>=1.13 in /usr/local/lib/python3.12/dist-packages (from scanpy) (1.16.3)
Requirement already satisfied: seaborn>=0.13.2 in /usr/local/lib/python3.12/dist-packages (from scanpy) (0.13.2)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.4.1-py3-none-any.whl.metadata (2.5 kB)
Requirement already satisfied: statsmodels>=0.14.5 in /usr/local/lib/python3.12/dist-packages (from scanpy) (0.14.6)
Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from scanpy) (4.67.3)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.12/dist-packages (from scanpy) (4.15.0)
Requirement already satisfied: umap-learn>=0.5.12 in /usr/local/lib/python3.12/dist-packages (from scanpy) (0.5.12)
Collecting array-api-compat>=1.7.1 (from anndata>=0.10.8->scanpy)
  Downloading array_api_compat-1.14.0-py3-none-any.whl.metadata (2.5 kB)
Collecting pandas>=2.3 (from scanpy)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 91.2/91.2 kB 6.9 MB/s eta 0:00:00
?25hCollecting scverse-misc>=0.0.3 (from anndata>=0.10.8->scanpy)
  Downloading scverse_misc-0.0.7-py3-none-any.whl.metadata (4.5 kB)
Collecting zarr!=3.0.*,>=2.18.7 (from anndata>=0.10.8->scanpy)
  Downloading zarr-3.2.1-py3-none-any.whl.metadata (8.7 kB)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (1.3.3)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (4.62.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (1.5.0)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (3.3.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.12/dist-packages (from matplotlib>=3.9->scanpy) (2.9.0.post0)
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.12/dist-packages (from numba>=0.60->scanpy) (0.43.0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=2.3->scanpy) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=2.3->scanpy) (2026.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn>=1.6->scanpy) (3.6.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.7->matplotlib>=3.9->scanpy) (1.17.0)
Collecting donfig>=0.8 (from zarr!=3.0.*,>=2.18.7->anndata>=0.10.8->scanpy)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Requirement already satisfied: google-crc32c>=1.5 in /usr/local/lib/python3.12/dist-packages (from zarr!=3.0.*,>=2.18.7->anndata>=0.10.8->scanpy) (1.8.0)
Collecting numcodecs>=0.14 (from zarr!=3.0.*,>=2.18.7->anndata>=0.10.8->scanpy)
  Downloading numcodecs-0.16.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from donfig>=0.8->zarr!=3.0.*,>=2.18.7->anndata>=0.10.8->scanpy) (6.0.3)
Downloading scanpy-1.12.1-py3-none-any.whl (2.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 55.0 MB/s eta 0:00:00
?25hDownloading anndata-0.12.16-py3-none-any.whl (175 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 175.3/175.3 kB 20.2 MB/s eta 0:00:00
?25hDownloading fast_array_utils-1.4.1-py3-none-any.whl (39 kB)
Downloading legacy_api_wrap-1.5-py3-none-any.whl (10 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 124.4 MB/s eta 0:00:00
?25hDownloading session_info2-0.4.1-py3-none-any.whl (17 kB)
Downloading array_api_compat-1.14.0-py3-none-any.whl (60 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.1/60.1 kB 7.0 MB/s eta 0:00:00
?25hDownloading scverse_misc-0.0.7-py3-none-any.whl (13 kB)
Downloading zarr-3.2.1-py3-none-any.whl (319 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 319.6/319.6 kB 37.1 MB/s eta 0:00:00
?25hDownloading donfig-0.8.1.post1-py3-none-any.whl (21 kB)
Downloading numcodecs-0.16.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (9.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.2/9.2 MB 138.6 MB/s eta 0:00:00
?25hInstalling collected packages: session-info2, numcodecs, legacy-api-wrap, fast-array-utils, donfig, array-api-compat, zarr, scverse-misc, pandas, anndata, scanpy
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
Successfully installed anndata-0.12.16 array-api-compat-1.14.0 donfig-0.8.1.post1 fast-array-utils-1.4.1 legacy-api-wrap-1.5 numcodecs-0.16.5 pandas-2.3.3 scanpy-1.12.1 scverse-misc-0.0.7 session-info2-0.4.1 zarr-3.2.1
Collecting diptest
  Downloading diptest-0.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (6.2 kB)
Requirement already satisfied: psutil in /usr/local/lib/python3.12/dist-packages (from diptest) (5.9.5)
Requirement already satisfied: numpy>=1.18 in /usr/local/lib/python3.12/dist-packages (from diptest) (2.0.2)
Downloading diptest-0.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (238 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 238.0/238.0 kB 9.0 MB/s eta 0:00:00
?25hInstalling collected packages: diptest
Successfully installed diptest-0.11.0

Importing libraries

import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde
from diptest import diptest
from binning import *
"""
We assume that the spot coordinates (available with the datasets) are in micrometres (µm).
"""
'\nWe assume that the spot coordinates (available with the datasets) are in micrometres (µm).\n'
# --- Input / output locations (on the mounted Google Drive) ---
data_path = "/content/drive/MyDrive/Major_project/Benchmarking_Shared/spDDB_tutorials/3_data/"

# Unbinned spatial dataset that is handed to binning().
unbinned_st_path = f"{data_path}unbinned_ST/ST_embeddings.h5ad"

# Destination of the binned AnnData written at the end of the notebook.
binned_path = f"{data_path}ST_55_python.h5ad"

# --- Binning parameters ---
dataset = "merfish_lung_cancer"
celltype_col = "celltype"
# Original author's note on other values tried: "55 # 550 -> 510 # 100->6, 9, etc."
# Presumably expressed in the same unit as the spot coordinates - confirm in binning.py.
spot_diameter = 55
# Presumably the indices of the two spatial coordinate axes - confirm in binning.py.
coord1, coord2 = 0, 1

combined_adata = binning(
    unbinned_st_path,
    spot_diameter,
    celltype_col,
    dataset,
    coord1,
    coord2,
)
print(combined_adata)
AnnData object with n_obs × n_vars = 13792 × 488
    obs: 'Monocytes1', 'Macrophages3', 'Macrophages1', 'Epithelial_cells', 'Macrophages2', 'Neutrophils_cells', 'Dendritic_cells', 'Stromal_cells', 'B_cells', 'T_cells', 'Endothelial_cells', 'NK_cells', 'Monocytes2', 'dataset'
    obsm: 'spatial'

Unbinned dataset

# Load the unbinned dataset, store the row-wise sum of X for every observation
# in obs["umi_count"], and colour the spatial embedding by that value.
st_adata = sc.read_h5ad(unbinned_st_path)
# X.sum(axis=1) is 1-D for a dense array but an (n, 1) np.matrix for a scipy
# sparse matrix; flatten it so the obs column is always one-dimensional.
st_adata.obs["umi_count"] = np.asarray(st_adata.X.sum(axis=1)).ravel()
# `title` is documented as a string (or one string per panel), so the shape
# tuple is formatted explicitly instead of being passed as a tuple of ints.
sc.pl.embedding(st_adata, basis="spatial", color="umi_count", title=str(st_adata.shape))
../_images/4f186a3fe5cb0f9126406175f548de08d49f62146cf10b6e564f7ef900499fb3.png

Binned dataset

# Store the row-wise sum of X for every binned spot in obs["umi_count"] and
# colour the spatial embedding of the binned data by that value.
# X.sum(axis=1) is 1-D for a dense array but an (n, 1) np.matrix for a scipy
# sparse matrix; flatten it so the obs column is always one-dimensional.
combined_adata.obs["umi_count"] = np.asarray(combined_adata.X.sum(axis=1)).ravel()
# `title` is documented as a string (or one string per panel), so the shape
# tuple is formatted explicitly instead of being passed as a tuple of ints.
sc.pl.embedding(combined_adata, basis="spatial", color="umi_count", title=str(combined_adata.shape))
../_images/dad4d2c7878cc562dcd79e94bf6f8426066cf66cd65ab4cd71ffafd4cea653cd.png
# Build the per-spot cell-type proportion table from `obs`.
# The non-cell-type columns are removed by name rather than by position, so
# the result does not depend on whether (or in which order) the "umi_count"
# column was appended to `obs` before this cell runs.
dt = combined_adata.obs
dt = dt.drop(columns=["dataset", "umi_count"], errors="ignore")
# Normalise every row so the remaining columns sum to 1 per spot.
dt_ = dt.div(dt.sum(axis=1), axis=0)
# Sort the columns alphabetically for a stable ordering on the x axis.
dt_ = dt_.sort_index(axis=1)

# Violin plot for more detailed distribution visualization
plt.figure(figsize=(10, 6))
sns.violinplot(data=dt_)
plt.title('Violin Plot of Proportions Across Columns')
plt.ylabel('Proportion Value')
plt.xlabel('Proportion Categories')
# Rotate the category labels (applied after the plot is created) so the long
# cell-type names do not overlap.
plt.xticks(rotation=45)
plt.show()
../_images/f7f604a7fb8781afde13b28c2e9153ecedfcd7a2ee6c45cc36661a7f5d3c9dfe.png
# Persist the binned AnnData (including the obs columns added above) to the
# output path configured earlier.
combined_adata.write_h5ad(filename=binned_path)