NeighborhoodCollection population and gene feature spaces¶
This notebook builds hextile neighborhoods, then calculates two neighborhood-level feature spaces with NeighborhoodCollection — a neighborhood-by-population modality and a neighborhood-by-gene (expression signature) modality — and stores them as MuData modalities on a shared observation axis. It downloads an example .h5ad from the Broad Institute Celldega supporting-data repository on Hugging Face at runtime.
from pathlib import Path
from urllib.parse import quote
import numpy as np
import pandas as pd
import requests
import scanpy as sc
from scipy import sparse
import celldega as dega
REPO_ID = "broadinstitute/Celldega_Supporting_Data"
REVISION = "main"
CACHE_DIR = Path("data/celldega_supporting_data")
H5AD_PATH = "Xenium_Prime_Human_Skin_FFPE_outs.h5ad"
def download_repo_file(repo_id, repo_path, cache_dir, revision="main"):
cache_dir.mkdir(parents=True, exist_ok=True)
local_path = cache_dir / Path(repo_path).name
if local_path.exists():
return local_path
encoded_path = quote(repo_path)
url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{encoded_path}"
with requests.get(url, stream=True, timeout=60) as response:
response.raise_for_status()
with local_path.open("wb") as handle:
for chunk in response.iter_content(chunk_size=1024 * 1024):
if chunk:
handle.write(chunk)
return local_path
local_h5ad = download_repo_file(REPO_ID, H5AD_PATH, CACHE_DIR, REVISION)
adata = sc.read_h5ad(local_h5ad)
adata
def ensure_spatial_coordinates(adata):
if "spatial" in adata.obsm:
return
coordinate_pairs = [
("x_centroid", "y_centroid"),
("x_location", "y_location"),
("x", "y"),
("X_centroid", "Y_centroid"),
]
for x_col, y_col in coordinate_pairs:
if x_col in adata.obs and y_col in adata.obs:
adata.obsm["spatial"] = adata.obs[[x_col, y_col]].to_numpy()
return
raise ValueError("The AnnData needs adata.obsm['spatial'] or x/y centroid columns.")
ensure_spatial_coordinates(adata)
population_col = "leiden"
if population_col not in adata.obs:
raise ValueError(f"Expected adata.obs[{population_col!r}] in this example.")
population_col
gdf_hex = dega.nbhd.generate_hextile(adata, diameter=100)
gdf_hex.head()
nbhd = dega.nbhd.NeighborhoodCollection(
gdf=gdf_hex,
nbhd_type="hextile",
)
nbhd.calc_nbhd_by_pop(
adata,
category=population_col,
modality_name="population",
output="counts",
min_cells=5,
)
population = nbhd.mod["population"]
population
nbhd_cluster_key = "nbhd_population_leiden"
X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)
cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)
if len(valid_positions) == 1:
cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
X_valid = X[valid_positions]
X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)
leiden_adata = population[valid_positions, :].copy()
leiden_adata.X = X_valid
n_neighbors = min(15, max(2, len(valid_positions) - 1))
sc.pp.neighbors(leiden_adata, n_neighbors=n_neighbors, use_rep="X")
sc.tl.leiden(
leiden_adata,
key_added=nbhd_cluster_key,
random_state=0,
flavor="igraph",
n_iterations=2,
directed=False,
)
cluster_labels.iloc[valid_positions] = leiden_adata.obs[nbhd_cluster_key].astype(str).to_numpy()
population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)
cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {cluster: palette[i % len(palette)] for i, cluster in enumerate(clusters)}
cluster_colors["unclustered"] = "#bdbdbd"
nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")
nbhd.obs[[nbhd_cluster_key, "color"]].head()
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans
nbhd_cluster_key = "nbhd_population_cluster"
X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)
cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)
if len(valid_positions) == 1:
cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
X_valid = X[valid_positions]
X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)
n_unique = np.unique(np.round(X_valid, 8), axis=0).shape[0]
n_clusters = min(8, max(2, int(np.sqrt(len(valid_positions)))), n_unique)
if n_clusters <= 1:
labels = np.zeros(len(valid_positions), dtype=int)
else:
model = MiniBatchKMeans(
n_clusters=n_clusters,
random_state=0,
n_init=10,
batch_size=min(1024, max(100, len(valid_positions))),
)
labels = model.fit_predict(X_valid)
cluster_labels.iloc[valid_positions] = labels.astype(str)
population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)
cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {
cluster: palette[i % len(palette)]
for i, cluster in enumerate(clusters)
}
cluster_colors["unclustered"] = "#bdbdbd"
nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")
nbhd.obs[[nbhd_cluster_key, "color"]].head()
nbhd.geometry.plot()
nbhd.collection_type
nbhd.mod
The population modality is stored inside the same collection object. The canonical observation axis is nbhd.obs, while nbhd.mod["population"].X is the clusterable neighborhood-by-population matrix.
mat = dega.clust.Matrix(
population,
row_attr=[population_col],
col_attr=[nbhd_cluster_key],
)
mat.row_entity, mat.col_entity
mat.cluster()
cgm = dega.viz.Clustergram(matrix=mat)
cgm
base_url = 'https://raw.githubusercontent.com/broadinstitute/celldega_Xenium_Prime_Human_Skin_FFPE_outs/main/Xenium_Prime_Human_Skin_FFPE_outs'
nbhd.obs[["neighborhood_id", nbhd_cluster_key, "cat", "color"]].head()
nbhd.geometry.head()
landscape = dega.viz.Landscape(
technology='Xenium',
base_url = base_url,
height=550,
adata=adata,
nbhd=nbhd
)
dega.viz.landscape_clustergram(landscape, cgm)
Neighborhood-by-gene signature¶
The same hextile neighborhoods can also carry a gene-expression feature space. calc_nbhd_by_gene computes mean expression per neighborhood from the cell-level adata and attaches it as a second modality alongside population, sharing the same nbhd.obs observation axis. We pass drop_missing=False so the neighborhood axis is unchanged (zero-filled rows for any neighborhood below min_cells), keeping the two modalities aligned.
nbhd.calc_nbhd_by_gene(
adata=adata,
by="cell",
modality_name="gene",
min_cells=5,
drop_missing=False,
)
gene = nbhd.mod["gene"]
gene
mat_gene = dega.clust.Matrix(gene)
mat_gene.norm(axis='row', by='zscore')
cgm_gene =
# both feature spaces now live on the same neighborhood collection
list(nbhd.mod)