NeighborhoodCollection population and gene feature spaces¶

This notebook builds hextile neighborhoods, then calculates two neighborhood-level feature spaces with NeighborhoodCollection — a neighborhood-by-population modality and a neighborhood-by-gene (expression signature) modality — and stores them as MuData modalities on a shared observation axis. It downloads an example .h5ad from the Broad Institute Celldega supporting-data repository on Hugging Face at runtime.

In [ ]:

Copied!





from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
import scanpy as sc
from scipy import sparse

import celldega as dega
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
import scanpy as sc
from scipy import sparse

import celldega as dega

In [ ]:

Copied!





REPO_ID = "broadinstitute/Celldega_Supporting_Data"
REVISION = "main"
CACHE_DIR = Path("data/celldega_supporting_data")
H5AD_PATH = "Xenium_Prime_Human_Skin_FFPE_outs.h5ad"


def download_repo_file(repo_id, repo_path, cache_dir, revision="main"):
    cache_dir.mkdir(parents=True, exist_ok=True)
    local_path = cache_dir / Path(repo_path).name
    if local_path.exists():
        return local_path

    encoded_path = quote(repo_path)
    url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{encoded_path}"
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with local_path.open("wb") as handle:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    handle.write(chunk)
    return local_path


local_h5ad = download_repo_file(REPO_ID, H5AD_PATH, CACHE_DIR, REVISION)
adata = sc.read_h5ad(local_h5ad)
adata
REPO_ID = "broadinstitute/Celldega_Supporting_Data"
REVISION = "main"
CACHE_DIR = Path("data/celldega_supporting_data")
H5AD_PATH = "Xenium_Prime_Human_Skin_FFPE_outs.h5ad"


def download_repo_file(repo_id, repo_path, cache_dir, revision="main"):
    cache_dir.mkdir(parents=True, exist_ok=True)
    local_path = cache_dir / Path(repo_path).name
    if local_path.exists():
        return local_path

    encoded_path = quote(repo_path)
    url = f"https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{encoded_path}"
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with local_path.open("wb") as handle:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    handle.write(chunk)
    return local_path


local_h5ad = download_repo_file(REPO_ID, H5AD_PATH, CACHE_DIR, REVISION)
adata = sc.read_h5ad(local_h5ad)
adata

In [ ]:

Copied!





def ensure_spatial_coordinates(adata):
    if "spatial" in adata.obsm:
        return

    coordinate_pairs = [
        ("x_centroid", "y_centroid"),
        ("x_location", "y_location"),
        ("x", "y"),
        ("X_centroid", "Y_centroid"),
    ]
    for x_col, y_col in coordinate_pairs:
        if x_col in adata.obs and y_col in adata.obs:
            adata.obsm["spatial"] = adata.obs[[x_col, y_col]].to_numpy()
            return

    raise ValueError("The AnnData needs adata.obsm['spatial'] or x/y centroid columns.")


ensure_spatial_coordinates(adata)
population_col = "leiden"
if population_col not in adata.obs:
    raise ValueError(f"Expected adata.obs[{population_col!r}] in this example.")
population_col
def ensure_spatial_coordinates(adata):
    if "spatial" in adata.obsm:
        return

    coordinate_pairs = [
        ("x_centroid", "y_centroid"),
        ("x_location", "y_location"),
        ("x", "y"),
        ("X_centroid", "Y_centroid"),
    ]
    for x_col, y_col in coordinate_pairs:
        if x_col in adata.obs and y_col in adata.obs:
            adata.obsm["spatial"] = adata.obs[[x_col, y_col]].to_numpy()
            return

    raise ValueError("The AnnData needs adata.obsm['spatial'] or x/y centroid columns.")


ensure_spatial_coordinates(adata)
population_col = "leiden"
if population_col not in adata.obs:
    raise ValueError(f"Expected adata.obs[{population_col!r}] in this example.")
population_col

In [ ]:

Copied!

gdf_hex = dega.nbhd.generate_hextile(adata, diameter=100)
gdf_hex.head()
gdf_hex = dega.nbhd.generate_hextile(adata, diameter=100)
gdf_hex.head()

In [ ]:

Copied!





nbhd = dega.nbhd.NeighborhoodCollection(
    gdf=gdf_hex,
    nbhd_type="hextile",
)

nbhd.calc_population(
    adata,
    category=population_col,
    modality_name="population",
    output="counts",
    min_cells=5,
)

population = nbhd.mod["population"]
population
nbhd = dega.nbhd.NeighborhoodCollection(
    gdf=gdf_hex,
    nbhd_type="hextile",
)

nbhd.calc_population(
    adata,
    category=population_col,
    modality_name="population",
    output="counts",
    min_cells=5,
)

population = nbhd.mod["population"]
population

In [ ]:

Copied!





nbhd_cluster_key = "nbhd_population_leiden"

X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)

cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)

if len(valid_positions) == 1:
    cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
    X_valid = X[valid_positions]
    X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)

    leiden_adata = population[valid_positions, :].copy()
    leiden_adata.X = X_valid
    n_neighbors = min(15, max(2, len(valid_positions) - 1))
    sc.pp.neighbors(leiden_adata, n_neighbors=n_neighbors, use_rep="X")
    sc.tl.leiden(
        leiden_adata,
        key_added=nbhd_cluster_key,
        random_state=0,
        flavor="igraph",
        n_iterations=2,
        directed=False,
    )
    cluster_labels.iloc[valid_positions] = leiden_adata.obs[nbhd_cluster_key].astype(str).to_numpy()

population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
    cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)

cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {cluster: palette[i % len(palette)] for i, cluster in enumerate(clusters)}
cluster_colors["unclustered"] = "#bdbdbd"

nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")

nbhd.obs[[nbhd_cluster_key, "color"]].head()
nbhd_cluster_key = "nbhd_population_leiden"

X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)

cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)

if len(valid_positions) == 1:
    cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
    X_valid = X[valid_positions]
    X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)

    leiden_adata = population[valid_positions, :].copy()
    leiden_adata.X = X_valid
    n_neighbors = min(15, max(2, len(valid_positions) - 1))
    sc.pp.neighbors(leiden_adata, n_neighbors=n_neighbors, use_rep="X")
    sc.tl.leiden(
        leiden_adata,
        key_added=nbhd_cluster_key,
        random_state=0,
        flavor="igraph",
        n_iterations=2,
        directed=False,
    )
    cluster_labels.iloc[valid_positions] = leiden_adata.obs[nbhd_cluster_key].astype(str).to_numpy()

population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
    cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)

cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {cluster: palette[i % len(palette)] for i, cluster in enumerate(clusters)}
cluster_colors["unclustered"] = "#bdbdbd"

nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")

nbhd.obs[[nbhd_cluster_key, "color"]].head()

In [ ]:

Copied!





import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans

nbhd_cluster_key = "nbhd_population_cluster"

X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)

cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)

if len(valid_positions) == 1:
    cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
    X_valid = X[valid_positions]
    X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)
    n_unique = np.unique(np.round(X_valid, 8), axis=0).shape[0]
    n_clusters = min(8, max(2, int(np.sqrt(len(valid_positions)))), n_unique)

    if n_clusters <= 1:
        labels = np.zeros(len(valid_positions), dtype=int)
    else:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            random_state=0,
            n_init=10,
            batch_size=min(1024, max(100, len(valid_positions))),
        )
        labels = model.fit_predict(X_valid)

    cluster_labels.iloc[valid_positions] = labels.astype(str)

population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
    cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)

cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {
    cluster: palette[i % len(palette)]
    for i, cluster in enumerate(clusters)
}
cluster_colors["unclustered"] = "#bdbdbd"
nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")

nbhd.obs[[nbhd_cluster_key, "color"]].head()
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans

nbhd_cluster_key = "nbhd_population_cluster"

X = population.X.toarray() if sparse.issparse(population.X) else np.asarray(population.X)
X = np.asarray(X, dtype=float)
row_sums = X.sum(axis=1)
valid = np.isfinite(X).all(axis=1) & (row_sums > 0)

cluster_labels = pd.Series("unclustered", index=population.obs_names.astype(str), dtype="object")
valid_positions = np.flatnonzero(valid)

if len(valid_positions) == 1:
    cluster_labels.iloc[valid_positions] = "0"
elif len(valid_positions) > 1:
    X_valid = X[valid_positions]
    X_valid = X_valid / X_valid.sum(axis=1, keepdims=True)
    n_unique = np.unique(np.round(X_valid, 8), axis=0).shape[0]
    n_clusters = min(8, max(2, int(np.sqrt(len(valid_positions)))), n_unique)

    if n_clusters <= 1:
        labels = np.zeros(len(valid_positions), dtype=int)
    else:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            random_state=0,
            n_init=10,
            batch_size=min(1024, max(100, len(valid_positions))),
        )
        labels = model.fit_predict(X_valid)

    cluster_labels.iloc[valid_positions] = labels.astype(str)

population.obs[nbhd_cluster_key] = cluster_labels.astype("category")
nbhd.obs[nbhd_cluster_key] = (
    cluster_labels.reindex(nbhd.obs.index.astype(str)).fillna("unclustered").astype("category")
)

cluster_values = nbhd.obs[nbhd_cluster_key].astype(str)
palette = sc.pl.palettes.default_20
clusters = [cluster for cluster in sorted(cluster_values.dropna().unique()) if cluster != "unclustered"]
cluster_colors = {
    cluster: palette[i % len(palette)]
    for i, cluster in enumerate(clusters)
}
cluster_colors["unclustered"] = "#bdbdbd"
nbhd.obs["cat"] = cluster_values
nbhd.obs["color"] = cluster_values.map(cluster_colors).fillna("#4f80ff")

nbhd.obs[[nbhd_cluster_key, "color"]].head()

In [ ]:

Copied!

nbhd.geometry.plot()
nbhd.geometry.plot()

In [ ]:

Copied!

nbhd.collection_type
nbhd.collection_type

In [ ]:

Copied!

nbhd.mod
nbhd.mod

The population modality is stored inside the same collection object. The canonical observation axis is nbhd.obs, while nbhd.mod["population"].X is the clusterable neighborhood-by-population matrix.

In [ ]:

Copied!





mat = dega.clust.Matrix(
    population,
    row_attr=[population_col],
    col_attr=[nbhd_cluster_key],
)
mat.row_entity, mat.col_entity
mat = dega.clust.Matrix(
    population,
    row_attr=[population_col],
    col_attr=[nbhd_cluster_key],
)
mat.row_entity, mat.col_entity

In [ ]:

Copied!

mat.cluster()
mat.cluster()

In [ ]:

Copied!

cgm = dega.viz.Clustergram(matrix=mat)
cgm
cgm = dega.viz.Clustergram(matrix=mat)
cgm

In [ ]:

Copied!

base_url = 'https://raw.githubusercontent.com/broadinstitute/celldega_Xenium_Prime_Human_Skin_FFPE_outs/main/Xenium_Prime_Human_Skin_FFPE_outs'
base_url = 'https://raw.githubusercontent.com/broadinstitute/celldega_Xenium_Prime_Human_Skin_FFPE_outs/main/Xenium_Prime_Human_Skin_FFPE_outs'

In [ ]:

Copied!

nbhd.obs[["neighborhood_id", nbhd_cluster_key, "cat", "color"]].head()
nbhd.obs[["neighborhood_id", nbhd_cluster_key, "cat", "color"]].head()

In [ ]:

Copied!

nbhd.geometry.head()
nbhd.geometry.head()

In [ ]:

Copied!





landscape = dega.viz.Landscape(
    technology='Xenium', 
    base_url = base_url,
    height=550,
    adata=adata,
    nbhd=nbhd
)
landscape = dega.viz.Landscape(
    technology='Xenium', 
    base_url = base_url,
    height=550,
    adata=adata,
    nbhd=nbhd
)

In [ ]:

Copied!

dega.viz.landscape_clustergram(landscape, cgm)
dega.viz.landscape_clustergram(landscape, cgm)

Neighborhood-by-gene signature¶

The same hextile neighborhoods can also carry a gene-expression feature space. calc_signature computes mean expression per neighborhood from the cell-level adata and attaches it as a second modality alongside population, sharing the same nbhd.obs observation axis. We pass drop_missing=False so the neighborhood axis is unchanged (zero-filled rows for any neighborhood below min_cells), keeping the two modalities aligned.

In [ ]:

Copied!





nbhd.calc_signature(
    adata=adata,
    by="cell",
    modality_name="gene",
    min_cells=5,
    drop_missing=False,
)

gene = nbhd.mod["gene"]
gene
nbhd.calc_signature(
    adata=adata,
    by="cell",
    modality_name="gene",
    min_cells=5,
    drop_missing=False,
)

gene = nbhd.mod["gene"]
gene

In [ ]:

Copied!

mat_gene = dega.clust.Matrix(gene)
mat_gene = dega.clust.Matrix(gene)

In [ ]:

Copied!

mat_gene.norm(axis='row', by='zscore')
mat_gene.norm(axis='row', by='zscore')

In [ ]:

Copied!

cgm_gene = 
cgm_gene =

In [ ]:

Copied!

# both feature spaces now live on the same neighborhood collection
list(nbhd.mod)
# both feature spaces now live on the same neighborhood collection
list(nbhd.mod)