DatasetCollection population and expression feature spaces¶

This notebook creates toy cell-level data for multiple datasets, then calculates dataset-level feature spaces with DatasetCollection. The toy data has ten cell types grouped into broader immune categories, a T-cell proportion gradient across datasets, and toy gene expression that lets us calculate a CD8 T-cell category signature across dataset space.

In [ ]:

Copied!





import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from anndata import AnnData

import celldega as dega
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from anndata import AnnData

import celldega as dega

In [ ]:

Copied!





rng = np.random.default_rng(42)

cell_types = [
    "CD4 T",
    "CD8 T",
    "Treg",
    "B cell",
    "Plasma cell",
    "NK",
    "Monocyte",
    "Macrophage",
    "Dendritic cell",
    "Neutrophil",
]
immune_group = {
    "CD4 T": "T/NK",
    "CD8 T": "T/NK",
    "Treg": "T/NK",
    "NK": "T/NK",
    "B cell": "B/plasma",
    "Plasma cell": "B/plasma",
    "Monocyte": "myeloid",
    "Macrophage": "myeloid",
    "Dendritic cell": "myeloid",
    "Neutrophil": "granulocyte",
}

genes = [
    "CD3D",
    "CD3E",
    "TRAC",
    "GZMB",
    "IFNG",
    "MS4A1",
    "CD79A",
    "MZB1",
    "LYZ",
    "S100A8",
    "FCGR3A",
    "ITGAX",
]
gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}

condition_templates = {
    "inflamed": np.array([5, 6, 2, 2, 1, 3, 5, 4, 3, 4], dtype=float),
    "cold": np.array([2, 2, 1, 4, 3, 1, 2, 2, 1, 1], dtype=float),
    "myeloid_rich": np.array([2, 2, 1, 1, 1, 1, 6, 7, 4, 2], dtype=float),
}
t_cell_boost = np.array([10, 12, 4, 0, 0, 3, 0, 0, 0, 0], dtype=float)


def expression_mean(cell_type, sample_progress):
    mean = np.full(len(genes), 0.08, dtype=float)

    if cell_type in {"CD4 T", "CD8 T", "Treg", "NK"}:
        mean[[gene_to_idx["CD3D"], gene_to_idx["CD3E"], gene_to_idx["TRAC"]]] += [2.2, 2.0, 2.4]
    if cell_type == "CD8 T":
        mean[[gene_to_idx["GZMB"], gene_to_idx["IFNG"]]] += [1.2 + 3.0 * sample_progress, 0.7 + 2.2 * sample_progress]
    if cell_type == "NK":
        mean[[gene_to_idx["GZMB"], gene_to_idx["FCGR3A"]]] += [1.6, 1.8]
    if cell_type == "B cell":
        mean[[gene_to_idx["MS4A1"], gene_to_idx["CD79A"]]] += [2.4, 2.2]
    if cell_type == "Plasma cell":
        mean[[gene_to_idx["MZB1"], gene_to_idx["CD79A"]]] += [3.2, 1.0]
    if cell_type in {"Monocyte", "Macrophage", "Dendritic cell", "Neutrophil"}:
        mean[[gene_to_idx["LYZ"], gene_to_idx["S100A8"]]] += [2.3, 1.8]
    if cell_type == "Dendritic cell":
        mean[gene_to_idx["ITGAX"]] += 2.4
    if cell_type == "Macrophage":
        mean[gene_to_idx["FCGR3A"]] += 1.2

    return mean


rows = []
expression_rows = []
n_samples = 12
for sample_idx in range(n_samples):
    sample_id = f"sample_{sample_idx:02d}"
    patient_id = f"patient_{sample_idx // 2:02d}"
    condition = ["cold", "myeloid_rich", "inflamed"][sample_idx % 3]
    sample_progress = sample_idx / (n_samples - 1)
    n_cells = int(rng.integers(500, 900))

    alpha = condition_templates[condition] + sample_progress * t_cell_boost
    alpha = alpha * rng.uniform(0.9, 1.1, size=len(cell_types))
    proportions = rng.dirichlet(alpha)
    counts = rng.multinomial(n_cells, proportions)

    for cell_type, count in zip(cell_types, counts, strict=False):
        for _ in range(count):
            rows.append(
                {
                    "sample_id": sample_id,
                    "patient_id": patient_id,
                    "condition": condition,
                    "sample_order": sample_idx,
                    "t_cell_gradient": sample_progress,
                    "cell_type": cell_type,
                    "immune_group": immune_group[cell_type],
                }
            )
            expression_rows.append(rng.poisson(expression_mean(cell_type, sample_progress)))

obs = pd.DataFrame(rows)
obs.index = [f"cell_{i:05d}" for i in range(len(obs))]
X = np.vstack(expression_rows).astype(float)

adata = AnnData(
    X=X,
    obs=obs,
    var=pd.DataFrame(index=genes),
)
adata
rng = np.random.default_rng(42)

cell_types = [
    "CD4 T",
    "CD8 T",
    "Treg",
    "B cell",
    "Plasma cell",
    "NK",
    "Monocyte",
    "Macrophage",
    "Dendritic cell",
    "Neutrophil",
]
immune_group = {
    "CD4 T": "T/NK",
    "CD8 T": "T/NK",
    "Treg": "T/NK",
    "NK": "T/NK",
    "B cell": "B/plasma",
    "Plasma cell": "B/plasma",
    "Monocyte": "myeloid",
    "Macrophage": "myeloid",
    "Dendritic cell": "myeloid",
    "Neutrophil": "granulocyte",
}

genes = [
    "CD3D",
    "CD3E",
    "TRAC",
    "GZMB",
    "IFNG",
    "MS4A1",
    "CD79A",
    "MZB1",
    "LYZ",
    "S100A8",
    "FCGR3A",
    "ITGAX",
]
gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}

condition_templates = {
    "inflamed": np.array([5, 6, 2, 2, 1, 3, 5, 4, 3, 4], dtype=float),
    "cold": np.array([2, 2, 1, 4, 3, 1, 2, 2, 1, 1], dtype=float),
    "myeloid_rich": np.array([2, 2, 1, 1, 1, 1, 6, 7, 4, 2], dtype=float),
}
t_cell_boost = np.array([10, 12, 4, 0, 0, 3, 0, 0, 0, 0], dtype=float)


def expression_mean(cell_type, sample_progress):
    mean = np.full(len(genes), 0.08, dtype=float)

    if cell_type in {"CD4 T", "CD8 T", "Treg", "NK"}:
        mean[[gene_to_idx["CD3D"], gene_to_idx["CD3E"], gene_to_idx["TRAC"]]] += [2.2, 2.0, 2.4]
    if cell_type == "CD8 T":
        mean[[gene_to_idx["GZMB"], gene_to_idx["IFNG"]]] += [1.2 + 3.0 * sample_progress, 0.7 + 2.2 * sample_progress]
    if cell_type == "NK":
        mean[[gene_to_idx["GZMB"], gene_to_idx["FCGR3A"]]] += [1.6, 1.8]
    if cell_type == "B cell":
        mean[[gene_to_idx["MS4A1"], gene_to_idx["CD79A"]]] += [2.4, 2.2]
    if cell_type == "Plasma cell":
        mean[[gene_to_idx["MZB1"], gene_to_idx["CD79A"]]] += [3.2, 1.0]
    if cell_type in {"Monocyte", "Macrophage", "Dendritic cell", "Neutrophil"}:
        mean[[gene_to_idx["LYZ"], gene_to_idx["S100A8"]]] += [2.3, 1.8]
    if cell_type == "Dendritic cell":
        mean[gene_to_idx["ITGAX"]] += 2.4
    if cell_type == "Macrophage":
        mean[gene_to_idx["FCGR3A"]] += 1.2

    return mean


rows = []
expression_rows = []
n_samples = 12
for sample_idx in range(n_samples):
    sample_id = f"sample_{sample_idx:02d}"
    patient_id = f"patient_{sample_idx // 2:02d}"
    condition = ["cold", "myeloid_rich", "inflamed"][sample_idx % 3]
    sample_progress = sample_idx / (n_samples - 1)
    n_cells = int(rng.integers(500, 900))

    alpha = condition_templates[condition] + sample_progress * t_cell_boost
    alpha = alpha * rng.uniform(0.9, 1.1, size=len(cell_types))
    proportions = rng.dirichlet(alpha)
    counts = rng.multinomial(n_cells, proportions)

    for cell_type, count in zip(cell_types, counts, strict=False):
        for _ in range(count):
            rows.append(
                {
                    "sample_id": sample_id,
                    "patient_id": patient_id,
                    "condition": condition,
                    "sample_order": sample_idx,
                    "t_cell_gradient": sample_progress,
                    "cell_type": cell_type,
                    "immune_group": immune_group[cell_type],
                }
            )
            expression_rows.append(rng.poisson(expression_mean(cell_type, sample_progress)))

obs = pd.DataFrame(rows)
obs.index = [f"cell_{i:05d}" for i in range(len(obs))]
X = np.vstack(expression_rows).astype(float)

adata = AnnData(
    X=X,
    obs=obs,
    var=pd.DataFrame(index=genes),
)
adata

In [ ]:

Copied!

adata.obs.head()
adata.obs.head()

In [ ]:

Copied!

adata.obs['sample_id'].value_counts()
adata.obs['sample_id'].value_counts()

In [ ]:

Copied!





dset = dega.dataset.DatasetCollection(
    adata,
    dataset_col="sample_id",
    obs_columns=["patient_id", "condition", "sample_order", "t_cell_gradient"],
)

dset.obs.head()
dset = dega.dataset.DatasetCollection(
    adata,
    dataset_col="sample_id",
    obs_columns=["patient_id", "condition", "sample_order", "t_cell_gradient"],
)

dset.obs.head()

In [ ]:

Copied!





dset.calc_population(
    adata,
    category="cell_type",
    modality_name="cell_type_population",
    output="proportion",
)

dset.calc_population(
    adata,
    category="immune_group",
    modality_name="immune_group_population",
    output="proportion",
)

cell_type_space = dset.mod["cell_type_population"]
immune_group_space = dset.mod["immune_group_population"]

list(dset.mod)
dset.calc_population(
    adata,
    category="cell_type",
    modality_name="cell_type_population",
    output="proportion",
)

dset.calc_population(
    adata,
    category="immune_group",
    modality_name="immune_group_population",
    output="proportion",
)

cell_type_space = dset.mod["cell_type_population"]
immune_group_space = dset.mod["immune_group_population"]

list(dset.mod)

In [ ]:

Copied!





ordered_samples = dset.obs.sort_values("sample_order").index
cell_type_proportions = cell_type_space.to_df().loc[ordered_samples]

ax = cell_type_proportions.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 4),
    colormap="tab20",
    width=0.9,
)
ax.set_xlabel("Dataset")
ax.set_ylabel("Cell type proportion")
ax.set_title("Toy datasets with increasing T-cell abundance")
ax.legend(title="Cell type", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
ordered_samples = dset.obs.sort_values("sample_order").index
cell_type_proportions = cell_type_space.to_df().loc[ordered_samples]

ax = cell_type_proportions.plot(
    kind="bar",
    stacked=True,
    figsize=(12, 4),
    colormap="tab20",
    width=0.9,
)
ax.set_xlabel("Dataset")
ax.set_ylabel("Cell type proportion")
ax.set_title("Toy datasets with increasing T-cell abundance")
ax.legend(title="Cell type", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()

In [ ]:

Copied!

cell_type_space.to_df().loc[ordered_samples].head()
cell_type_space.to_df().loc[ordered_samples].head()

In [ ]:

Copied!





target_cell_type = "CD8 T"

dset.calc_signature(
    adata,
    category="cell_type",
    value=target_cell_type,
    modality_name="cd8_t_expression",
    missing_datasets="nan",
)

cd8_t_expression = dset.mod["cd8_t_expression"]
cd8_t_expression
target_cell_type = "CD8 T"

dset.calc_signature(
    adata,
    category="cell_type",
    value=target_cell_type,
    modality_name="cd8_t_expression",
    missing_datasets="nan",
)

cd8_t_expression = dset.mod["cd8_t_expression"]
cd8_t_expression

In [ ]:

Copied!

mat = dega.clust.Matrix(cd8_t_expression)
mat.norm(axis='row', by='zscore')
mat.cluster()
mat = dega.clust.Matrix(cd8_t_expression)
mat.norm(axis='row', by='zscore')
mat.cluster()

In [ ]:

Copied!

dega.viz.Clustergram(matrix=mat)
dega.viz.Clustergram(matrix=mat)

In [ ]:

Copied!





marker_genes = ["CD3D", "GZMB", "IFNG"]
cd8_expression_df = cd8_t_expression.to_df().loc[ordered_samples, marker_genes]

ax = cd8_expression_df.plot(marker="o", figsize=(10, 4))
ax.set_xlabel("Dataset")
ax.set_ylabel("CD8 T pseudobulk expression (log1p CPM)")
ax.set_title("CD8 T-cell expression changes across dataset space")
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
marker_genes = ["CD3D", "GZMB", "IFNG"]
cd8_expression_df = cd8_t_expression.to_df().loc[ordered_samples, marker_genes]

ax = cd8_expression_df.plot(marker="o", figsize=(10, 4))
ax.set_xlabel("Dataset")
ax.set_ylabel("CD8 T pseudobulk expression (log1p CPM)")
ax.set_title("CD8 T-cell expression changes across dataset space")
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()

In [ ]:

Copied!

immune_group_space.to_df().loc[ordered_samples].head()
immune_group_space.to_df().loc[ordered_samples].head()

Each modality has the same dataset/sample observation axis as dset.obs, but a different feature axis. Here cell_type_population and immune_group_population describe composition, while cd8_t_expression stores a dataset-level gene-expression signature for one selected cell type.

In [ ]:

Copied!

dset.mod
dset.mod

In [ ]: