Visium-HD Landscape Pre-Processing
In [1]:
%load_ext autoreload
%autoreload 2
%env ANYWIDGET_HMR=1
env: ANYWIDGET_HMR=1
In [2]:
import os
import tarfile
import json
from pathlib import Path
from datetime import date
from collections import defaultdict

import numpy as np
import pandas as pd
import polars as pl
import h5py
import scanpy as sc
import celldega as dega
import tifffile
import fiona
import geopandas as gpd
from glob import glob
from matplotlib import pyplot as plt
from matplotlib.colors import to_hex
from shapely.geometry import Polygon, Point, shape
from scipy.sparse import coo_matrix, csr_matrix, issparse
from ipywidgets import Widget

print(f"Celldega Version: {dega.__version__}")
Celldega Version: 0.13.0
In [3]:
today = date.today()
In [ ]:
technology = "Visium-HD"
path_data = 'data'
dataset_name = 'dataset-name'
bin_size = 2
jitter = 2
image_file_name = 'image-name'
path_landscape_files = f'data/landscape_files/{dataset_name}_{today}_{bin_size}um'
image_tile_layer = 'h&e'
image_scale = 1.0
suffix = '.webp[Q=100]'
tile_size = 500
max_tiles_to_view = 5
Make Landscape Files directory
In [ ]:
os.makedirs(path_landscape_files, exist_ok=True)
Decompress all .tar.gz files in raw data
In [ ]:
def decompress_all_tar_gz(path):
    for file in os.listdir(path):
        if file.endswith(".tar.gz"):
            file_path = os.path.join(path, file)
            print(f"Decompressing: {file}")
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(path)
    print("All files decompressed.")

decompress_all_tar_gz(f"{path_data}/{dataset_name}")
Process Image
In [ ]:
# Path to the OME-TIFF file
file_path = f"{path_data}/{dataset_name}/{image_file_name}"

# Open the OME-TIFF file and read the full-resolution image data
with tifffile.TiffFile(file_path) as tif:
    series = tif.series[0]
    image_data = series.asarray()

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + '/output_regular.tif', image_data, compression=None)

# Convert to PNG and build the DeepZoom pyramid of image tiles
image_png = dega.pre._convert_to_png(path_landscape_files + '/output_regular.tif')
dega.pre.make_deepzoom_pyramid(image_png,
                               path_landscape_files + '/pyramid_images/',
                               image_tile_layer, tile_size=tile_size, suffix=suffix)
In [ ]:
# Image bounds in pixel space; numpy image arrays are indexed (row=y, col=x)
tile_bounds = {}
tile_bounds["x_min"] = 0
tile_bounds["x_max"] = image_data.shape[1]
tile_bounds["y_min"] = 0
tile_bounds["y_max"] = image_data.shape[0]
Process Cells
Note: Cells are in pixel (image) space and are already aligned with the microscope image
In [ ]:
features = []
with fiona.open(f"{path_data}/{dataset_name}/segmented_outputs/cell_segmentations.geojson") as src:
    for feat in src:
        features.append({"geometry": shape(feat["geometry"]),
                         "cell_id": feat["properties"]['cell_id']})

cells = gpd.GeoDataFrame(features)
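Since the segmentations should already sit in pixel space, a quick bounds check against the image dimensions can confirm the alignment. This is a minimal optional sketch, assuming the `cells` and `image_data` objects created above:

In [ ]:
# Sanity check: cell geometries should fall within the image extent
minx, miny, maxx, maxy = cells.total_bounds
height, width = image_data.shape[0], image_data.shape[1]
assert minx >= 0 and maxx <= width, "cell x-coordinates exceed image width"
assert miny >= 0 and maxy <= height, "cell y-coordinates exceed image height"
print(f"Cell bounds fit within the {width} x {height} image")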
In [ ]:
cells.head()
In [ ]:
cells['centroid_x'] = cells.geometry.centroid.x
cells['centroid_y'] = cells.geometry.centroid.y
cells["geometry_point"] = cells.apply(lambda row: Point(row["centroid_x"], row["centroid_y"]), axis=1)

# Prefix numeric cell IDs with "c-" (e.g. 1 -> "c-1")
cells["cell_id"] = cells["cell_id"].astype(str).map(lambda x: f"c-{x}")
cells.head()
In [ ]:
# Build the cell metadata table: cell name plus rounded centroid coordinates
cell_metadata = pd.DataFrame({
    "name": cells["cell_id"],
    "geometry": cells["geometry_point"].apply(lambda p: [round(p.x, 2), round(p.y, 2)]),
})
cell_metadata.to_parquet(path_landscape_files + '/cell_metadata.parquet')
cell_metadata.head()
In [ ]:
output_dir = Path(path_landscape_files + '/cell_clusters')
output_dir.mkdir(parents=True, exist_ok=True)
Saving dummy clusters...
In [ ]:
# def_clusters = pd.DataFrame(index=cells.index.tolist())
# def_clusters['cluster'] = pd.Series(0, index=cells.index.tolist())
In [ ]:
# def_clusters.to_parquet(path_landscape_files + '/cell_clusters/cluster.parquet')
In [ ]:
# meta_cluster = pd.DataFrame()
# meta_cluster.loc['0', 'color'] = '#ff7f0e'
# meta_cluster.loc['0', 'count'] = 1000
# meta_cluster.to_parquet(path_landscape_files + '/cell_clusters/meta_cluster.parquet')
Saving default clusters...
In [ ]:
def_clusters = pd.read_csv(
    f"{path_data}/{dataset_name}/segmented_outputs/analysis/clustering/gene_expression_graphclust/clusters.csv"
)

# Convert barcodes like "cellid_000001-1" to the "c-1" naming used above
def_clusters.index = def_clusters['Barcode'].str.extract(r"cellid_0*(\d+)-")[0] \
    .astype(int).astype(str) \
    .map(lambda x: f"c-{x}")
def_clusters.drop("Barcode", axis=1, inplace=True)
def_clusters.rename(columns={'Cluster': "cluster"}, inplace=True)
def_clusters['cluster'] = def_clusters['cluster'].astype(str)
def_clusters.index.name = ""

# Align clusters to the full set of cells; cells missing an assignment fall into cluster "0"
cells_copy = cells.copy()
cells_copy.set_index("cell_id", inplace=True)
missing = cells_copy.index.difference(def_clusters.index)
print(f"{len(missing)} cells without a cluster assignment; assigning them to cluster '0'")
def_clusters = def_clusters.reindex(cells_copy.index, fill_value=0)
def_clusters['cluster'] = def_clusters['cluster'].astype(str)

def_clusters.to_parquet(path_landscape_files + '/cell_clusters/cluster.parquet')
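As a quick sanity check (a minimal sketch using the objects above), the saved assignments should now cover every segmented cell, with unclustered cells collected under "0":

In [ ]:
# Every cell should have exactly one string-valued cluster label
assert def_clusters.index.equals(cells_copy.index)
def_clusters['cluster'].value_counts()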
In [ ]:
meta_cluster = def_clusters["cluster"].value_counts().to_frame(name="count")

# Assign each cluster a color from the tab10 palette, cycling if needed
cmap = plt.get_cmap("tab10")
n_colors = cmap.N
meta_cluster["color"] = [
    to_hex(cmap(i % n_colors)) for i in range(len(meta_cluster))
]

meta_cluster = meta_cluster[["color", "count"]]
meta_cluster.index = meta_cluster.index.astype("string")
meta_cluster.index.name = "__index_level_0__"
meta_cluster.to_parquet(path_landscape_files + '/cell_clusters/meta_cluster.parquet')
meta_cluster.head()
Create cell tiles
In [ ]:
def simple_format(geometry, image_scale):
    # Divide each coordinate by image_scale to factor in scaling
    return [[[coord[0] / image_scale, coord[1] / image_scale] for coord in polygon] for polygon in geometry]
In [ ]:
def transform_polygon(polygon):
    # Represent the polygon's exterior ring as an array of [x, y] coordinate pairs
    exterior_coords = polygon.exterior.coords
    original_format_coords = np.array([np.array(coord) for coord in exterior_coords])
    return np.array([original_format_coords], dtype=object)
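To illustrate the nested-list format these two helpers produce, here is a sketch on an invented square polygon; with image_scale = 1.0 the five ring coordinates (including the closing coordinate) pass through unchanged:

In [ ]:
# Hypothetical example: a square polygon becomes [[[x, y], ...]], one list per ring
square = Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])
nested = simple_format(transform_polygon(square), image_scale)
print(nested)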
In [ ]:
cells["GEOMETRY"] = cells["geometry"].apply(lambda poly: transform_polygon(poly))
cells["GEOMETRY"] = cells["GEOMETRY"].apply(lambda x: simple_format(x, image_scale))
In [ ]:
cells.rename(columns={"cell_id": "name",
                      "centroid_x": "center_x",
                      "centroid_y": "center_y"}, inplace=True)
cells.drop(["geometry", "geometry_point"], axis=1, inplace=True)
In [ ]:
cells.head()
In [ ]:
tile_size_x = tile_size
tile_size_y = tile_size

x_min = tile_bounds["x_min"]
x_max = tile_bounds["x_max"]
y_min = tile_bounds["y_min"]
y_max = tile_bounds["y_max"]
Calculate the number of tiles needed
In [ ]:
n_tiles_x = int(np.ceil((x_max - x_min) / tile_size_x))
n_tiles_y = int(np.ceil((y_max - y_min) / tile_size_y))
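For example, with the default tile_size of 500 pixels, a hypothetical 20,000 × 18,000 pixel image would yield n_tiles_x = ceil(20000 / 500) = 40 and n_tiles_y = ceil(18000 / 500) = 36, i.e. 1,440 candidate tiles.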
In [ ]:
path_output = path_landscape_files + '/cell_segmentation'
os.makedirs(path_output, exist_ok=True)
In [ ]:
for i in range(n_tiles_x):
    if i % 2 == 0:
        print('row', i)

    for j in range(n_tiles_y):
        tile_x_min = x_min + i * tile_size_x
        tile_x_max = tile_x_min + tile_size_x
        tile_y_min = y_min + j * tile_size_y
        tile_y_max = tile_y_min + tile_size_y

        # Find cell polygons with centroids in this tile
        keep_cells = cells[
            (cells.center_x >= tile_x_min)
            & (cells.center_x < tile_x_max)
            & (cells.center_y >= tile_y_min)
            & (cells.center_y < tile_y_max)
        ].index.tolist()

        # Keep each cell's polygon together with its "c-N" name so the tiles
        # link back to cell_metadata and the cluster assignments
        inst_geo = cells.loc[keep_cells, ["GEOMETRY", "name"]]

        # Save non-empty tiles to Parquet
        filename = f"{path_output}/cell_tile_{i}_{j}.parquet"
        if inst_geo.shape[0] > 0:
            inst_geo[["GEOMETRY", "name"]].to_parquet(filename)
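A quick spot check (a minimal optional sketch, assuming at least one tile was written) confirms the tiles contain the expected GEOMETRY and name columns:

In [ ]:
# Load one saved cell tile and inspect it
cell_tile_files = glob(f"{path_output}/cell_tile_*.parquet")
print(f"{len(cell_tile_files)} cell tiles written")
pd.read_parquet(cell_tile_files[0]).head()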
Process Meta Gene
In [ ]:
adata_cell = sc.read_10x_h5(f"{path_data}/{dataset_name}/segmented_outputs/filtered_feature_cell_matrix.h5")
In [ ]:
list_genes = adata_cell.var.index.tolist()
meta_gene = pd.DataFrame(index=list_genes)

# Build a flat list of hex colors from all "tab" palettes, cycling as needed;
# blank probes are colored white
palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
flat_colors = [color for palette in palettes for color in palette]
flat_colors_hex = [to_hex(color) for color in flat_colors]

colors = [
    flat_colors_hex[i % len(flat_colors_hex)] if "Blank" not in gene else "#FFFFFF"
    for i, gene in enumerate(list_genes)
]
ser_color = pd.Series(colors, index=list_genes)
In [ ]:
# Placeholder statistics (recomputed from the CBG matrix by dega.pre.make_meta_gene below)
meta_gene['mean'] = pd.Series(100, index=list_genes)
meta_gene['std'] = pd.Series(10, index=list_genes)
meta_gene['max'] = pd.Series(100, index=list_genes)
meta_gene['non-zero'] = pd.Series(0.5, index=list_genes)
meta_gene['color'] = ser_color
meta_gene.to_parquet(path_landscape_files + '/meta_gene.parquet')
Cell-by-gene (CBG)
In [ ]:
cbg_df = dega.pre.read_cbg_mtx(f"{path_data}/{dataset_name}/segmented_outputs/filtered_feature_cell_matrix",
                               barcodes_name='barcodes', features_name='features', technology=technology)
In [ ]:
# Rename barcodes to the "c-N" scheme and replace "/" in gene names
cbg_df.index = cbg_df.index.str.extract(r"cellid_0*(\d+)-")[0] \
    .astype(int).astype(str) \
    .map(lambda x: f"c-{x}")
cbg_df.columns = cbg_df.columns.str.replace("/", "_", regex=False)
cbg_df.head()
In [ ]:
def make_column_names_unique_fast(df):
    # Give duplicate column names numeric suffixes (gene, gene_1, gene_2, ...)
    counts = defaultdict(int)
    used = set()
    new_cols = []
    for col in df.columns:
        if col not in used:
            new_cols.append(col)
            used.add(col)
            counts[col] += 1
        else:
            while True:
                new_name = f"{col}_{counts[col]}"
                counts[col] += 1
                if new_name not in used:
                    new_cols.append(new_name)
                    used.add(new_name)
                    break
    df.columns = new_cols
    df = df.rename_axis("__index_level_0__", axis="columns")
    return df
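For illustration, a minimal sketch of how the renaming behaves on a toy DataFrame (the gene symbols here are invented):

In [ ]:
# Duplicate column names receive numeric suffixes; first occurrences are unchanged
demo = pd.DataFrame(columns=["ACTB", "GAPDH", "ACTB", "ACTB"])
demo = make_column_names_unique_fast(demo)
print(demo.columns.tolist())  # ['ACTB', 'GAPDH', 'ACTB_1', 'ACTB_2']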
In [ ]:
cbg_df = make_column_names_unique_fast(cbg_df)
In [ ]:
cbg_df.head()
Signatures
In [ ]:
list_ser = []
clusters = def_clusters["cluster"].unique().tolist()

for inst_cat in clusters:
    if inst_cat is not None:
        inst_cells = def_clusters[def_clusters["cluster"] == inst_cat].index.tolist()

        # Mean expression over the cluster's cells present in the CBG matrix
        if set(inst_cells) & set(cbg_df.index):
            common_cells = list(set(inst_cells) & set(cbg_df.index))
            inst_ser = cbg_df.loc[common_cells].sum() / len(common_cells)
        else:
            genes = cbg_df.columns
            inst_ser = pd.Series(0.0, index=genes)

        inst_ser.name = inst_cat
        list_ser.append(inst_ser)
In [ ]:
# Combine the signatures into a DataFrame
df_sig = pd.concat(list_ser, axis=1)

# Handle potential MultiIndex issues
df_sig.columns = df_sig.columns.tolist()
df_sig.index = df_sig.index.tolist()

# Filter out control probes and unassigned features
keep_genes = df_sig.index.tolist()
keep_genes = [x for x in keep_genes if "Unassigned" not in x]
keep_genes = [x for x in keep_genes if "NegControl" not in x]
keep_genes = [x for x in keep_genes if "DeprecatedCodeword" not in x]

# Subset the DataFrame to the remaining genes and all clusters
df_sig = df_sig.loc[keep_genes, clusters]

# Drop columns that are entirely NaN, then sort rows and columns
df_sig = df_sig.dropna(axis=1, how="all")
df_sig = df_sig.loc[sorted(df_sig.index), sorted(df_sig.columns)]

# Convert only sparse columns to dense
for col in df_sig.columns:
    if isinstance(df_sig[col].dtype, pd.SparseDtype):
        df_sig[col] = df_sig[col].sparse.to_dense()

# Drop all-zero genes, zero-variance genes, and rows with non-finite values
df_sig = df_sig[(df_sig != 0).any(axis=1)]
df_sig = df_sig.loc[df_sig.std(axis=1) != 0]
df_sig = df_sig.replace([np.inf, -np.inf], np.nan).dropna()

# Save
df_sig.to_parquet(f"{path_landscape_files}/df_sig.parquet")
df_sig.head()
Create CBG parquet files
In [ ]:
dega.pre.make_meta_gene(cbg_df, path_landscape_files + '/meta_gene.parquet')
In [ ]:
dega.pre.save_cbg_gene_parquets(technology, path_landscape_files, cbg_df, verbose=True)
Create jittered transcripts
Visium-HD reports binned counts rather than individual molecule positions, so each bin's counts are expanded into single points and given a small random offset (up to ±jitter/2) so overlapping molecules render as distinct transcripts.
In [ ]:
tissue_positions = pd.read_parquet(f"{path_data}/{dataset_name}/binned_outputs/square_00{bin_size}um/spatial/tissue_positions.parquet")
spots = tissue_positions[tissue_positions['in_tissue'] == 1].copy()
spots.plot(kind='scatter', x='pxl_col_in_fullres', y='pxl_row_in_fullres')
In [ ]:
spots.head()
In [ ]:
spots.set_index('barcode', inplace=True)
spots.drop(['array_row', 'array_col'], axis=1, inplace=True)
spots.rename(columns={'pxl_row_in_fullres': "y",
                      'pxl_col_in_fullres': "x"}, inplace=True)
spots.head()
In [ ]:
gene_str_to_int = dega.pre.boundary_tile._get_name_mapping(
    path_landscape_files,
    layer='transcript'
)
In [ ]:
# Read the spot-by-gene matrix for the selected bin size
sbg = dega.pre.read_cbg_mtx(
    f"{path_data}/{dataset_name}/binned_outputs/square_00{bin_size}um/filtered_feature_bc_matrix",
    technology=technology,
    barcodes_name='barcodes'
)
sbg = make_column_names_unique_fast(sbg)
sbg.head()
In [ ]:
total_trx = sbg.sum(axis=0).sum()
print(f"Total transcripts: {total_trx/1e6:.1f}M")
In [ ]:
rng = np.random.default_rng()

out_dir = Path(path_landscape_files + '/transcript_tiles')
out_dir.mkdir(parents=True, exist_ok=True)

for i in range(n_tiles_x):
    if i % 10 == 0:
        print("row", i)

    for j in range(n_tiles_y):
        tile_x_min = tile_bounds["x_min"] + i * tile_size
        tile_x_max = tile_x_min + tile_size
        tile_y_min = tile_bounds["y_min"] + j * tile_size
        tile_y_max = tile_y_min + tile_size

        # Filter spot coordinates in this tile
        tile_spots = spots[
            (spots.x >= tile_x_min)
            & (spots.x < tile_x_max)
            & (spots.y >= tile_y_min)
            & (spots.y < tile_y_max)
        ]
        if tile_spots.empty:
            continue

        inst_spots = tile_spots.index.tolist()

        # Subset the spot-by-gene matrix and convert to SciPy COO sparse format
        tile_sbg = sbg.loc[inst_spots]
        coo = csr_matrix(tile_sbg.values).tocoo()
        if coo.nnz == 0:
            continue

        row = np.array([inst_spots[r] for r in coo.row])
        col = tile_sbg.columns.to_numpy()[coo.col]
        count = coo.data

        # Create a long-format table, repeating each (spot, gene) pair by its count
        df = pd.DataFrame({"spot": row, "gene": col, "count": count})
        df = df[df["count"] > 0]
        df = df.loc[df.index.repeat(df["count"].astype(int))].reset_index(drop=True)

        # Map spot → x, y and gene → integer index
        df["x"] = df["spot"].map(tile_spots["x"])
        df["y"] = df["spot"].map(tile_spots["y"])
        df["name"] = df["gene"].map(gene_str_to_int).astype("int32")

        # Jitter each transcript by up to ±jitter/2 so stacked counts render as distinct points
        pl_df = pl.DataFrame(df[["name", "x", "y"]])
        jitter_radius = jitter / 2
        jitter_x = rng.uniform(-jitter_radius, jitter_radius, size=len(pl_df))
        jitter_y = rng.uniform(-jitter_radius, jitter_radius, size=len(pl_df))
        pl_df = pl_df.with_columns(
            [
                (pl.col("x") + pl.Series(jitter_x)).round(2).alias("x"),
                (pl.col("y") + pl.Series(jitter_y)).round(2).alias("y"),
            ]
        )

        df_out = pl_df.to_pandas()
        df_out["geometry"] = df_out[["x", "y"]].values.tolist()

        filename = f"{path_landscape_files}/transcript_tiles/transcripts_tile_{i}_{j}.parquet"
        df_out[["name", "geometry"]].to_parquet(filename, index=False)
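As a quick verification (a minimal optional sketch, assuming at least one tile was written), each transcript tile should hold an integer gene name and a jittered [x, y] geometry per molecule:

In [ ]:
# Inspect one transcript tile
trx_tile_files = glob(f"{path_landscape_files}/transcript_tiles/transcripts_tile_*.parquet")
print(f"{len(trx_tile_files)} transcript tiles written")
pd.read_parquet(trx_tile_files[0]).head()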
Save Landscape Parameters
In [ ]:
image_files_path = path_landscape_files + f"/pyramid_images/{image_tile_layer}_files"
max_pyramid_zoom = dega.pre.get_max_zoom_level(image_files_path)
In [ ]:
landscape_parameters = {
    "technology": technology,
    "segmentation_approach": ["default"],
    "max_pyramid_zoom": max_pyramid_zoom,
    "tile_size": tile_size,
    "image_info": [
        {
            "name": image_tile_layer,
            "button_name": image_tile_layer.upper(),
            "color": [0, 0, 255]
        }
    ],
    "image_format": ".webp",
    "use_int_index": True
}

path = Path(f"{path_landscape_files}/landscape_parameters.json")
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w") as f:
    json.dump(landscape_parameters, f, indent=2)
Viz
To use locally stored LandscapeFiles, define the base_url as shown below:
In [ ]:
# server_address = dega.viz.get_local_server()
# base_url = f"http://localhost:{server_address}/{path_landscape_files}"

# file_path = f'{path_landscape_files}/df_sig.parquet'
# df_sig = pd.read_parquet(file_path)

# top_genes = df_sig.sum(axis=1).sort_values(ascending=False).index.tolist()[:5000]
# df_sig = df_sig.loc[top_genes]
Tutorial Data Source
1. By default, this notebook fetches data from the public GitHub repository: https://github.com/broadinstitute/celldega_Visium_HD_mouse_brain.
2. To use local LandscapeFiles instead, comment out the cell below and uncomment the local base_url cell above.
In [5]:
base_url = 'https://raw.githubusercontent.com/broadinstitute/celldega_Visium_HD_mouse_brain/main/Visium_HD_mouse_brain'

file_path = f'{base_url}/df_sig.parquet'
df_sig = pd.read_parquet(file_path)

# Keep the 5,000 genes with the highest total signal across clusters
top_genes = df_sig.sum(axis=1).sort_values(ascending=False).index.tolist()[:5000]
df_sig = df_sig.loc[top_genes]
Initialize Heatmap
In [6]:
mat = dega.clust.Matrix(data=df_sig)
mat.norm(axis='row', by='zscore')
mat.cluster()
cgm = dega.viz.Clustergram(matrix=mat, width=500, height=500)
Initialize Landscape
In [7]:
landscape_visium = dega.viz.Landscape(
    technology=technology,
    base_url=base_url,
    max_tiles_to_view=max_tiles_to_view
)
/var/folders/_6/bhs42vt57t1dkb59k4sy0p440000gp/T/ipykernel_35290/2822121107.py:1: UserWarning: Transformation matrix not found at https://raw.githubusercontent.com/broadinstitute/celldega_Visium_HD_mouse_brain/main/Visium_HD_mouse_brain/micron_to_image_transform.csv. Using identity.
  landscape_visium = dega.viz.Landscape(
In [8]:
dega.viz.landscape_clustergram(landscape_visium, cgm)
Out[8]: