Visium-HD Landscape Pre-Processing
In [1]:
%load_ext autoreload
%autoreload 2
%env ANYWIDGET_HMR=1
env: ANYWIDGET_HMR=1
In [2]:
import os
import tarfile
import json
from pathlib import Path
from datetime import date
from collections import defaultdict

import numpy as np
import pandas as pd
import polars as pl
import h5py
import scanpy as sc
import celldega as dega
import tifffile
import fiona
import geopandas as gpd
from glob import glob
from matplotlib import pyplot as plt
from matplotlib.colors import to_hex
from shapely.geometry import Polygon, Point, shape
from scipy.sparse import coo_matrix, csr_matrix, issparse
from ipywidgets import Widget

print(f"Celldega Version: {dega.__version__}")
Celldega Version: 0.13.0
In [3]:
today = date.today()
In [ ]:
technology = "Visium-HD"
path_data = 'data'
dataset_name = 'dataset-name'
bin_size = 2
jitter = 2
image_file_name = 'image-name'
path_landscape_files = f'data/landscape_files/{dataset_name}_{today}_{bin_size}um'
image_tile_layer = 'h&e'
image_scale = 1.0
suffix = '.webp[Q=100]'
tile_size = 500
max_tiles_to_view = 5
Make Landscape Files directory
In [ ]:
os.makedirs(path_landscape_files, exist_ok=True)
Decompress all .tar.gz files in raw data
In [ ]:
def decompress_all_tar_gz(path):
    for file in os.listdir(path):
        if file.endswith(".tar.gz"):
            file_path = os.path.join(path, file)
            print(f"Decompressing: {file}")
            with tarfile.open(file_path, "r:gz") as tar:
                tar.extractall(path)
    print("All files decompressed.")

decompress_all_tar_gz(f"{path_data}/{dataset_name}")
Process Image
In [ ]:
# Path to the OME-TIFF file
file_path = f"{path_data}/{dataset_name}/{image_file_name}"

# Open the OME-TIFF file and read the full-resolution image data
with tifffile.TiffFile(file_path) as tif:
    series = tif.series[0]
    image_data = series.asarray()

# Save the image data to a regular TIFF file without compression
tifffile.imwrite(path_landscape_files + '/output_regular.tif', image_data, compression=None)

# Convert to PNG and build the DeepZoom pyramid of image tiles
image_png = dega.pre._convert_to_png(path_landscape_files + '/output_regular.tif')
dega.pre.make_deepzoom_pyramid(image_png,
                               path_landscape_files + '/pyramid_images/',
                               image_tile_layer, tile_size=tile_size, suffix=suffix)
In [ ]:
# Image bounds in pixel space; numpy image arrays are indexed (row=y, col=x)
tile_bounds = {}
tile_bounds["x_min"] = 0
tile_bounds["x_max"] = image_data.shape[1]
tile_bounds["y_min"] = 0
tile_bounds["y_max"] = image_data.shape[0]
Process Cells
Note: Cells are in pixel (image) space and are already aligned with the microscope image
In [ ]:
features = []
with fiona.open(f"{path_data}/{dataset_name}/segmented_outputs/cell_segmentations.geojson") as src:
    for feat in src:
        features.append({"geometry": shape(feat["geometry"]),
                         "cell_id": feat["properties"]['cell_id']})

cells = gpd.GeoDataFrame(features)
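Since the segmentations should already sit in pixel space, a quick bounds check against the image dimensions can confirm the alignment. This is a minimal optional sketch, assuming the `cells` and `image_data` objects created above:

In [ ]:
# Sanity check: cell geometries should fall within the image extent
minx, miny, maxx, maxy = cells.total_bounds
height, width = image_data.shape[0], image_data.shape[1]
assert minx >= 0 and maxx <= width, "cell x-coordinates exceed image width"
assert miny >= 0 and maxy <= height, "cell y-coordinates exceed image height"
print(f"Cell bounds fit within the {width} x {height} image")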
In [ ]:
cells.head()
In [ ]:
cells['centroid_x'] = cells.geometry.centroid.x
cells['centroid_y'] = cells.geometry.centroid.y
cells["geometry_point"] = cells.apply(lambda row: Point(row["centroid_x"], row["centroid_y"]), axis=1)

# Prefix numeric cell IDs with "c-" (e.g. 1 -> "c-1")
cells["cell_id"] = cells["cell_id"].astype(str).map(lambda x: f"c-{x}")
cells.head()
In [ ]:
# Build the cell metadata table: cell name plus rounded centroid coordinates
cell_metadata = pd.DataFrame({
    "name": cells["cell_id"],
    "geometry": cells["geometry_point"].apply(lambda p: [round(p.x, 2), round(p.y, 2)]),
})
cell_metadata.to_parquet(path_landscape_files + '/cell_metadata.parquet')
cell_metadata.head()
In [ ]:
output_dir = Path(path_landscape_files + '/cell_clusters')
output_dir.mkdir(parents=True, exist_ok=True)
Saving dummy clusters...
In [ ]:
# def_clusters = pd.DataFrame(index=cells.index.tolist())
# def_clusters['cluster'] = pd.Series(0, index=cells.index.tolist())
In [ ]:
# def_clusters.to_parquet(path_landscape_files + '/cell_clusters/cluster.parquet')
In [ ]:
# meta_cluster = pd.DataFrame()
# meta_cluster.loc['0', 'color'] = '#ff7f0e'
# meta_cluster.loc['0', 'count'] = 1000
# meta_cluster.to_parquet(path_landscape_files + '/cell_clusters/meta_cluster.parquet')
Saving default clusters...
In [ ]:
def_clusters = pd.read_csv(
    f"{path_data}/{dataset_name}/segmented_outputs/analysis/clustering/gene_expression_graphclust/clusters.csv"
)

# Convert barcodes like "cellid_000001-1" to the "c-1" naming used above
def_clusters.index = def_clusters['Barcode'].str.extract(r"cellid_0*(\d+)-")[0] \
    .astype(int).astype(str) \
    .map(lambda x: f"c-{x}")
def_clusters.drop("Barcode", axis=1, inplace=True)
def_clusters.rename(columns={'Cluster': "cluster"}, inplace=True)
def_clusters['cluster'] = def_clusters['cluster'].astype(str)
def_clusters.index.name = ""

# Align clusters to the full set of cells; cells missing an assignment fall into cluster "0"
cells_copy = cells.copy()
cells_copy.set_index("cell_id", inplace=True)
missing = cells_copy.index.difference(def_clusters.index)
print(f"{len(missing)} cells without a cluster assignment; assigning them to cluster '0'")
def_clusters = def_clusters.reindex(cells_copy.index, fill_value=0)
def_clusters['cluster'] = def_clusters['cluster'].astype(str)

def_clusters.to_parquet(path_landscape_files + '/cell_clusters/cluster.parquet')
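As a quick sanity check (a minimal sketch using the objects above), the saved assignments should now cover every segmented cell, with unclustered cells collected under "0":

In [ ]:
# Every cell should have exactly one string-valued cluster label
assert def_clusters.index.equals(cells_copy.index)
def_clusters['cluster'].value_counts()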
In [ ]:
meta_cluster = def_clusters["cluster"].value_counts().to_frame(name="count")

# Assign each cluster a color from the tab10 palette, cycling if needed
cmap = plt.get_cmap("tab10")
n_colors = cmap.N
meta_cluster["color"] = [
    to_hex(cmap(i % n_colors)) for i in range(len(meta_cluster))
]

meta_cluster = meta_cluster[["color", "count"]]
meta_cluster.index = meta_cluster.index.astype("string")
meta_cluster.index.name = "__index_level_0__"
meta_cluster.to_parquet(path_landscape_files + '/cell_clusters/meta_cluster.parquet')
meta_cluster.head()
Create cell tiles
In [ ]:
def simple_format(geometry, image_scale):
    # Divide each coordinate by image_scale to factor in scaling
    return [[[coord[0] / image_scale, coord[1] / image_scale] for coord in polygon] for polygon in geometry]
In [ ]:
def transform_polygon(polygon):
    # Represent the polygon's exterior ring as an array of [x, y] coordinate pairs
    exterior_coords = polygon.exterior.coords
    original_format_coords = np.array([np.array(coord) for coord in exterior_coords])
    return np.array([original_format_coords], dtype=object)
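To illustrate the nested-list format these two helpers produce, here is a sketch on an invented square polygon; with image_scale = 1.0 the five ring coordinates (including the closing coordinate) pass through unchanged:

In [ ]:
# Hypothetical example: a square polygon becomes [[[x, y], ...]], one list per ring
square = Polygon([(0, 0), (10, 0), (10, 10), (0, 10)])
nested = simple_format(transform_polygon(square), image_scale)
print(nested)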
In [ ]:
cells["GEOMETRY"] = cells["geometry"].apply(lambda poly: transform_polygon(poly))
cells["GEOMETRY"] = cells["GEOMETRY"].apply(lambda x: simple_format(x, image_scale))
In [ ]:
cells.rename(columns={"cell_id": "name",
                      "centroid_x": "center_x",
                      "centroid_y": "center_y"}, inplace=True)
cells.drop(["geometry", "geometry_point"], axis=1, inplace=True)
In [ ]:
cells.head()
In [ ]:
tile_size_x = tile_size
tile_size_y = tile_size

x_min = tile_bounds["x_min"]
x_max = tile_bounds["x_max"]
y_min = tile_bounds["y_min"]
y_max = tile_bounds["y_max"]
Calculate the number of tiles needed
In [ ]:
n_tiles_x = int(np.ceil((x_max - x_min) / tile_size_x))
n_tiles_y = int(np.ceil((y_max - y_min) / tile_size_y))
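For example, with the default tile_size of 500 pixels, a hypothetical 20,000 × 18,000 pixel image would yield n_tiles_x = ceil(20000 / 500) = 40 and n_tiles_y = ceil(18000 / 500) = 36, i.e. 1,440 candidate tiles.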
In [ ]:
path_output = path_landscape_files + '/cell_segmentation'
os.makedirs(path_output, exist_ok=True)
In [ ]:
for i in range(n_tiles_x):
    if i % 2 == 0:
        print('row', i)

    for j in range(n_tiles_y):
        tile_x_min = x_min + i * tile_size_x
        tile_x_max = tile_x_min + tile_size_x
        tile_y_min = y_min + j * tile_size_y
        tile_y_max = tile_y_min + tile_size_y

        # Find cell polygons with centroids in this tile
        keep_cells = cells[
            (cells.center_x >= tile_x_min)
            & (cells.center_x < tile_x_max)
            & (cells.center_y >= tile_y_min)
            & (cells.center_y < tile_y_max)
        ].index.tolist()

        # Keep each cell's polygon together with its "c-N" name so the tiles
        # link back to cell_metadata and the cluster assignments
        inst_geo = cells.loc[keep_cells, ["GEOMETRY", "name"]]

        # Save non-empty tiles to Parquet
        filename = f"{path_output}/cell_tile_{i}_{j}.parquet"
        if inst_geo.shape[0] > 0:
            inst_geo[["GEOMETRY", "name"]].to_parquet(filename)
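A quick spot check (a minimal optional sketch, assuming at least one tile was written) confirms the tiles contain the expected GEOMETRY and name columns:

In [ ]:
# Load one saved cell tile and inspect it
cell_tile_files = glob(f"{path_output}/cell_tile_*.parquet")
print(f"{len(cell_tile_files)} cell tiles written")
pd.read_parquet(cell_tile_files[0]).head()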
Process Meta Gene
In [ ]:
adata_cell = sc.read_10x_h5(f"{path_data}/{dataset_name}/segmented_outputs/filtered_feature_cell_matrix.h5")
In [ ]:
list_genes = adata_cell.var.index.tolist()
meta_gene = pd.DataFrame(index=list_genes)

# Build a flat list of hex colors from all "tab" palettes, cycling as needed;
# blank probes are colored white
palettes = [plt.get_cmap(name).colors for name in plt.colormaps() if "tab" in name]
flat_colors = [color for palette in palettes for color in palette]
flat_colors_hex = [to_hex(color) for color in flat_colors]

colors = [
    flat_colors_hex[i % len(flat_colors_hex)] if "Blank" not in gene else "#FFFFFF"
    for i, gene in enumerate(list_genes)
]
ser_color = pd.Series(colors, index=list_genes)
In [ ]:
# Placeholder statistics (recomputed from the CBG matrix by dega.pre.make_meta_gene below)
meta_gene['mean'] = pd.Series(100, index=list_genes)
meta_gene['std'] = pd.Series(10, index=list_genes)
meta_gene['max'] = pd.Series(100, index=list_genes)
meta_gene['non-zero'] = pd.Series(0.5, index=list_genes)
meta_gene['color'] = ser_color
meta_gene.to_parquet(path_landscape_files + '/meta_gene.parquet')
Cell-by-gene (CBG)
In [ ]:
cbg_df = dega.pre.read_cbg_mtx(f"{path_data}/{dataset_name}/segmented_outputs/filtered_feature_cell_matrix",
                               barcodes_name='barcodes', features_name='features', technology=technology)
In [ ]:
# Rename barcodes to the "c-N" scheme and replace "/" in gene names
cbg_df.index = cbg_df.index.str.extract(r"cellid_0*(\d+)-")[0] \
    .astype(int).astype(str) \
    .map(lambda x: f"c-{x}")
cbg_df.columns = cbg_df.columns.str.replace("/", "_", regex=False)
cbg_df.head()
In [ ]:
def make_column_names_unique_fast(df):
    # Give duplicate column names numeric suffixes (gene, gene_1, gene_2, ...)
    counts = defaultdict(int)
    used = set()
    new_cols = []
    for col in df.columns:
        if col not in used:
            new_cols.append(col)
            used.add(col)
            counts[col] += 1
        else:
            while True:
                new_name = f"{col}_{counts[col]}"
                counts[col] += 1
                if new_name not in used:
                    new_cols.append(new_name)
                    used.add(new_name)
                    break
    df.columns = new_cols
    df = df.rename_axis("__index_level_0__", axis="columns")
    return df
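For illustration, a minimal sketch of how the renaming behaves on a toy DataFrame (the gene symbols here are invented):

In [ ]:
# Duplicate column names receive numeric suffixes; first occurrences are unchanged
demo = pd.DataFrame(columns=["ACTB", "GAPDH", "ACTB", "ACTB"])
demo = make_column_names_unique_fast(demo)
print(demo.columns.tolist())  # ['ACTB', 'GAPDH', 'ACTB_1', 'ACTB_2']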
In [ ]:
cbg_df = make_column_names_unique_fast(cbg_df)
In [ ]:
cbg_df.head()
Signatures
In [ ]:
list_ser = []
clusters = def_clusters["cluster"].unique().tolist()

for inst_cat in clusters:
    if inst_cat is not None:
        inst_cells = def_clusters[def_clusters["cluster"] == inst_cat].index.tolist()

        # Mean expression over the cluster's cells present in the CBG matrix
        if set(inst_cells) & set(cbg_df.index):
            common_cells = list(set(inst_cells) & set(cbg_df.index))
            inst_ser = cbg_df.loc[common_cells].sum() / len(common_cells)
        else:
            genes = cbg_df.columns
            inst_ser = pd.Series(0.0, index=genes)

        inst_ser.name = inst_cat
        list_ser.append(inst_ser)
In [ ]:
# Combine the signatures into a DataFrame
df_sig = pd.concat(list_ser, axis=1)

# Handle potential MultiIndex issues
df_sig.columns = df_sig.columns.tolist()
df_sig.index = df_sig.index.tolist()

# Filter out control probes and unassigned features
keep_genes = df_sig.index.tolist()
keep_genes = [x for x in keep_genes if "Unassigned" not in x]
keep_genes = [x for x in keep_genes if "NegControl" not in x]
keep_genes = [x for x in keep_genes if "DeprecatedCodeword" not in x]

# Subset the DataFrame to the remaining genes and all clusters
df_sig = df_sig.loc[keep_genes, clusters]

# Drop columns that are entirely NaN, then sort rows and columns
df_sig = df_sig.dropna(axis=1, how="all")
df_sig = df_sig.loc[sorted(df_sig.index), sorted(df_sig.columns)]

# Convert only sparse columns to dense
for col in df_sig.columns:
    if isinstance(df_sig[col].dtype, pd.SparseDtype):
        df_sig[col] = df_sig[col].sparse.to_dense()

# Drop all-zero genes, zero-variance genes, and rows with non-finite values
df_sig = df_sig[(df_sig != 0).any(axis=1)]
df_sig = df_sig.loc[df_sig.std(axis=1) != 0]
df_sig = df_sig.replace([np.inf, -np.inf], np.nan).dropna()

# Save
df_sig.to_parquet(f"{path_landscape_files}/df_sig.parquet")
df_sig.head()
Create CBG parquet files
In [ ]:
dega.pre.make_meta_gene(cbg_df, path_landscape_files + '/meta_gene.parquet')
In [ ]:
dega.pre.save_cbg_gene_parquets(technology, path_landscape_files, cbg_df, verbose=True)
Create jittered transcripts
Visium-HD reports binned counts rather than individual molecule positions, so each bin's counts are expanded into single points and given a small random offset (up to ±jitter/2) so overlapping molecules render as distinct transcripts.
In [ ]:
tissue_positions = pd.read_parquet(f"{path_data}/{dataset_name}/binned_outputs/square_00{bin_size}um/spatial/tissue_positions.parquet")
spots = tissue_positions[tissue_positions['in_tissue'] == 1].copy()
spots.plot(kind='scatter', x='pxl_col_in_fullres', y='pxl_row_in_fullres')
In [ ]:
spots.head()
In [ ]:
spots.set_index('barcode', inplace=True)
spots.drop(['array_row', 'array_col'], axis=1, inplace=True)
spots.rename(columns={'pxl_row_in_fullres': "y",
                      'pxl_col_in_fullres': "x"}, inplace=True)
spots.head()
In [ ]:
gene_str_to_int = dega.pre.boundary_tile._get_name_mapping(
    path_landscape_files,
    layer='transcript'
)
In [ ]:
# Read the spot-by-gene matrix for the selected bin size
sbg = dega.pre.read_cbg_mtx(
    f"{path_data}/{dataset_name}/binned_outputs/square_00{bin_size}um/filtered_feature_bc_matrix",
    technology=technology,
    barcodes_name='barcodes'
)
sbg = make_column_names_unique_fast(sbg)
sbg.head()
In [ ]:
total_trx = sbg.sum(axis=0).sum()
print(f"Total transcripts: {total_trx/1e6:.1f}M")
In [ ]:
rng = np.random.default_rng()

out_dir = Path(path_landscape_files + '/transcript_tiles')
out_dir.mkdir(parents=True, exist_ok=True)

for i in range(n_tiles_x):
    if i % 10 == 0:
        print("row", i)

    for j in range(n_tiles_y):
        tile_x_min = tile_bounds["x_min"] + i * tile_size
        tile_x_max = tile_x_min + tile_size
        tile_y_min = tile_bounds["y_min"] + j * tile_size
        tile_y_max = tile_y_min + tile_size

        # Filter spot coordinates in this tile
        tile_spots = spots[
            (spots.x >= tile_x_min)
            & (spots.x < tile_x_max)
            & (spots.y >= tile_y_min)
            & (spots.y < tile_y_max)
        ]
        if tile_spots.empty:
            continue

        inst_spots = tile_spots.index.tolist()

        # Subset the spot-by-gene matrix and convert to SciPy COO sparse format
        tile_sbg = sbg.loc[inst_spots]
        coo = csr_matrix(tile_sbg.values).tocoo()
        if coo.nnz == 0:
            continue

        row = np.array([inst_spots[r] for r in coo.row])
        col = tile_sbg.columns.to_numpy()[coo.col]
        count = coo.data

        # Create a long-format table, repeating each (spot, gene) pair by its count
        df = pd.DataFrame({"spot": row, "gene": col, "count": count})
        df = df[df["count"] > 0]
        df = df.loc[df.index.repeat(df["count"].astype(int))].reset_index(drop=True)

        # Map spot → x, y and gene → integer index
        df["x"] = df["spot"].map(tile_spots["x"])
        df["y"] = df["spot"].map(tile_spots["y"])
        df["name"] = df["gene"].map(gene_str_to_int).astype("int32")

        # Jitter each transcript by up to ±jitter/2 so stacked counts render as distinct points
        pl_df = pl.DataFrame(df[["name", "x", "y"]])
        jitter_radius = jitter / 2
        jitter_x = rng.uniform(-jitter_radius, jitter_radius, size=len(pl_df))
        jitter_y = rng.uniform(-jitter_radius, jitter_radius, size=len(pl_df))
        pl_df = pl_df.with_columns(
            [
                (pl.col("x") + pl.Series(jitter_x)).round(2).alias("x"),
                (pl.col("y") + pl.Series(jitter_y)).round(2).alias("y"),
            ]
        )

        df_out = pl_df.to_pandas()
        df_out["geometry"] = df_out[["x", "y"]].values.tolist()

        filename = f"{path_landscape_files}/transcript_tiles/transcripts_tile_{i}_{j}.parquet"
        df_out[["name", "geometry"]].to_parquet(filename, index=False)
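As a quick verification (a minimal optional sketch, assuming at least one tile was written), each transcript tile should hold an integer gene name and a jittered [x, y] geometry per molecule:

In [ ]:
# Inspect one transcript tile
trx_tile_files = glob(f"{path_landscape_files}/transcript_tiles/transcripts_tile_*.parquet")
print(f"{len(trx_tile_files)} transcript tiles written")
pd.read_parquet(trx_tile_files[0]).head()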
Save Landscape Parameters
In [ ]:
image_files_path = path_landscape_files + f"/pyramid_images/{image_tile_layer}_files"
max_pyramid_zoom = dega.pre.get_max_zoom_level(image_files_path)
In [ ]:
landscape_parameters = {
    "technology": technology,
    "segmentation_approach": ["default"],
    "max_pyramid_zoom": max_pyramid_zoom,
    "tile_size": tile_size,
    "image_info": [
        {
            "name": image_tile_layer,
            "button_name": image_tile_layer.upper(),
            "color": [0, 0, 255]
        }
    ],
    "image_format": ".webp",
    "use_int_index": True
}

path = Path(f"{path_landscape_files}/landscape_parameters.json")
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("w") as f:
    json.dump(landscape_parameters, f, indent=2)
Viz
To use locally stored LandscapeFiles, define the base_url as shown below:
In [ ]:
# server_address = dega.viz.get_local_server()
# base_url = f"http://localhost:{server_address}/{path_landscape_files}"

# file_path = f'{path_landscape_files}/df_sig.parquet'
# df_sig = pd.read_parquet(file_path)

# top_genes = df_sig.sum(axis=1).sort_values(ascending=False).index.tolist()[:5000]
# df_sig = df_sig.loc[top_genes]
Tutorial Data Source
1. By default, this notebook fetches data from the public GitHub repository: https://github.com/broadinstitute/celldega_Visium_HD_mouse_brain.
2. To use local LandscapeFiles instead, comment out the cell below and uncomment the local base_url cell above.
In [5]:
base_url = 'https://raw.githubusercontent.com/broadinstitute/celldega_Visium_HD_mouse_brain/main/Visium_HD_mouse_brain'

file_path = f'{base_url}/df_sig.parquet'
df_sig = pd.read_parquet(file_path)

# Keep the 5,000 genes with the highest total signal across clusters
top_genes = df_sig.sum(axis=1).sort_values(ascending=False).index.tolist()[:5000]
df_sig = df_sig.loc[top_genes]
Initialize Heatmap
In [6]:
mat = dega.clust.Matrix(data=df_sig)
mat.norm(axis='row', by='zscore')
mat.cluster()
cgm = dega.viz.Clustergram(matrix=mat, width=500, height=500)
Initialize Landscape
In [7]:
landscape_visium = dega.viz.Landscape(
    technology=technology,
    base_url=base_url,
    max_tiles_to_view=max_tiles_to_view
)
/var/folders/_6/bhs42vt57t1dkb59k4sy0p440000gp/T/ipykernel_35290/2822121107.py:1: UserWarning: Transformation matrix not found at https://raw.githubusercontent.com/broadinstitute/celldega_Visium_HD_mouse_brain/main/Visium_HD_mouse_brain/micron_to_image_transform.csv. Using identity.
  landscape_visium = dega.viz.Landscape(
In [8]:
dega.viz.landscape_clustergram(landscape_visium, cgm)
Out[8]: