import polars as pl
import requests
Retrieve JUMP profiles
The JUMP Cell Painting project provides several processed datasets for morphological profiling. Choose the dataset that matches your perturbation type:
- crispr: CRISPR knockout genetic perturbations
- orf: Open Reading Frame (ORF) overexpression perturbations
- compound: Chemical compound perturbations
- all: Combined dataset containing all perturbation types (use for cross-modality comparisons)
Each dataset is available in two processing versions:
- Standard (e.g., crispr, compound, orf): Fully processed, including batch correction steps. Recommended for most analyses, as they provide better cross-dataset comparability.
- Interpretable (e.g., crispr_interpretable, compound_interpretable, orf_interpretable): Same initial processing but without the batch correction transformations that modify the original feature space. Use these when you need to interpret individual morphological features.
All datasets are stored as Parquet files on AWS S3 and can be accessed directly via their URLs.
The index file below contains the recommended profiles for each subset. Each profile includes:
- Direct links to the processing recipe and configuration used
- ETags for data integrity verification
For details on creating your own profile manifests, see the manifest guide.
INDEX_FILE = "https://raw.githubusercontent.com/jump-cellpainting/datasets/v0.11.0/manifests/profile_index.json"
We use the version-controlled manifest above to release the latest corrected profiles.
# Load the JSON manifest.
# The timeout keeps the request from hanging indefinitely on a stalled connection.
response = requests.get(INDEX_FILE, timeout=30)
# Surface HTTP errors (404, 500, ...) directly instead of letting them show up
# later as a confusing JSON decoding failure.
response.raise_for_status()
profile_index = response.json()
# Display the manifest data
for dataset in profile_index:
    print(f"- {dataset['subset']}: {dataset['url']}")
- orf: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/ORF/v1.0a/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet
- crispr: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/CRISPR/v1.0a/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected.parquet
- compound: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/COMPOUND/v1.0/profiles_var_mad_int_featselect_harmony.parquet
- orf_interpretable: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/ORF/v1.0a/profiles_wellpos_cc_var_mad_outlier.parquet
- crispr_interpretable: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/CRISPR/v1.0a/profiles_wellpos_cc_var_mad_outlier.parquet
- compound_interpretable: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/COMPOUND/v1.0/profiles_var_mad_int.parquet
- all: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/ALL/v1.0b/profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet
- all_interpretable: https://cellpainting-gallery.s3.amazonaws.com/cpg0016-jump-assembled/source_all/workspace/profiles_assembled/ALL/v1.0b/profiles_wellpos_cc_var_mad_outlier_featselect.parquet
Each profile in the manifest includes direct links to:
- recipe_permalink: The exact version of the processing code used
- config_permalink: The specific configuration file that defines the processing steps
Let’s display the key information from the manifest:
# Turn the list of manifest entries into a DataFrame so it renders as a table
profile_df = pl.DataFrame(profile_index)
# Keep the subset name and derive short labels from the long URLs/permalinks:
#   filename       - last path segment of the profile URL, without ".parquet"
#   recipe_version - trailing path segment after "tree/", cut to 7 characters
#   config         - last path segment of the config permalink, without ".json"
display_df = profile_df.select(
    "subset",
    filename=pl.col("url").str.extract(r"([^/]+)\.parquet$"),
    recipe_version=(
        pl.col("recipe_permalink").str.extract(r"tree/([^/]+)$").str.slice(0, 7)
    ),
    config=pl.col("config_permalink").str.extract(r"([^/]+)\.json$"),
)
display_df
| subset | filename | recipe_version | config | 
|---|---|---|---|
| str | str | str | str | 
| "orf" | "profiles_wellpos_cc_var_mad_ou… | "a917fa7" | "orf" | 
| "crispr" | "profiles_wellpos_cc_var_mad_ou… | "a917fa7" | "crispr" | 
| "compound" | "profiles_var_mad_int_featselec… | "a917fa7" | "compound" | 
| "orf_interpretable" | "profiles_wellpos_cc_var_mad_ou… | "a917fa7" | "orf" | 
| "crispr_interpretable" | "profiles_wellpos_cc_var_mad_ou… | "a917fa7" | "crispr" | 
| "compound_interpretable" | "profiles_var_mad_int" | "a917fa7" | "compound" | 
| "all" | "profiles_wellpos_cc_var_mad_ou… | "0224e0f" | "pipeline_2" | 
| "all_interpretable" | "profiles_wellpos_cc_var_mad_ou… | "0224e0f" | "pipeline_2" | 
Let us inspect the standard profiles.
# Map subset name -> parquet URL, keeping only the three standard profiles
# (manifest entries with any other subset name are skipped)
filepaths = {
    entry["subset"]: entry["url"]
    for entry in profile_index
    if entry["subset"] in ("crispr", "orf", "compound")
}
print("Selected profiles:")
for subset, url in filepaths.items():
    print(f"  {subset}: {url.split('/')[-1]}")
Selected profiles:
  orf: profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony.parquet
  crispr: profiles_wellpos_cc_var_mad_outlier_featselect_sphering_harmony_PCA_corrected.parquet
  compound: profiles_var_mad_int_featselect_harmony.parquet
We will lazy-load the dataframes and print the number of rows and columns.
# One list per summary column; every dataset scanned below appends one value to each
info = {
    "dataset": [],
    "#rows": [],
    "#cols": [],
    "#Metadata cols": [],
    "Size (MB)": [],
}
for name, path in filepaths.items():
    # Lazy scan: the queries below only ask for the schema and the row count
    data = pl.scan_parquet(path)
    schema = data.collect_schema()
    n_rows = data.select(pl.len()).collect().item()
    n_cols = len(schema)
    metadata_cols = [col for col in schema.names() if col.startswith("Metadata")]
    n_meta_cols = len(metadata_cols)
    # Rough estimate: 4.03 bytes per cell (presumably 4-byte floats plus a small
    # overhead - TODO confirm), scaled from bytes to megabytes
    estimated_size = int(round(4.03 * n_rows * n_cols / 1e6, 0))  # B -> MB
    info["dataset"].append(name)
    info["#rows"].append(n_rows)
    info["#cols"].append(n_cols)
    info["#Metadata cols"].append(n_meta_cols)
    info["Size (MB)"].append(estimated_size)
pl.DataFrame(info)
| dataset | #rows | #cols | #Metadata cols | Size (MB) | 
|---|---|---|---|---|
| str | i64 | i64 | i64 | i64 | 
| "orf" | 81660 | 726 | 4 | 239 | 
| "crispr" | 51185 | 263 | 4 | 54 | 
| "compound" | 803853 | 741 | 4 | 2400 | 
Let us now focus on the crispr dataset and use a regex to select the metadata columns. We will then sample rows and display the overview. Note that the collect() method forces the selected data to be loaded into memory.
# Lazily scan the crispr profiles; data is only materialized when .collect() is called
data = pl.scan_parquet(filepaths["crispr"])
data.select(pl.col("^Metadata.*$").sample(n=5, seed=1)).collect()
| Metadata_Source | Metadata_Plate | Metadata_Well | Metadata_JCP2022 | 
|---|---|---|---|
| str | str | str | str | 
| "source_13" | "CP-CC9-R2-15" | "D02" | "JCP2022_800002" | 
| "source_13" | "CP-CC9-R1-04" | "J18" | "JCP2022_800028" | 
| "source_13" | "CP-CC9-R2-04" | "J09" | "JCP2022_807421" | 
| "source_13" | "CP-CC9-R2-26" | "L14" | "JCP2022_807129" | 
| "source_13" | "CP-CC9-R6-01" | "C12" | "JCP2022_806640" | 
The following line excludes the metadata columns:
# Select every column whose name does not match the Metadata regex, draw the
# same seeded 5-row sample from each, and materialize the result
data_only = (
    data
    .select(pl.exclude("^Metadata.*$").sample(n=5, seed=1))
    .collect()
)
data_only
| X_1 | X_2 | X_3 | X_4 | X_5 | X_6 | X_7 | X_8 | X_9 | X_10 | X_11 | X_12 | X_13 | X_14 | X_15 | X_16 | X_17 | X_18 | X_19 | X_20 | X_21 | X_22 | X_23 | X_24 | X_25 | X_26 | X_27 | X_28 | X_29 | X_30 | X_31 | X_32 | X_33 | X_34 | X_35 | X_36 | X_37 | … | X_223 | X_224 | X_225 | X_226 | X_227 | X_228 | X_229 | X_230 | X_231 | X_232 | X_233 | X_234 | X_235 | X_236 | X_237 | X_238 | X_239 | X_240 | X_241 | X_242 | X_243 | X_244 | X_245 | X_246 | X_247 | X_248 | X_249 | X_250 | X_251 | X_252 | X_253 | X_254 | X_255 | X_256 | X_257 | X_258 | X_259 | 
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | … | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | f32 | 
| 0.431689 | 0.121776 | -0.288611 | 1.199042 | -0.758412 | -0.466926 | -0.777705 | -0.081231 | -0.619822 | -1.27128 | -0.373444 | 0.755662 | -0.271196 | -0.219682 | 0.268569 | -0.831324 | -0.916929 | 0.128514 | 0.202126 | -0.448374 | 0.57358 | -0.148984 | -0.451346 | -0.863105 | -0.519879 | -0.485649 | 0.067051 | -0.461362 | -0.87479 | 0.060438 | -0.86988 | -0.053304 | 0.479346 | 0.415922 | 0.55612 | 0.057157 | -0.486731 | … | 0.070464 | 0.011686 | -0.071482 | 0.047634 | -0.137811 | 0.010114 | -0.146834 | 0.028652 | 0.048453 | 0.015478 | -0.371927 | -0.318295 | -0.07663 | 0.099552 | -0.067174 | 0.324664 | 0.11507 | 0.07018 | 0.149843 | 0.090655 | -0.024452 | -0.167478 | -0.063188 | 0.10028 | -0.20603 | -0.143531 | -0.042267 | -0.103231 | 0.166172 | 0.268637 | -0.249552 | -0.125842 | -0.010658 | 0.148293 | -0.002996 | 0.018602 | 0.120415 | 
| -0.286125 | -0.139647 | 0.521229 | -0.130772 | -0.392223 | -0.478905 | -2.190718 | -0.910039 | -0.923397 | -0.89992 | 0.809614 | 0.195752 | 1.051458 | -0.586142 | 0.132069 | 0.691497 | 2.309921 | 0.451202 | 0.017881 | 0.722985 | 0.094764 | 0.458089 | 0.289687 | -0.005019 | -0.44384 | -0.292192 | -0.661437 | -0.480588 | -0.43835 | 0.392833 | 0.883042 | -0.183804 | -0.63443 | 0.088329 | 0.317562 | 0.790481 | 0.49558 | … | 0.12586 | 0.150716 | 0.092419 | 0.070398 | -0.10096 | 0.241489 | -0.02793 | -0.069464 | 0.173498 | 0.096578 | -0.006984 | -0.010409 | -0.122357 | -0.154975 | -0.264336 | -0.026424 | -0.107131 | -0.217108 | -0.076673 | -0.025199 | 0.178872 | 0.273566 | -0.011964 | -0.284162 | -0.07764 | -0.147836 | -0.030516 | 0.039593 | -0.251191 | -0.145978 | -0.061276 | 0.260967 | 0.136172 | 0.220407 | -0.016074 | 0.24593 | -0.051766 | 
| 0.044537 | 0.093762 | 0.38071 | -0.078268 | -0.332677 | -0.492756 | -0.54244 | -0.751058 | 0.28314 | 0.772951 | -0.344511 | -0.291534 | -0.64803 | 1.04816 | 0.814905 | 0.020586 | -1.699232 | -0.35928 | 0.474136 | -0.500731 | 0.16648 | 0.460551 | 0.773349 | -0.584125 | 0.070497 | 0.382738 | 1.290578 | 1.115024 | 0.656066 | -0.211548 | 0.615551 | 1.202399 | 0.61274 | 0.467623 | 0.826743 | 0.98965 | 0.515379 | … | -0.035649 | 0.084653 | -0.148614 | 0.41456 | -0.035386 | 0.039774 | 0.222122 | 0.127807 | 0.212482 | -0.087575 | 0.149949 | -0.146337 | 0.031107 | 0.048564 | -0.151519 | -0.256957 | -0.147494 | -0.051771 | 0.000703 | -0.100694 | 0.127297 | -0.159605 | 0.056752 | 0.079783 | -0.301415 | -0.033567 | -0.073402 | 0.073441 | 0.003454 | -0.065908 | 0.003793 | 0.017154 | 0.122071 | 0.031753 | -0.115469 | -0.183939 | -0.037042 | 
| 0.045477 | 0.020634 | 0.312316 | 1.316 | -0.831466 | -1.536956 | 0.495057 | -1.25451 | -0.417021 | 0.099831 | 0.010575 | 0.815467 | -0.793362 | -0.602823 | -0.470462 | -1.901034 | -0.749613 | -0.03417 | -0.349764 | -0.109558 | 0.50934 | 0.937879 | -0.567808 | -0.361403 | 0.07038 | 0.428986 | 0.178268 | -0.264072 | -1.08156 | 0.484804 | 0.257085 | -0.387199 | -0.594517 | -0.142474 | 0.364982 | 0.369385 | -0.033974 | … | 0.080806 | 0.047688 | 0.081428 | -0.072393 | -0.134251 | 0.32516 | -0.013819 | -0.231218 | 0.235347 | -0.099079 | -0.214146 | -0.088035 | 0.279149 | 0.235552 | 0.056753 | -0.002605 | -0.121467 | -0.011054 | 0.014276 | 0.031513 | 0.056525 | -0.204108 | 0.056208 | -0.007412 | 0.295334 | 0.059559 | -0.072717 | 0.143892 | -0.175082 | 0.06916 | -0.240234 | -0.243179 | 0.132553 | -0.10939 | -0.006807 | -0.081922 | -0.033631 | 
| -0.128473 | -0.163732 | 0.052351 | -3.2502 | 0.237454 | 0.327462 | 2.975345 | 1.074392 | -0.642075 | -0.309154 | -1.427569 | 0.209862 | -0.207053 | -0.785397 | -1.690689 | 0.57705 | 1.286289 | -0.260824 | -0.066723 | -0.378312 | -0.107758 | 0.58553 | 0.723803 | -0.085321 | -0.899026 | -0.508275 | 0.946614 | 0.681252 | 0.591428 | -0.058463 | -0.611216 | -0.249337 | 0.151805 | -0.201767 | -0.364704 | -0.279569 | 0.032865 | … | -0.103084 | -0.092279 | 0.061387 | -0.229078 | 0.214459 | 0.018508 | -0.164547 | 0.170245 | -0.028671 | -0.024243 | 0.116811 | 0.03172 | 0.010574 | 0.014084 | 0.15063 | -0.053592 | -0.297773 | -0.033743 | 0.264092 | -0.030906 | -0.04306 | -0.126682 | -0.050824 | -0.011592 | 0.082704 | -0.186133 | 0.172641 | -0.056459 | 0.190109 | 0.06259 | 0.093085 | -0.251115 | 0.141207 | 0.180379 | -0.006493 | -0.155394 | -0.013597 | 
Finally, we can convert this to pandas if we want to perform analyses with that tool. Keep in mind that this loads the entire dataframe into memory.
data_only.to_pandas()
|  | X_1 | X_2 | X_3 | X_4 | X_5 | X_6 | X_7 | X_8 | X_9 | X_10 | ... | X_250 | X_251 | X_252 | X_253 | X_254 | X_255 | X_256 | X_257 | X_258 | X_259 | 
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.431689 | 0.121776 | -0.288611 | 1.199042 | -0.758412 | -0.466926 | -0.777705 | -0.081231 | -0.619822 | -1.271280 | ... | -0.103231 | 0.166172 | 0.268637 | -0.249552 | -0.125842 | -0.010658 | 0.148293 | -0.002996 | 0.018602 | 0.120415 | 
| 1 | -0.286125 | -0.139647 | 0.521229 | -0.130772 | -0.392223 | -0.478905 | -2.190718 | -0.910039 | -0.923397 | -0.899920 | ... | 0.039593 | -0.251191 | -0.145978 | -0.061276 | 0.260967 | 0.136172 | 0.220407 | -0.016074 | 0.245930 | -0.051766 | 
| 2 | 0.044537 | 0.093762 | 0.380710 | -0.078268 | -0.332677 | -0.492756 | -0.542440 | -0.751058 | 0.283140 | 0.772951 | ... | 0.073441 | 0.003454 | -0.065908 | 0.003793 | 0.017154 | 0.122071 | 0.031753 | -0.115469 | -0.183939 | -0.037042 | 
| 3 | 0.045477 | 0.020634 | 0.312316 | 1.316000 | -0.831466 | -1.536956 | 0.495057 | -1.254510 | -0.417021 | 0.099831 | ... | 0.143892 | -0.175082 | 0.069160 | -0.240234 | -0.243179 | 0.132553 | -0.109390 | -0.006807 | -0.081922 | -0.033631 | 
| 4 | -0.128473 | -0.163732 | 0.052351 | -3.250200 | 0.237454 | 0.327462 | 2.975345 | 1.074392 | -0.642075 | -0.309154 | ... | -0.056459 | 0.190109 | 0.062590 | 0.093085 | -0.251115 | 0.141207 | 0.180379 | -0.006493 | -0.155394 | -0.013597 | 
5 rows × 259 columns