Source code for gnomad_toolbox.load_data

"""Functions to import gnomAD data."""

from typing import Callable, Optional, Union

import gnomad.resources.grch37 as grch37_res
import gnomad.resources.grch38 as grch38_res
import hail as hl
from gnomad.resources.resource_utils import VersionedTableResource

# Data types used by the gnomAD variant-level datasets in the tables below.
DATA_TYPES = ["exomes", "genomes", "joint"]
# Data types used by the pext dataset (see PEXT_DATA).
PEXT_DATA_TYPES = ["base_level", "annotation_level"]
# Maps a reference genome build name to the matching `gnomad.resources` module.
# `_get_dataset` looks up the `gnomad` and `reference_data` attributes on the
# selected module to find the requested resource.
RESOURCES_BY_BUILD = {
    "GRCh37": grch37_res,
    "GRCh38": grch38_res,
}
# In the toolbox, for each dataset, we only support loading the most recent versions
# for each reference genome build (GRCh37 and GRCh38) and data type (exomes, genomes,
# joint). Older versions are not supported because they are less complete datasets, and
# some may have errors that have been fixed in newer versions. If other versions are
# actually needed, they can be loaded using `gnomad_methods` or directly loading the
# Table with hl.read_table.
#
# Each VARIANT_DATA entry is keyed by gnomAD variant release and records its reference
# genome, the data types available for it, and ("dataset_versions") the version of
# every other dataset that is compatible with that release. Note that the "coverage"
# entry is itself a dictionary keyed by data type rather than a single version string.
VARIANT_DATA = {
    "2.1.1": {
        "reference_genome": "GRCh37",
        "data_types": ["exomes", "genomes"],
        "dataset_versions": {
            "vep": "85",
            "gencode": "v19",
            "coverage": {
                "exomes": "2.1",
                "genomes": "2.1",
            },
            "constraint": "2.1.1",
            "pext": "v7",
            "liftover": "2.1.1",
        },
    },
    "4.1": {
        "reference_genome": "GRCh38",
        "data_types": ["exomes", "genomes", "joint"],
        "dataset_versions": {
            "vep": "105",
            "gencode": "v39",
            "coverage": {"exomes": "4.0", "genomes": "3.0.1"},
            "all_sites_an": "4.1",
            "constraint": "4.1",
            "pext": "v10",
            "browser": "4.1",
        },
    },
}
# Supported coverage versions, with the reference genome and data types of each.
COVERAGE_DATA = {
    "2.1": {"reference_genome": "GRCh37", "data_types": ["exomes", "genomes"]},
    "3.0.1": {"reference_genome": "GRCh38", "data_types": ["genomes"]},
    "4.0": {
        "reference_genome": "GRCh38",
        "data_types": ["exomes"],
    },
}
# Supported all-sites allele number (AN) versions.
ALL_SITES_AN_DATA = {
    "4.1": {
        "reference_genome": "GRCh38",
        "data_types": ["exomes", "genomes"],
    }
}
# Supported constraint versions. These entries have no "data_types" key, so
# `_get_dataset` performs no data type validation for them.
# NOTE(review): "exome_coverage_field", "exome_coverage_cutoff" and "af_cutoff" are not
# read anywhere in this part of the file; presumably they are consumed by
# constraint-related helpers elsewhere -- confirm.
CONSTRAINT_DATA = {
    "2.1.1": {
        "reference_genome": "GRCh37",
        "exome_coverage_field": "median",
        "exome_coverage_cutoff": 30,
        "af_cutoff": 0.001,
    },
    "4.1": {
        "reference_genome": "GRCh38",
        "exome_coverage_field": "median_approx",
        "exome_coverage_cutoff": 30,
        "af_cutoff": 0.001,
    },
}
# Supported liftover versions (GRCh37 only).
LIFTOVER_DATA = {
    "2.1.1": {
        "reference_genome": "GRCh37",
        "data_types": ["exomes", "genomes"],
    }
}
# Supported pext versions; their data types are PEXT_DATA_TYPES rather than DATA_TYPES.
PEXT_DATA = {
    "v7": {"reference_genome": "GRCh37", "data_types": PEXT_DATA_TYPES},
    "v10": {"reference_genome": "GRCh38", "data_types": PEXT_DATA_TYPES},
}
# Supported browser versions. There is no "data_types" key, so no data type is
# validated for this dataset.
BROWSER_DATA = {"4.1": {"reference_genome": "GRCh38"}}
# gnomAD datasets that can be requested by name. "resource" is the attribute name that
# `_get_dataset` looks up on the build's resource module, and "versions" is the table
# of supported versions for that dataset.
SUPPORTED_DATASETS = {
    "variant": {"resource": "public_release", "versions": VARIANT_DATA},
    "coverage": {"resource": "coverage", "versions": COVERAGE_DATA},
    "all_sites_an": {"resource": "all_sites_an", "versions": ALL_SITES_AN_DATA},
    "constraint": {"resource": "constraint", "versions": CONSTRAINT_DATA},
    "liftover": {"resource": "liftover", "versions": LIFTOVER_DATA},
    "pext": {"resource": "pext", "versions": PEXT_DATA},
    "browser": {"resource": "browser_variant", "versions": BROWSER_DATA},
}
# Reference (non-gnomAD) datasets that can be requested by name, using the same
# "resource"/"versions" layout as SUPPORTED_DATASETS.
SUPPORTED_REFERENCE_DATA = {
    "vep": {
        "resource": "vep_context",
        "versions": {
            "85": {"reference_genome": "GRCh37"},
            "105": {"reference_genome": "GRCh38"},
        },
    },
    "gencode": {
        "resource": "gencode",
        "versions": {
            "v19": {"reference_genome": "GRCh37"},
            "v39": {"reference_genome": "GRCh38"},
        },
    },
}


class GnomADSession:
    """Track the default data type and version used during a gnomAD session."""

    def __init__(self) -> None:
        """
        Create a session with the toolbox defaults.

        The session starts on the "exomes" data type and version "4.1". Both values
        are passed through `set_default_data` so that they are validated and the
        derived attributes (`reference_genome`, `compatible_datasets`) are populated.

        :return: None.
        """
        self.data_type = "exomes"
        self.version = "4.1"
        self.set_default_data()

    def set_default_data(
        self,
        data_type: Optional[str] = None,
        version: Optional[str] = None,
    ) -> None:
        """
        Update the session's default data type and/or version.

        Any argument left unset (or falsy) keeps the session's current value. The
        session is only modified after both values have been validated.

        :param data_type: Data type (exomes, genomes, or joint).
        :param version: gnomAD version.
        :return: None.
        :raises ValueError: If the version is not supported by the toolbox, or the
            data type is not available for that version.
        """
        requested_type = data_type if data_type else self.data_type
        requested_version = version if version else self.version

        # The version must be one of the supported variant releases.
        release = VARIANT_DATA.get(requested_version)
        if release is None:
            raise ValueError(
                f"Version {requested_version} is not a supported gnomAD version in the"
                " Toolbox."
            )

        # The data type must exist for that release.
        if requested_type not in release["data_types"]:
            raise ValueError(
                f"Version {requested_version} for {requested_type} is not available."
            )

        # Everything checked out: commit the new defaults and the values derived
        # from the selected release.
        self.data_type = requested_type
        self.version = requested_version
        self.reference_genome = release["reference_genome"]
        self.compatible_datasets = release["dataset_versions"]
# Global gnomAD session object holding the default data type and version.
gnomad_session = GnomADSession()


def _get_dataset(
    ht: Optional[hl.Table] = None,
    dataset: str = "variant",
    data_type: Optional[str] = None,
    version: Optional[str] = None,
) -> hl.Table:
    """
    Get gnomAD HT using a Hail Table, specific parameters, or session defaults.

    :param ht: Pre-loaded Hail Table. If provided, other parameters are ignored.
    :param dataset: Dataset type. One of the keys of `SUPPORTED_DATASETS` ("variant",
        "coverage", "all_sites_an", "constraint", "liftover", "pext", "browser") or
        `SUPPORTED_REFERENCE_DATA` ("vep", "gencode"). Default is "variant".
    :param data_type: Data type (exomes, genomes, or joint; base_level or
        annotation_level for "pext"). Default is the session value.
    :param version: Dataset version. Default is the session version for "variant",
        otherwise the version of `dataset` that is compatible with the session
        version (and, for datasets versioned per data type, with the data type).
    :return: Hail Table for requested dataset, data type, and version.
    :raises ValueError: If the dataset, version, or data type is not supported, or if
        no version of the dataset is compatible with the session defaults.
    """
    # If a pre-loaded Hail Table is provided, return it directly.
    if ht is not None:
        return ht

    # Validate dataset.
    dataset_info = SUPPORTED_DATASETS.get(dataset) or SUPPORTED_REFERENCE_DATA.get(
        dataset
    )
    if dataset_info is None:
        raise ValueError(
            f"{dataset} is invalid. Choose from:\n"
            f"\tgnomAD datasets:{list(SUPPORTED_DATASETS.keys())}\n"
            f"\treference datasets: {list(SUPPORTED_REFERENCE_DATA.keys())}"
        )

    # If version is not provided, use the session information.
    if not version:
        if dataset == "variant":
            version = gnomad_session.version
        else:
            compatible = gnomad_session.compatible_datasets
            # Not every dataset exists for every variant release (e.g. liftover is
            # GRCh37-only), so report that clearly instead of leaking a KeyError.
            if dataset not in compatible:
                raise ValueError(
                    f"{dataset} has no version compatible with gnomAD version "
                    f"{gnomad_session.version}. Specify a version explicitly or "
                    f"change the session defaults. Compatible datasets: "
                    f"{list(compatible.keys())}"
                )
            compatible_version = compatible[dataset]

            # Some datasets (coverage) are versioned per data type, in which case the
            # compatible version is a dictionary keyed by data type. It has to be
            # resolved to a single version string before it can be looked up below.
            if isinstance(compatible_version, dict):
                data_type = data_type or gnomad_session.data_type
                if data_type not in compatible_version:
                    raise ValueError(
                        f"{dataset} has no version compatible with gnomAD version "
                        f"{gnomad_session.version} for {data_type}. Available data "
                        f"types: {list(compatible_version.keys())}."
                    )
                compatible_version = compatible_version[data_type]

            version = compatible_version

    # Validate version.
    versions = dataset_info["versions"]
    version_info = versions.get(version)
    if version_info is None:
        version_format = ", ".join(
            f"{v} ({versions[v]['reference_genome']})" for v in versions
        )
        raise ValueError(
            f"Version {version} is not in the supported versions for {dataset}. "
            f"Supported versions: {version_format}"
        )

    # Validate data type. Datasets without a "data_types" entry are not split by data
    # type, so there is nothing to validate for them.
    data_types = version_info.get("data_types")
    if data_types:
        data_type = data_type or gnomad_session.data_type
        if data_type and data_type not in data_types:
            raise ValueError(
                f"Version {version} is not available for {data_type} in the {dataset} "
                f"dataset. Available data types: {data_types}."
            )

    # Get the resource for the given build. gnomAD resources are tried first, falling
    # back to the reference data resources.
    build = version_info["reference_genome"]
    res_build = RESOURCES_BY_BUILD[build]
    res_name = dataset_info["resource"]
    res = getattr(res_build.gnomad, res_name, None) or getattr(
        res_build.reference_data, res_name
    )

    # Some resources are exposed as functions that build the resource, optionally for
    # a data type.
    if callable(res):
        res = res(data_type) if data_type else res()

    # Versioned resources need the specific version selected before loading.
    if isinstance(res, VersionedTableResource):
        res = res.versions[version]

    return res.ht()
def get_gnomad_release(
    dataset: str = "variant",
    data_type: Optional[str] = None,
    version: Optional[str] = None,
) -> hl.Table:
    """
    Get gnomAD HT by dataset, data type, and version.

    Not all combinations of dataset, data type, and version are available and/or
    supported by the toolbox. The table below shows what is supported.

    .. table:: Available versions for each dataset and data type are (as of 2025-1-13)
        :widths: auto

        +--------------+--------------+---------+------------------------------+
        | Genome Build | Dataset      | Version | Data Types                   |
        +==============+==============+=========+==============================+
        | GRCh37       | variant      | 2.1.1   | exomes, genomes              |
        |              +--------------+---------+------------------------------+
        |              | coverage     | 2.1     | exomes, genomes              |
        |              +--------------+---------+------------------------------+
        |              | constraint   | 2.1.1   | N/A                          |
        |              +--------------+---------+------------------------------+
        |              | pext         | v7      | base_level, annotation_level |
        |              +--------------+---------+------------------------------+
        |              | liftover     | 2.1.1   | exomes, genomes              |
        +--------------+--------------+---------+------------------------------+
        | GRCh38       | variant      | 4.1     | exomes, genomes, joint       |
        |              +--------------+---------+------------------------------+
        |              | all_sites_an | 4.1     | exomes, genomes              |
        |              +--------------+---------+------------------------------+
        |              | browser      | 4.1     | N/A (joint, but doesn't need |
        |              |              |         | to be specified)             |
        |              +--------------+---------+------------------------------+
        |              | coverage     | 4.0     | exomes                       |
        |              +--------------+---------+------------------------------+
        |              | coverage     | 3.0.1   | genomes                      |
        |              +--------------+---------+------------------------------+
        |              | constraint   | 4.1     | N/A                          |
        |              +--------------+---------+------------------------------+
        |              | pext         | v10     | base_level, annotation_level |
        +--------------+--------------+---------+------------------------------+

    :param dataset: Dataset type. One of "variant", "coverage", "all_sites_an",
        "constraint", "liftover", "pext", "browser". Default is "variant".
    :param data_type: Data type. One of "exomes", "genomes", "joint" for all datasets
        except "pext" where it is one of "base_level", "annotation_level". Default is
        the current session data type.
    :param version: gnomAD dataset version. Default is the current session version
        for "variant"; for other datasets it is the version of that dataset that is
        compatible with the current session version.
    :return: Hail Table for requested dataset, data type, and version.
    :raises ValueError: If the requested dataset, data type, or version is not
        supported.
    """
    # Public entry point: always resolve by name, never from a pre-loaded Table.
    return _get_dataset(dataset=dataset, data_type=data_type, version=version)
def get_compatible_dataset_versions(
    dataset: str,
    variant_version: Optional[str] = None,
    data_type: Optional[str] = None,
) -> Union[str, dict]:
    """
    Get the version of another dataset that is compatible with a gnomAD variant version.

    :param dataset: Dataset to get the compatible version for.
    :param variant_version: Optional gnomAD variant data version. If not provided, the
        current session version is used.
    :param data_type: Optional data type for the dataset if applicable.
    :return: Compatible version of the dataset for the given variant version. If the
        dataset is versioned per data type and no `data_type` is given, the dictionary
        mapping each data type to its version is returned.
    :raises ValueError: If the variant version is not supported, the dataset is not
        available for the variant version, or the data type is not available for the
        dataset.
    """
    # Get the dictionary of compatible versions for the given variant version or
    # the current session version.
    if variant_version is None:
        # Remember which version the session is on so error messages name it.
        variant_version = gnomad_session.version
        versions = gnomad_session.compatible_datasets
    else:
        if variant_version not in VARIANT_DATA:
            raise ValueError(
                f"Version {variant_version} is not a supported gnomAD version in the "
                f"Toolbox. Supported versions: {list(VARIANT_DATA.keys())}"
            )
        versions = VARIANT_DATA[variant_version]["dataset_versions"]

    # Validate dataset.
    if dataset not in versions:
        raise ValueError(
            f"{dataset} is not available for {variant_version}. "
            f"Available datasets: {list(versions.keys())}"
        )

    # If the dataset has multiple data types and a data type is provided, return the
    # version for the data type.
    dataset_version = versions[dataset]
    if data_type and isinstance(dataset_version, dict):
        if data_type not in dataset_version:
            raise ValueError(
                f"{data_type} is not available for {variant_version} {dataset}. "
                f"Available data types: {list(dataset_version.keys())}"
            )
        return dataset_version[data_type]

    return dataset_version