Source code for gnomad.resources.grch38.gnomad

# noqa: D100

import logging
from typing import Optional

import hail as hl

from gnomad.resources.resource_utils import (
    DataException,
    GnomadPublicMatrixTableResource,
    GnomadPublicTableResource,
    VersionedMatrixTableResource,
    VersionedTableResource,
)
from gnomad.sample_qc.ancestry import GEN_ANC_NAMES
from gnomad.utils.annotations import add_gks_va, add_gks_vrs

logging.basicConfig(
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

CURRENT_EXOME_RELEASE = "4.1"
CURRENT_GENOME_RELEASE = "4.1"
CURRENT_JOINT_RELEASE = "4.1"

CURRENT_BROWSER_RELEASE = "4.1"

CURRENT_EXOME_COVERAGE_RELEASE = "4.0"
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"

CURRENT_EXOME_AN_RELEASE = "4.1"
CURRENT_GENOME_AN_RELEASE = "4.1"

EXOME_RELEASES = ["4.0", "4.1"]
GENOME_RELEASES = ["3.0", "3.1", "3.1.1", "3.1.2", "4.0", "4.1"]
JOINT_RELEASES = ["4.1"]
BROWSER_RELEASES = ["4.1"]

EXOME_COVERAGE_RELEASES = ["4.0"]
GENOME_COVERAGE_RELEASES = ["3.0", "3.0.1"]

EXOME_AN_RELEASES = ["4.1"]
GENOME_AN_RELEASES = ["4.1"]

DATA_TYPES = ["exomes", "genomes", "joint"]
MAJOR_RELEASES = ["v3", "v4"]
CURRENT_MAJOR_RELEASE = MAJOR_RELEASES[-1]

GENOME_GEN_ANCS = ["AFR", "AMI", "AMR", "ASJ", "EAS", "FIN", "NFE", "SAS", "OTH"]
SUBSETS = {
    "v3": [
        "non_v2",
        "non_topmed",
        "non_cancer",
        "controls_and_biobanks",
        "non_neuro",
        "tgp",
        "hgdp",
    ],
    "v4": ["non_ukb"],
}
"""
Order to sort subgroupings during VCF export by version.

Ensures that INFO labels in VCF are in desired order (e.g., tgp_raw_AC_esn_XX).
"""

GROUPS = ["adj", "raw"]
"""
Group names used to generate labels for high quality genotypes and all raw genotypes.

Used in VCF export.
"""

SEXES = ["XX", "XY"]
"""
Sample sexes used in VCF export.

Used to stratify frequency annotations (AC, AN, AF) for each sex.
"""

GEN_ANC_GROUPS = {
    "v3": {
        "genomes": [
            "afr",
            "ami",
            "amr",
            "asj",
            "eas",
            "fin",
            "nfe",
            "oth",
            "sas",
            "mid",
        ]
    },
    "v4": {
        "exomes": [
            "afr",
            "amr",
            "asj",
            "eas",
            "fin",
            "mid",
            "nfe",
            "remaining",
            "sas",
        ],
        "genomes": [
            "afr",
            "ami",
            "amr",
            "asj",
            "eas",
            "fin",
            "mid",
            "nfe",
            "remaining",
            "sas",
        ],
    },
}
"""
Genetic ancestry groups in gnomAD by version.
"""

COHORTS_WITH_GEN_ANC_STORED_AS_SUBGRP = ["tgp", "hgdp"]
"""
Subsets in gnomAD v3.1 that are broken down by their known genetic ancestry subgroups instead of groups in the frequency struct.
"""

TGP_GEN_ANC_GROUPS = [
    "esn",
    "pur",
    "pjl",
    "clm",
    "jpt",
    "chb",
    "stu",
    "itu",
    "tsi",
    "mxl",
    "ceu",
    "msl",
    "yri",
    "beb",
    "fin",
    "khv",
    "cdx",
    "lwk",
    "acb",
    "asw",
    "ibs",
    "gbr",
    "pel",
    "gih",
    "chs",
    "gwd",
]
"""
1000 Genomes Project (1KG/TGP) genetic ancestry subgroups.
"""

HGDP_GEN_ANC_GROUPS = [
    "japanese",
    "papuanhighlands",
    "papuansepik",
    "adygei",
    "orcadian",
    "biaka",
    "yakut",
    "han",
    "northernhan",
    "uygur",
    "miao",
    "mongolian",
    "balochi",
    "bedouin",
    "russian",
    "daur",
    "pima",
    "hezhen",
    "sindhi",
    "yi",
    "oroqen",
    "san",
    "tuscan",
    "tu",
    "palestinian",
    "tujia",
    "druze",
    "pathan",
    "basque",
    "makrani",
    "bergamoitalian",
    "naxi",
    "karitiana",
    "sardinian",
    "mbuti",
    "mozabite",
    "yoruba",
    "lahu",
    "dai",
    "cambodian",
    "bougainville",
    "french",
    "brahui",
    "hazara",
    "bantusouthafrica",
    "surui",
    "mandenka",
    "kalash",
    "xibo",
    "colombian",
    "bantukenya",
    "she",
    "burusho",
    "maya",
]
"""
Human Genome Diversity Project (HGDP) genetic ancestry subgroups.
"""

TGP_GEN_ANC_GROUP_NAMES = {
    "chb": "Han Chinese",
    "jpt": "Japanese",
    "chs": "Southern Han Chinese",
    "cdx": "Chinese Dai",
    "khv": "Kinh",
    "ceu": "Utah Residents (European Ancestry)",
    "tsi": "Toscani",
    "fin": "Finnish",
    "gbr": "British",
    "ibs": "Iberian",
    "yri": "Yoruba",
    "lwk": "Luhya",
    "gwd": "Gambian",
    "msl": "Mende",
    "esn": "Esan",
    "asw": "African-American",
    "acb": "African Caribbean",
    "mxl": "Mexican-American",
    "pur": "Puerto Rican",
    "clm": "Colombian",
    "pel": "Peruvian",
    "gih": "Gujarati",
    "pjl": "Punjabi",
    "beb": "Bengali",
    "stu": "Sri Lankan Tamil",
    "itu": "Indian Telugu",
}
"""
1000 Genomes Project (1KG/TGP) genetic ancestry group label map.
"""

GEN_ANC_GROUPS_STORED_AS_SUBGRPS = TGP_GEN_ANC_GROUPS + HGDP_GEN_ANC_GROUPS
GEN_ANC_GROUPS_TO_REMOVE_FOR_GRPMAX = {
    "v3": {"asj", "fin", "mid", "oth", "ami", "remaining"},
    "v4": {"asj", "fin", "oth", "ami", "remaining"},
}
"""
Genetic ancestry groups that are removed before genetic ancestry group max calculations.
"""

DOWNSAMPLINGS = {
    "v3": [
        10,
        20,
        50,
        100,
        200,
        500,
        1000,
        2000,
        5000,
        10000,
        15000,
        20000,
        25000,
        30000,
        40000,
        50000,
        60000,
        70000,
        75000,
        80000,
        85000,
        90000,
        95000,
        100000,
        110000,
        120000,
    ],
    "v4": [
        10,
        100,
        500,
        1000,
        2000,
        5000,
        10000,
        20000,
        50000,
        100000,
        200000,
        500000,
    ],
    "v5": [
        10000,
        59355,
        89074,
        100000,
        223733,
    ],
}
"""
List of the downsampling numbers to use for frequency calculations by version.
"""

gnomad_syndip = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt"
        )
    },
)

na12878 = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/truth-sets/hail-0.2/gnomad_v3_na12878.mt"
        )
    },
)


def _public_release_ht_path(data_type: str, version: str) -> str:
    """
    Get public release table path.

    :param data_type: One of "exomes", "genomes" or "joint".
    :param version: One of the release versions of gnomAD on GRCh38
    :return: Path to release Table
    """
    version_prefix = "r" if version.startswith("3.0") else "v"
    return f"gs://gnomad-public-requester-pays/release/{version}/ht/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites.ht"


def _public_coverage_ht_path(data_type: str, version: str) -> str:
    """
    Get public coverage hail table.

    :param data_type: One of "exomes" or "genomes"
    :param version: One of the release versions of gnomAD on GRCh38
    :return: Path to coverage Table
    """
    version_prefix = "r" if version.startswith("3.0") else "v"
    return f"gs://gnomad-public-requester-pays/release/{version}/coverage/{data_type}/gnomad.{data_type}.{version_prefix}{version}.coverage.ht"


def _public_an_ht_path(data_type: str, version: str) -> str:
    """
    Get public allele number hail table.

    :param data_type: One of "exomes" or "genomes"
    :param version: One of the release versions of gnomAD on GRCh38
    :return: Path to allle number Table
    """
    return f"gs://gnomad-public-requester-pays/release/{version}/ht/{data_type}/gnomad.{data_type}.v{version}.allele_number_all_sites.ht"


def _public_pext_ht_path(pext_type: str = "base_level") -> str:
    """
    Get public proportion expressed across transcripts (pext) data.

    :param pext_type: One of "base_level" or "annotation_level". Default is "base_level".
    :return: Path to pext Table.
    """
    valid_types = ["base_level", "annotation_level"]

    if pext_type not in valid_types:
        raise DataException(
            f"Invalid pext_type: '{pext_type}'. Valid options are {valid_types}."
        )

    return f"gs://gnomad-public-requester-pays/release/4.1/pext/gnomad.pext.gtex_v10.{pext_type}.ht"


def _public_constraint_ht_path(version: str) -> str:
    """
    Get public constraint table path.

    :param version: One of the release versions of gnomAD on GRCh38.
    :return: Path to gene constraint Table.
    """
    return f"gs://gnomad-public-requester-pays/release/{version}/constraint/gnomad.v{version}.constraint_metrics.ht"


def _public_browser_variant_ht_path(version: str) -> str:
    """
    Get public browser variant table path.

    :param version: One of the release versions of gnomAD on GRCh38.
    :return: Path to browser variant Table.
    """
    return f"gs://gnomad-public-requester-pays/release/{version}/ht/browser/gnomad.browser.v{version}.sites.ht"


def _public_browser_gene_ht_path() -> str:
    """
    Get public browser gene table path.

    .. note::

       This table has smaller number of partitions (n=100) for faster computation and
       contains pext data compared to gnomad.genes.GRCh38.GENCODEv39.ht (which was
       used by the browser for ES export) under the same path.

    :return: Path to browser gene Table.
    """
    return "gs://gnomad-public-requester-pays/resources/grch38/browser/gnomad.genes.GRCh38.GENCODEv39.pext.ht"


[docs]def public_release(data_type: str) -> VersionedTableResource:
    """
    Retrieve publicly released versioned table resource.

    :param data_type: One of "exomes", "genomes" or "joint".
    :return: Release Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from"
            f" {DATA_TYPES}"
        )

    if data_type == "exomes":
        current_release = CURRENT_EXOME_RELEASE
        releases = EXOME_RELEASES
    elif data_type == "genomes":
        current_release = CURRENT_GENOME_RELEASE
        releases = GENOME_RELEASES
    else:
        current_release = CURRENT_JOINT_RELEASE
        releases = JOINT_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: GnomadPublicTableResource(
                path=_public_release_ht_path(data_type, release)
            )
            for release in releases
        },
    )


[docs]def coverage(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: Coverage Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from"
            f" {DATA_TYPES}"
        )

    if data_type == "exomes":
        current_release = CURRENT_EXOME_COVERAGE_RELEASE
        releases = EXOME_COVERAGE_RELEASES
    else:
        current_release = CURRENT_GENOME_COVERAGE_RELEASE
        releases = GENOME_COVERAGE_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: GnomadPublicTableResource(
                path=_public_coverage_ht_path(data_type, release)
            )
            for release in releases
        },
    )


[docs]def all_sites_an(data_type: str) -> VersionedTableResource:
    """
    Retrieve gnomAD's all sites allele number table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: All sites allele number VersionedTableResource
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from"
            f" {DATA_TYPES}"
        )

    if data_type == "exomes":
        current_release = CURRENT_EXOME_AN_RELEASE
        releases = EXOME_AN_RELEASES
    else:
        current_release = CURRENT_GENOME_AN_RELEASE
        releases = GENOME_AN_RELEASES

    return VersionedTableResource(
        current_release,
        {
            release: GnomadPublicTableResource(
                path=_public_an_ht_path(data_type, release)
            )
            for release in releases
        },
    )


[docs]def coverage_tsv_path(data_type: str, version: Optional[str] = None) -> str:
    """
    Retrieve gnomAD's coverage table by data_type.

    :param data_type: One of "exomes" or "genomes"
    :return: Coverage Table
    """
    if data_type not in DATA_TYPES:
        raise DataException(
            f"{data_type} not in {DATA_TYPES}, please select a data type from"
            f" {DATA_TYPES}"
        )

    if data_type == "exomes":
        if version is None:
            version = CURRENT_EXOME_COVERAGE_RELEASE
        elif version not in EXOME_COVERAGE_RELEASES:
            raise DataException(
                f"Version {version} of gnomAD exomes for GRCh38 does not exist"
            )
    else:
        if version is None:
            version = CURRENT_GENOME_COVERAGE_RELEASE
        elif version not in GENOME_COVERAGE_RELEASES:
            raise DataException(
                f"Version {version} of gnomAD genomes for GRCh38 does not exist"
            )

    version_prefix = "r" if version.startswith("3.0") else "v"
    return f"gs://gcp-public-data--gnomad/release/{version}/coverage/{data_type}/gnomad.{data_type}.{version_prefix}{version}.coverage.summary.tsv.bgz"


[docs]def release_vcf_path(data_type: str, version: str, contig: str) -> str:
    """
    Publically released VCF. Provide specific contig, i.e. "chr20", to retrieve contig specific VCF.

    :param data_type: One of "exomes" or "genomes"
    :param version: One of the release versions of gnomAD on GRCh37
    :param contig: Single contig "chr1" to "chrY"
    :return: Path to VCF
    """
    if version.startswith("2"):
        raise DataException(
            f"gnomAD version {version} is not available on reference genome GRCh38"
        )

    contig = f".{contig}" if contig else ""
    version_prefix = "r" if version.startswith("3.0") else "v"
    return f"gs://gcp-public-data--gnomad/release/{version}/vcf/{data_type}/gnomad.{data_type}.{version_prefix}{version}.sites{contig}.vcf.bgz"


[docs]def add_grpMaxFAF95_v4(ht: hl.Table) -> hl.Table:
    """
    Add a grpMaxFAF95 struct with 'grpmax' and 'grpmax_gen_anc'.

    Also includes a jointGrpMaxFAF95 annotation using the v4 fafmax and joint_fafmax structures.

    :param ht: Input hail table.
    :return: Annotated hail table.
    """
    if "gnomad" in ht.fafmax:
        fafmax_field = ht.fafmax.gnomad
    else:
        fafmax_field = ht.fafmax
    ht = ht.annotate(
        grpMaxFAF95=hl.struct(
            grpmax=fafmax_field.faf95_max,
            grpmax_gen_anc=fafmax_field.faf95_max_gen_anc,
        ),
        jointGrpMaxFAF95=hl.struct(
            grpmax=ht.joint_fafmax.faf95_max,
            grpmax_gen_anc=ht.joint_fafmax.faf95_max_gen_anc,
        ),
    )
    return ht


[docs]def gnomad_gks(
    locus_interval: hl.IntervalExpression,
    version: str,
    data_type: str = "genomes",
    by_gen_anc_group: bool = False,
    by_sex: bool = False,
    vrs_only: bool = False,
    custom_ht: hl.Table = None,
    skip_checkpoint: bool = False,
    skip_coverage: bool = False,
    custom_coverage_ht: hl.Table = None,
) -> list:
    """
    Perform gnomad GKS annotations on a range of variants at once.

    :param locus_interval: Hail IntervalExpression of locus<reference_genome>.
        e.g. hl.locus_interval('chr1', 6424776, 6461367, reference_genome="GRCh38")
    :param version: String of version of gnomAD release to use.
    :param data_type: String of either "exomes" or "genomes" for the type of reads that are desired.
    :param by_gen_anc_group: Boolean to pass to obtain frequency information for each cohort.
    :param by_sex: Boolean to pass to return frequency information for each cohort split by chromosomal sex.
    :param vrs_only: Boolean to pass for only VRS info to be returned
        (will not include allele frequency information).
    :param custom_ht: Table to use instead of what public_release() method would return for the version.
    :param skip_checkpoint: Bool to pass to skip checkpointing selected fields
        (checkpointing may be desirable for large datasets by reducing data copies across the cluster).
    :param skip_coverage: Bool to pass to skip adding coverage statistics.
    :param custom_coverage_ht: Custom table to use for coverage statistics instead of the release coverage table.
    :return: List of dictionaries containing VRS information
        (and freq info split by ancestry groups and sex if desired) for specified variant.
    """
    # Obtain the high level version number and verify that it is 4.
    high_level_version = f"v{version.split('.')[0]}"
    if high_level_version != "v4":
        raise NotImplementedError(
            "gnomad_gks() is currently only implemented for gnomAD v4."
        )

    # Read public_release table if no custom table provided.
    if custom_ht:
        ht = custom_ht
    else:
        ht = hl.read_table(public_release(data_type).versions[version].path)

    # Read coverage statistics if requested.
    coverage_version_3_0_1 = "3.0.1"  # v4 genomes coverage
    coverage_version_4_0 = "4.0"  # v4 exomes coverage

    # In v4, exomes have coverage in v4 coverage table,
    #   genomes have coverage in v3 coverage table.
    if data_type == "genomes":
        coverage_version = coverage_version_3_0_1
    else:
        coverage_version = coverage_version_4_0

    coverage_ht = None

    if not skip_coverage:
        if custom_coverage_ht:
            coverage_ht = custom_coverage_ht
        else:
            coverage_ht = hl.read_table(
                coverage(data_type).versions[coverage_version].path
            )
        ht = ht.annotate(mean_depth=coverage_ht[ht.locus].mean)
        ht = ht.annotate(fraction_cov_over_20=coverage_ht[ht.locus].over_20)

    # Retrieve genetic ancestry groups from the imported GEN_ANC_NAMES dictionary.
    grps_list = (
        list(GEN_ANC_NAMES[high_level_version][data_type]) if by_gen_anc_group else None
    )

    # Throw warnings if contradictory arguments are passed.
    if by_gen_anc_group and vrs_only:
        logger.warning(
            "Both 'vrs_only' and 'by_gen_anc_groups' have been specified. Ignoring"
            " 'by_gen_anc_groups' list and returning only VRS information."
        )
    elif by_sex and not by_gen_anc_group:
        logger.warning(
            "Splitting whole database by sex is not yet supported. If using 'by_sex',"
            " please also specify 'by_ancestry_group' to stratify by."
        )

    # Select relevant fields, checkpoint, and filter to interval before adding
    # annotations.

    # Pull up LCR flag and make referrable in the same field.
    ht = ht.annotate(lcr=ht.region_flags.lcr)

    # Pull up allele balance histogram arrays.
    ht = ht.annotate(ab_hist_alt=ht.histograms.qual_hists.ab_hist_alt)

    ht = add_grpMaxFAF95_v4(ht)

    ht = ht.annotate(in_autosome_or_par=ht.locus.in_autosome_or_par())

    keep_fields = [
        ht.freq,
        ht.info.vrs,
        ht.info.monoallelic,
        ht.grpMaxFAF95,
        ht.filters,
        ht.lcr,
        ht.ab_hist_alt,
        ht.in_autosome_or_par,
    ]

    if not skip_coverage:
        keep_fields.append(ht.mean_depth)
        keep_fields.append(ht.fraction_cov_over_20)

    if "jointGrpMaxFAF95" in ht.row:
        keep_fields.append(ht.jointGrpMaxFAF95)

    ht = ht.select(*keep_fields)

    # Checkpoint narrower set of columns if not skipped.
    ht = hl.filter_intervals(ht, [locus_interval])
    if not skip_checkpoint:
        ht = ht.checkpoint(hl.utils.new_temp_file("vrs_checkpoint", extension="ht"))

    # Collect all variants as structs, so all dictionary construction can be
    # done in native Python.
    variant_list = ht.collect()
    ht_freq_index_dict = ht.freq_index_dict.collect()[0]
    # gnomad v4 renamed freq_index_dict keys to use underscores instead of dashes.
    # Use underscores for v3 as well.
    ht_freq_index_dict = {k.replace("-", "_"): v for k, v in ht_freq_index_dict.items()}

    # Assemble output dicts with VRS and optionally frequency, append to list,
    # then return list
    outputs = []
    for variant in variant_list:
        vrs_variant = add_gks_vrs(variant.locus, variant.vrs)

        out = {
            "locus": {
                "contig": variant.locus.contig,
                "position": variant.locus.position,
                "reference_genome": variant.locus.reference_genome.name,
            },
            "alleles": variant.alleles,
            "gks_vrs_variant": vrs_variant,
        }

        if not vrs_only:
            va_freq_dict = add_gks_va(
                input_struct=variant,
                label_name="gnomAD",
                label_version=version,
                gen_anc_groups=grps_list,
                gen_anc_groups_dict=GEN_ANC_NAMES,
                by_sex=by_sex,
                freq_index_dict=ht_freq_index_dict,
            )

            # Assign existing VRS information to "focusAllele" key
            va_freq_dict["focusAllele"] = vrs_variant
            out["gks_va_freq"] = va_freq_dict

        # Append variant dictionary to list of outputs
        outputs.append(out)

    return outputs


[docs]def pext(pext_type: str = "base_level") -> GnomadPublicTableResource:
    """
    Retrieve pext table by type.

    :param pext_type: One of "base_level" or "annotation_level". Default is "base_level".
    :return: Pext Table.
    """
    return GnomadPublicTableResource(path=_public_pext_ht_path(pext_type))


[docs]def constraint() -> VersionedTableResource:
    """
    Retrieve gene constraint Table.

    :return: Gene constraint Table.
    """
    return VersionedTableResource(
        CURRENT_EXOME_RELEASE,
        {
            release: GnomadPublicTableResource(path=_public_constraint_ht_path(release))
            for release in EXOME_RELEASES
        },
    )


[docs]def browser_variant() -> VersionedTableResource:
    """
    Retrieve browser variant table.

    :return: Browser variant Table.
    """
    return VersionedTableResource(
        CURRENT_BROWSER_RELEASE,
        {
            release: GnomadPublicTableResource(
                path=_public_browser_variant_ht_path(release)
            )
            for release in BROWSER_RELEASES
        },
    )


[docs]def browser_gene() -> GnomadPublicTableResource:
    """
    Retrieve browser gene table.

    :return: Browser gene Table.
    """
    return GnomadPublicTableResource(path=_public_browser_gene_ht_path())