Source code for gnomad.resources.grch38.reference_data

# noqa: D100

import hail as hl
from hail import Table

from gnomad.resources.resource_utils import (
    DBSNP_B154_CHR_CONTIG_RECODING,
    NO_CHR_TO_CHR_CONTIG_RECODING,
    GnomadPublicMatrixTableResource,
    GnomadPublicTableResource,
    VersionedMatrixTableResource,
    VersionedTableResource,
    import_gencode,
    import_sites_vcf,
)
from gnomad.utils.vep import vep_or_lookup_vep


def _import_purcell_5k(path) -> hl.Table:
    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            "gs://hail-common/references/grch37_to_grch38.over.chain.gz", rg38
        )
    p5k = p5k.annotate(
        start=hl.liftover(p5k.interval.start, "GRCh38"),
        end=hl.liftover(p5k.interval.start, "GRCh38"),
    )
    p5k = p5k.filter(
        (p5k.start.contig == "chr" + p5k.interval.start.contig)
        & (p5k.end.contig == "chr" + p5k.interval.end.contig)
    )
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by("locus")


def _import_clinvar(**kwargs) -> hl.Table:
    clinvar = import_sites_vcf(**kwargs)
    clinvar = clinvar.filter(
        hl.len(clinvar.alleles) > 1
    )  # Get around problematic single entry in alleles array in the clinvar vcf
    clinvar = vep_or_lookup_vep(clinvar, reference="GRCh38")
    return clinvar


def _import_dbsnp(**kwargs) -> hl.Table:
    dbsnp = import_sites_vcf(**kwargs)
    # Note: permit_shuffle is set because the dbsnp vcf has duplicate loci
    # (turned into a set) so might be out of order
    dbsnp = hl.split_multi(dbsnp, permit_shuffle=True)
    dbsnp = dbsnp.group_by(dbsnp.locus, dbsnp.alleles).aggregate(
        rsid=hl.agg.collect_as_set(dbsnp.rsid)
    )

    return dbsnp


def _import_methylation_sites(path) -> hl.Table:
    """
    Import methylation data from bed file.

    :param path: Path to bed file containing methylation scores.
    :return: Table with methylation data.
    """
    ht = hl.import_bed(path, min_partitions=100, reference_genome="GRCh38")
    ht = ht.select(
        locus=ht.interval.start,
        methylation_level=hl.int32(ht.target),
    )

    return ht.key_by("locus").drop("interval")


def _import_ensembl_interval(path) -> hl.Table:
    """
    Import and parse Ensembl intervals of protein-coding genes to a Hail Table.

    File is expected to include only the following fields: gene_stable_ID, chr, start, end, source_gene, gene_name, and type.

    :param path: Path to the interval Table file.
    """
    ensembl = hl.import_table(
        path,
        delimiter="\t",
        min_partitions=100,
        impute=True,
    )

    ensembl = ensembl.key_by(
        interval=hl.locus_interval(
            "chr" + ensembl.chr,
            ensembl.start,
            ensembl.end,
            reference_genome="GRCh38",
        )
    )
    return ensembl


# Resources with no versioning needed
purcell_5k_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/purcell_5k_intervals/purcell5k.ht",
    import_func=_import_purcell_5k,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/purcell_5k_intervals/purcell5k.interval_list",
    },
)

na12878_giab = GnomadPublicMatrixTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh38",
    },
)

na12878_giab_hc_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed",
        "reference_genome": "GRCh38",
        "skip_invalid_intervals": True,
    },
)

# Versioned resources: versions should be listed from most recent to oldest
vep_context = VersionedTableResource(
    default_version="105",
    versions={
        "95": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht",
        ),
        "101": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v101.ht",
        ),
        "105": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v105.ht",
        ),
    },
)

syndip = VersionedMatrixTableResource(
    default_version="20180222",
    versions={
        "20180222": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip.b38_20180222.mt",
            import_func=hl.import_vcf,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/syndip/full.38.20180222.vcf.gz",
                "force_bgz": True,
                "min_partitions": 100,
                "reference_genome": "GRCh38",
            },
        )
    },
)

syndip_hc_intervals = VersionedTableResource(
    default_version="20180222",
    versions={
        "20180222": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip_b38_20180222_hc_regions.ht",
            import_func=hl.import_bed,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/syndip/syndip.b38_20180222.bed",
                "reference_genome": "GRCh38",
                "skip_invalid_intervals": True,
                "min_partitions": 10,
            },
        )
    },
)

# These Ensembl Interval Tables are focused on protein-coding genes on chr1-22,X,Y.
# Downloaded from the biomart of Ensembl Archive (https://useast.ensembl.org/info/website/archives/index.html)
# Ensembl 101 & 105 are included, since 101 was used to annotate gnomAD v3 and 105 to gnomAD v4.
# Basic stats: 19924 protein-coding genes in Ensembl 101, and1 19951
# protein-coding genes in Ensembl 105.
ensembl_interval = VersionedTableResource(
    default_version="105",
    versions={
        "105": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.ht",
            import_func=_import_ensembl_interval,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_105_pc_genes_grch38.tsv",
            },
        ),
        "101": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.ht",
            import_func=_import_ensembl_interval,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/ensembl/ensembl_101_pc_genes_grch38.tsv",
            },
        ),
    },
)

clinvar = VersionedTableResource(
    default_version="20190923",
    versions={
        "20190923": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/clinvar/clinvar_20190923.ht",
            import_func=_import_clinvar,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/clinvar/clinvar_20190923.vcf.gz",
                "force_bgz": True,
                "contig_recoding": NO_CHR_TO_CHR_CONTIG_RECODING,
                "skip_invalid_loci": True,
                "min_partitions": 100,
                "reference_genome": "GRCh38",
            },
        )
    },
)

dbsnp = VersionedTableResource(
    default_version="b156",
    versions={
        "b156": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b156_grch38_all_20221116.ht",
            import_func=_import_dbsnp,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b156_grch38_all_GCF_000001405.40_20221116.vcf.bgz",
                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b156_grch38_all_GCF_000001405.40_20221116.vcf.header",
                "force_bgz": True,
                "contig_recoding": DBSNP_B154_CHR_CONTIG_RECODING,
                "skip_invalid_loci": True,
                "min_partitions": 400,
                "reference_genome": "GRCh38",
            },
        ),
        "b154": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b154_grch38_all_20200514.ht",
            import_func=_import_dbsnp,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b154_grch38_all_GCF_000001405.38_20200514.vcf.bgz",
                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b154_grch38_all_GCF_000001405.38_20200514.vcf.header",
                "force_bgz": True,
                "contig_recoding": DBSNP_B154_CHR_CONTIG_RECODING,
                "skip_invalid_loci": True,
                "min_partitions": 400,
                "reference_genome": "GRCh38",
            },
        ),
        "b151": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.ht",
            import_func=import_sites_vcf,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.vcf.bgz",
                "header_file": "gs://gcp-public-data--gnomad/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.vcf.header",
                "force_bgz": True,
                "contig_recoding": NO_CHR_TO_CHR_CONTIG_RECODING,
                "skip_invalid_loci": True,
                "min_partitions": 400,
                "reference_genome": "GRCh38",
            },
        ),
    },
)

hapmap = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/hapmap/hapmap_3.3.hg38.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": (
            "gs://genomics-public-data/resources/broad/hg38/v0/hapmap_3.3.hg38.vcf.gz"
        ),
        "force_bgz": True,
        "reference_genome": "GRCh38",
    },
)

kgp_omni = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/kgp/1000G_omni2.5.hg38.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_omni2.5.hg38.vcf.gz",
        "force_bgz": True,
        "reference_genome": "GRCh38",
    },
)

kgp = VersionedTableResource(
    default_version="phase_1_hc",
    versions={
        "phase_1_hc": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/kgp/1000G_phase1.snps.high_confidence.hg38.ht",
            import_func=import_sites_vcf,
            import_args={
                "path": "gs://genomics-public-data/resources/broad/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
                "force_bgz": True,
                "reference_genome": "GRCh38",
            },
        )
    },
)

mills = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/mills/Mills_and_1000G_gold_standard.indels.hg38.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
        "force_bgz": True,
        "reference_genome": "GRCh38",
    },
)

# Methylation scores range from 0-15 and are described in Chen et al
# (https://www.biorxiv.org/content/10.1101/2022.03.20.485034v2.full).
methylation_sites = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation.ht",
    import_func=_import_methylation_sites,
    import_args={
        "path": "gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation.bed",
    },
)

# Methylation scores for chromosome X range from 0-12 and are described in Chen et al
# (https://www.biorxiv.org/content/10.1101/2022.03.20.485034v2.full).
methylation_sites_chrx = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation_chrX.ht",
    import_func=_import_methylation_sites,
    import_args={
        "path": "gs://gnomad-public-requester-pays/resources/grch38/methylation_sites/methylation_chrX.bed",
    },
)

lcr_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/lcr_intervals/LCRFromHengHg38.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/lcr_intervals/LCRFromHengHg38.txt",
        "reference_genome": "GRCh38",
        "skip_invalid_intervals": True,
    },
)

seg_dup_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/seg_dup_intervals/GRCh38_segdups.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/seg_dup_intervals/GRCh38_segdups.bed",
        "reference_genome": "GRCh38",
    },
)

telomeres_and_centromeres = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.bed",
        "reference_genome": "GRCh38",
        "skip_invalid_intervals": True,
    },
)


[docs]def get_truth_ht() -> Table: """ Return a table with annotations from the latest version of the corresponding truth data. The following annotations are included: - hapmap - kgp_omni (1000 Genomes intersection Onni 2.5M array) - kgp_phase_1_hc (high confidence sites in 1000 genonmes) - mills (Mills & Devine indels) :return: A table with the latest version of popular truth data annotations """ return ( hapmap.ht() .select(hapmap=True) .join(kgp_omni.ht().select(omni=True), how="outer") .join(kgp.versions["phase_1_hc"].ht().select(kgp_phase1_hc=True), how="outer") .join(mills.ht().select(mills=True), how="outer") .repartition(200, shuffle=False) .persist() )
gencode = VersionedTableResource( default_version="v39", versions={ "v39": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/grch38/gencode/gencode.v39.annotation.ht", import_func=import_gencode, import_args={ "gtf_path": "gs://gcp-public-data--gnomad/resources/grch38/gencode/gencode.v39.annotation.gtf.gz", "reference_genome": "GRCh38", "force_bgz": True, "min_partitions": 100, }, ), }, )