# Source code for gnomad.resources.grch37.reference_data

# noqa: D100

import hail as hl

from gnomad.resources.resource_utils import (
    GnomadPublicMatrixTableResource,
    GnomadPublicTableResource,
    VersionedMatrixTableResource,
    VersionedTableResource,
    import_gencode,
    import_sites_vcf,
)


def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
    """
    Load GTEx RSEM transcript expression and attach each sample's tissue.

    .. note::

        Both inputs are downloaded from
        https://www.gtexportal.org/home/downloads/adult-gtex: the transcript TPM
        file is listed under Bulk tissue expression and the sample attributes
        file under Metadata. The transcript TPM file is expected to contain
        transcript expression data, with transcript IDs in the first column and
        gene IDs in the second column.

    :param gtex_path: Path to the GTEx RSEM file.
    :param meta_path: Path to the GTEx sample attributes file.
    :param kwargs: Any additional parameters to be passed to Hail's
        `import_matrix_table`.
    :return: Matrix Table with GTEx RSEM data with tissue information, with rows
        keyed by `transcript_id`.
    """
    # Sample attributes, keyed by sample ID so they can be looked up per column.
    sample_attrs = hl.import_table(meta_path, force_bgz=True, impute=True)
    sample_attrs = sample_attrs.key_by("SAMPID")

    expr_mt = hl.import_matrix_table(
        gtex_path,
        row_fields={"transcript_id": hl.tstr, "gene_id": hl.tstr},
        entry_type=hl.tfloat64,
        force_bgz=True,
        **kwargs,
    )
    # Give the generic entry/column fields descriptive names.
    expr_mt = expr_mt.rename({"x": "transcript_tpm", "col_id": "s"})

    # Derive a tissue label from the sample's SMTSD attribute by applying the
    # replacements below in order. Parentheses are backslash-escaped because
    # Hail's string `replace` interprets the pattern as a regular expression.
    tissue_label = sample_attrs[expr_mt.s].SMTSD
    for pattern, substitute in (
        (" ", ""),
        ("-", "_"),
        ("\\(", "_"),
        ("\\)", ""),
    ):
        tissue_label = tissue_label.replace(pattern, substitute)
    expr_mt = expr_mt.annotate_cols(tissue=tissue_label)

    # GTEx data has gene IDs and transcript IDs with version numbers; keep only
    # the part before the first "." so the IDs can later be joined with VEP
    # transcript consequences transcript_id.
    unversioned_transcript_id = expr_mt.transcript_id.split("\\.")[0]
    unversioned_gene_id = expr_mt.gene_id.split("\\.")[0]
    expr_mt = expr_mt.annotate_rows(
        transcript_id=unversioned_transcript_id,
        gene_id=unversioned_gene_id,
    )

    # Re-key rows by the unversioned transcript ID and drop the `row_id` field.
    return expr_mt.key_rows_by("transcript_id").drop("row_id")


# NA12878 GIAB call set (source file name: "..._v3.3_highconf"), exposed as a
# MatrixTable resource. `import_args` point `hl.import_vcf` at the source VCF
# and request the GRCh37 reference genome.
na12878_giab = GnomadPublicMatrixTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# HapMap 3.3 (b37) sites as a Table resource, sourced from a VCF through
# `import_sites_vcf`. Joined into `get_truth_ht` as the `hapmap` annotation.
hapmap = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/hapmap/hapmap_3.3.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# 1000 Genomes Omni 2.5 (b37) sites as a Table resource, sourced from a VCF
# through `import_sites_vcf`. Joined into `get_truth_ht` as the `omni` annotation.
kgp_omni = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_omni2.5.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# Mills and 1000G gold-standard indels (b37) as a Table resource, sourced from a
# VCF through `import_sites_vcf`. Joined into `get_truth_ht` as the `mills`
# annotation.
mills = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.ht",
    import_func=import_sites_vcf,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.vcf.bgz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# Syndip (hybrid.m37m) call set as a MatrixTable resource, sourced from a VCF
# through `hl.import_vcf`.
# NOTE(review): unlike the other VCF-backed resources in this module, no
# `force_bgz` is passed here — presumably not needed for a `.vcf.bgz` path;
# confirm.
syndip = GnomadPublicMatrixTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/syndip/hybrid.m37m.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": (
            "gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.vcf.bgz"
        ),
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# Versioned resources: versions should be listed from most recent to oldest
# VEP-annotated GRCh37 context Table. A single version, "85", is registered
# (presumably the VEP release — TODO confirm). No `import_func`/`import_args`
# are supplied; only the stored table path is declared.
vep_context = VersionedTableResource(
    default_version="85",
    versions={
        "85": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht",
        )
    },
)

# dbSNP sites as a versioned Table resource. The single registered version is
# named after the source file date (All_20180423). `skip_invalid_loci` is
# included in the arguments handed to `import_sites_vcf`.
dbsnp = VersionedTableResource(
    default_version="20180423",
    versions={
        "20180423": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/dbsnp/All_20180423.ht",
            import_func=import_sites_vcf,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch37/dbsnp/All_20180423.vcf.bgz",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 100,
                "reference_genome": "GRCh37",
            },
        )
    },
)

# ClinVar sites as a versioned Table resource (single version: 20181028).
# NOTE(review): the stored table path ends in `.vep.ht` while the import source
# is the plain ClinVar VCF — presumably the stored table carries VEP annotations
# added outside of this import; confirm.
clinvar = VersionedTableResource(
    default_version="20181028",
    versions={
        "20181028": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/clinvar/clinvar_20181028.vep.ht",
            import_func=import_sites_vcf,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch37/clinvar/clinvar_20181028.vcf.bgz",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 100,
                "reference_genome": "GRCh37",
            },
        )
    },
)

# 1000 Genomes phase 3 genotypes as versioned MatrixTable resources; the default
# version is "phase_3_split".
# NOTE(review): both versions declare identical `import_args` (same source VCF
# glob), so nothing in this file distinguishes how the "split" table is
# produced — presumably multi-allelic splitting happens elsewhere; confirm.
kgp_phase_3 = VersionedMatrixTableResource(
    default_version="phase_3_split",
    versions={
        "phase_3_split": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt",
            import_func=hl.import_vcf,
            import_args={
                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 300,
                "reference_genome": "GRCh37",
            },
        ),
        "phase_3": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt",
            import_func=hl.import_vcf,
            import_args={
                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 300,
                "reference_genome": "GRCh37",
            },
        ),
    },
)

# 1000 Genomes phase 1 high-confidence SNP sites (b37) as a versioned Table
# resource with a single version, "phase_1_hc". This version is looked up by
# name in `get_truth_ht`.
kgp = VersionedTableResource(
    default_version="phase_1_hc",
    versions={
        "phase_1_hc": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.ht",
            import_func=import_sites_vcf,
            import_args={
                "path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.vcf.bgz",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 100,
                "reference_genome": "GRCh37",
            },
        ),
    },
)

# CpG sites Table resource. Only the stored table path is declared; no import
# function or arguments are supplied.
cpg_sites = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/cpg_sites/cpg.ht"
)

# Methylation sites Table resource. Only the stored table path is declared; no
# import function or arguments are supplied.
methylation_sites = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/methylation_sites/methylation.ht"
)

# LCR intervals Table resource (GRCh37), sourced from an interval list through
# `hl.import_locus_intervals`.
lcr_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list",
        "reference_genome": "GRCh37",
    },
)

# Decoy intervals Table resource (GRCh37), sourced from a BED file through
# `hl.import_bed`.
decoy_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.bed",
        "reference_genome": "GRCh37",
    },
)

# Purcell 5k intervals Table resource (GRCh37), sourced from an interval list
# through `hl.import_locus_intervals`.
purcell_5k_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/purcell_5k_intervals/purcell5k.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/purcell_5k_intervals/purcell5k.interval_list",
        "reference_genome": "GRCh37",
    },
)

# Seg-dup intervals Table resource (GRCh37), sourced from the
# "hg19_self_chain_split_both" BED file through `hl.import_bed`.
seg_dup_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.bed",
        "reference_genome": "GRCh37",
    },
)

# Exome high-coverage intervals Table resource (GRCh37), sourced from the
# "exomes_high_coverage.auto" interval list through `hl.import_locus_intervals`.
exome_hc_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list",
        "reference_genome": "GRCh37",
    },
)

# High-coverage intervals Table resource (GRCh37), sourced from the
# "high_coverage.auto" interval list through `hl.import_locus_intervals`.
high_coverage_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/high_coverage.auto.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/high_coverage.auto.interval_list",
        "reference_genome": "GRCh37",
    },
)

# Exome calling regions (v1) Table resource (GRCh37), sourced from an interval
# list through `hl.import_locus_intervals`.
exome_calling_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list",
        "reference_genome": "GRCh37",
    },
)

# Exome evaluation regions (v1) Table resource (GRCh37), sourced from the
# "noheader" interval list through `hl.import_locus_intervals`.
exome_evaluation_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list",
        "reference_genome": "GRCh37",
    },
)

# WGS evaluation regions (v1) Table resource (GRCh37), sourced from the
# "hg19-v0-wgs_evaluation_regions" interval list through
# `hl.import_locus_intervals`.
genome_evaluation_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list.ht",
    import_func=hl.import_locus_intervals,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list",
        "reference_genome": "GRCh37",
    },
)

# NA12878 GIAB "highconf" intervals Table resource (GRCh37), sourced from the
# BED file that accompanies the NA12878 GIAB v3.3 call set, through
# `hl.import_bed`.
na12878_hc_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_intervals.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.bed",
        "reference_genome": "GRCh37",
    },
)

# Syndip "highconf" genome intervals Table resource (GRCh37), sourced from the
# hybrid.m37m BED file through `hl.import_bed`.
syndip_hc_intervals = GnomadPublicTableResource(
    path="gs://gnomad-public-requester-pays/resources/grch37/syndip/syndip_highconf_genome_intervals.ht",
    import_func=hl.import_bed,
    import_args={
        "path": "gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.bed",
        "reference_genome": "GRCh37",
    },
)


def get_truth_ht() -> hl.Table:
    """
    Return a table with annotations from the latest version of the corresponding truth data.

    The following boolean annotations are included (each is set to True for
    sites present in the corresponding resource; the tables are outer-joined):

        - hapmap
        - omni (from `kgp_omni`: 1000 Genomes intersection Omni 2.5M array)
        - kgp_phase1_hc (from `kgp` "phase_1_hc": high confidence sites in 1000 Genomes)
        - mills (Mills & Devine indels)

    :return: A table with the latest version of popular truth data annotations
    """
    return (
        hapmap.ht()
        .select(hapmap=True)
        .join(kgp_omni.ht().select(omni=True), how="outer")
        .join(
            # `kgp` versions are Table resources (not MatrixTable resources),
            # so they are read with `.ht()` like the other truth tables.
            kgp.versions["phase_1_hc"].ht().select(kgp_phase1_hc=True),
            how="outer",
        )
        .join(mills.ht().select(mills=True), how="outer")
        # Reduce to 200 partitions without a shuffle, then persist the result.
        .repartition(200, shuffle=False)
        .persist()
    )
# GTEx RSEM transcript expression as a versioned MatrixTable resource (single
# version: "v7"). The source files are read through `_import_gtex_rsem`;
# `min_partitions` is not a named parameter of that function, so it reaches it
# via `**kwargs`.
gtex_rsem = VersionedMatrixTableResource(
    default_version="v7",
    versions={
        "v7": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/gtex_rsem/gtex_rsem_v7.mt",
            import_func=_import_gtex_rsem,
            import_args={
                "gtex_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/bulk-gex_v7_rna-seq_GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.gz",
                "meta_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/annotations_v7_GTEx_v7_Annotations_SampleAttributesDS.txt.gz",
                "min_partitions": 1000,
            },
        ),
    },
)

# GENCODE annotation as a versioned Table resource (single version: "v19"),
# sourced from the GTF file through `import_gencode`.
gencode = VersionedTableResource(
    default_version="v19",
    versions={
        "v19": GnomadPublicTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht",
            import_func=import_gencode,
            import_args={
                "gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz",
                "reference_genome": "GRCh37",
                "force_bgz": True,
                "min_partitions": 10,
            },
        ),
    },
)