# noqa: D100
from typing import Optional, Union
import hail as hl
from gnomad.resources.resource_utils import (
GnomadPublicMatrixTableResource,
GnomadPublicTableResource,
VersionedMatrixTableResource,
VersionedTableResource,
import_gencode,
import_sites_vcf,
)
from gnomad.utils.constraint import transform_methylation_level
def _import_gtex_rsem(gtex_path: str, meta_path: str, **kwargs) -> hl.MatrixTable:
"""
Import GTEx RSEM data from expression data and sample attributes file.
.. note::
Files are downloaded from https://www.gtexportal.org/home/downloads/adult-gtex.
We get the transcript TPM under Bulk tissue expression and sample attributes
under Metadata. The transcript TPM file is expected to have transcript
expression data, with transcript IDs as the first column and gene IDs as the
second column.
:param gtex_path: Path to the GTEx RSEM file.
:param meta_path: Path to the GTEx sample attributes file.
:param kwargs: Any additional parameters to be passed to Hail's `import_matrix_table`.
:return: Matrix Table with GTEx RSEM data with tissue information.
"""
meta_ht = hl.import_table(meta_path, force_bgz=True, impute=True)
meta_ht = meta_ht.key_by("SAMPID")
mt = hl.import_matrix_table(
gtex_path,
row_fields={"transcript_id": hl.tstr, "gene_id": hl.tstr},
entry_type=hl.tfloat64,
force_bgz=True,
**kwargs,
)
mt = mt.rename({"x": "transcript_tpm", "col_id": "s"})
# GTEx data has gene IDs and transcript IDs with version numbers, we need
# to remove the version numbers so that it can later be joined with VEP
# transcript consequences transcript_id.
mt = mt.annotate_cols(
tissue=meta_ht[mt.s]
.SMTSD.replace(" ", "")
.replace("-", "_")
.replace("\\(", "_")
.replace("\\)", "")
)
mt = mt.annotate_rows(
transcript_id=mt.transcript_id.split("\\.")[0],
gene_id=mt.gene_id.split("\\.")[0],
)
mt = mt.key_rows_by("transcript_id").drop("row_id")
return mt
na12878_giab = GnomadPublicMatrixTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
import_func=hl.import_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
"force_bgz": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
hapmap = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/hapmap/hapmap_3.3.b37.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
"force_bgz": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
kgp_omni = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_omni2.5.b37.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
"force_bgz": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
mills = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.vcf.bgz",
"force_bgz": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
syndip = GnomadPublicMatrixTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/syndip/hybrid.m37m.mt",
import_func=hl.import_vcf,
import_args={
"path": (
"gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.vcf.bgz"
),
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
# Versioned resources: versions should be listed from most recent to oldest
vep_context = VersionedTableResource(
default_version="85",
versions={
"85": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht",
)
},
)
dbsnp = VersionedTableResource(
default_version="20180423",
versions={
"20180423": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/dbsnp/All_20180423.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/dbsnp/All_20180423.vcf.bgz",
"force_bgz": True,
"skip_invalid_loci": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
},
)
clinvar = VersionedTableResource(
default_version="20181028",
versions={
"20181028": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/clinvar/clinvar_20181028.vep.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/clinvar/clinvar_20181028.vcf.bgz",
"force_bgz": True,
"skip_invalid_loci": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
)
},
)
kgp_phase_3 = VersionedMatrixTableResource(
default_version="phase_3_split",
versions={
"phase_3_split": GnomadPublicMatrixTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt",
import_func=hl.import_vcf,
import_args={
"path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
"force_bgz": True,
"skip_invalid_loci": True,
"min_partitions": 300,
"reference_genome": "GRCh37",
},
),
"phase_3": GnomadPublicMatrixTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt",
import_func=hl.import_vcf,
import_args={
"path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
"force_bgz": True,
"skip_invalid_loci": True,
"min_partitions": 300,
"reference_genome": "GRCh37",
},
),
},
)
kgp = VersionedTableResource(
default_version="phase_1_hc",
versions={
"phase_1_hc": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.ht",
import_func=import_sites_vcf,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.vcf.bgz",
"force_bgz": True,
"skip_invalid_loci": True,
"min_partitions": 100,
"reference_genome": "GRCh37",
},
),
},
)
cpg_sites = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/cpg_sites/cpg.ht"
)
methylation_sites = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/methylation_sites/methylation.ht"
)
lcr_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list",
"reference_genome": "GRCh37",
},
)
decoy_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.ht",
import_func=hl.import_bed,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.bed",
"reference_genome": "GRCh37",
},
)
purcell_5k_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/purcell_5k_intervals/purcell5k.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/purcell_5k_intervals/purcell5k.interval_list",
"reference_genome": "GRCh37",
},
)
seg_dup_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.ht",
import_func=hl.import_bed,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.bed",
"reference_genome": "GRCh37",
},
)
exome_hc_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list",
"reference_genome": "GRCh37",
},
)
high_coverage_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/high_coverage.auto.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/high_coverage.auto.interval_list",
"reference_genome": "GRCh37",
},
)
exome_calling_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list",
"reference_genome": "GRCh37",
},
)
exome_evaluation_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list",
"reference_genome": "GRCh37",
},
)
genome_evaluation_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list.ht",
import_func=hl.import_locus_intervals,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list",
"reference_genome": "GRCh37",
},
)
na12878_hc_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/na12878/NA12878_GIAB_highconf_intervals.ht",
import_func=hl.import_bed,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.bed",
"reference_genome": "GRCh37",
},
)
syndip_hc_intervals = GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/syndip/syndip_highconf_genome_intervals.ht",
import_func=hl.import_bed,
import_args={
"path": "gs://gcp-public-data--gnomad/resources/grch37/syndip/hybrid.m37m.bed",
"reference_genome": "GRCh37",
},
)
[docs]def get_truth_ht() -> hl.Table:
"""
Return a table with annotations from the latest version of the corresponding truth data.
The following annotations are included:
- hapmap
- kgp_omni (1000 Genomes intersection Onni 2.5M array)
- kgp_phase_1_hc (high confidence sites in 1000 genonmes)
- mills (Mills & Devine indels)
:return: A table with the latest version of popular truth data annotations
"""
return (
hapmap.ht()
.select(hapmap=True)
.join(kgp_omni.ht().select(omni=True), how="outer")
.join(
kgp.versions["phase_1_hc"].mt().rows().select(kgp_phase1_hc=True),
how="outer",
)
.join(mills.ht().select(mills=True), how="outer")
.repartition(200, shuffle=False)
.persist()
)
gtex_rsem = VersionedMatrixTableResource(
default_version="v7",
versions={
"v7": GnomadPublicMatrixTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/gtex_rsem/gtex_rsem_v7.mt",
import_func=_import_gtex_rsem,
import_args={
"gtex_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/bulk-gex_v7_rna-seq_GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.gz",
"meta_path": "gs://gcp-public-data--gnomad/resources/grch37/gtex/annotations_v7_GTEx_v7_Annotations_SampleAttributesDS.txt.gz",
"min_partitions": 1000,
},
),
},
)
gencode = VersionedTableResource(
default_version="v19",
versions={
"v19": GnomadPublicTableResource(
path="gs://gnomad-public-requester-pays/resources/grch37/gencode/gencode.v19.annotation.ht",
import_func=import_gencode,
import_args={
"gtf_path": "gs://gcp-public-data--gnomad/resources/grch37/gencode/gencode.v19.annotation.gtf.gz",
"reference_genome": "GRCh37",
"force_bgz": True,
"min_partitions": 10,
},
),
},
)