"""Script to generate Hail Tables with in silico predictors."""
import argparse
import logging
import hail as hl
from gnomad.resources.resource_utils import NO_CHR_TO_CHR_CONTIG_RECODING
from gnomad.utils.slack import slack_notifications
from gnomad.utils.vep import filter_vep_transcript_csqs, process_consequences
from hail.utils import new_temp_file
from gnomad_qc.slack_creds import slack_token
from gnomad_qc.v4.resources.annotations import get_insilico_predictors
from gnomad_qc.v4.resources.release import release_sites
# Configure the root logging format/timestamps once at import time; this
# module's logger emits INFO-level progress messages throughout the pipeline.
logging.basicConfig(
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def get_sift_polyphen_from_vep(ht: hl.Table) -> hl.Table:
    """
    Get the max SIFT and PolyPhen scores from VEP 105 annotations.

    This retrieves the max of SIFT and PolyPhen scores for a variant's MANE Select
    transcript or, if MANE Select does not exist, canonical transcript.

    :param ht: VEP 105 annotated Hail Table.
    :return: Table with `sift_max` and `polyphen_max` annotations.
    """
    # Restrict VEP transcript consequences to MANE Select transcripts only.
    mane = filter_vep_transcript_csqs(
        ht, synonymous=False, canonical=False, mane_select=True
    )
    # Restrict to canonical transcripts only (fallback when no MANE Select exists).
    canonical = filter_vep_transcript_csqs(ht, synonymous=False, canonical=True)
    # Each annotation below is an array of per-transcript scores for the variant.
    ht = ht.annotate(
        sift_mane=mane[ht.key].vep.transcript_consequences.sift_score,
        polyphen_mane=mane[ht.key].vep.transcript_consequences.polyphen_score,
        sift_canonical=canonical[ht.key].vep.transcript_consequences.sift_score,
        polyphen_canonical=canonical[ht.key].vep.transcript_consequences.polyphen_score,
    )
    # Prefer the max score across MANE Select transcripts; fall back to the max
    # across canonical transcripts when the MANE Select score is missing.
    ht = ht.select(
        sift_max=hl.or_else(hl.max(ht.sift_mane), hl.max(ht.sift_canonical)),
        polyphen_max=hl.or_else(
            hl.max(ht.polyphen_mane), hl.max(ht.polyphen_canonical)
        ),
    )
    return ht
def create_cadd_grch38_ht() -> hl.Table:
    """
    Create a Hail Table with CADD scores for GRCh38.

    The combined CADD scores in the returned table are from the following sources:

    - all SNVs: `cadd.v1.6.whole_genome_SNVs.tsv.bgz` (81G) downloaded from
      `https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz`.
      It contains 8,812,917,339 SNVs.
    - gnomAD 3.0 indels: `cadd.v1.6.gnomad.genomes.v3.0.indel.tsv.bgz` (1.1G)
      downloaded from `https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz`.
      It contains 100,546,109 indels from gnomAD v3.0.
    - gnomAD 3.1 indels: `cadd.v1.6.gnomad.genomes.v3.1.indels.new.ht` was run
      on gnomAD v3.1 with CADD v1.6 in 2020. It contains 166,122,720 new indels
      from gnomAD v3.1 compared to v3.0.
    - gnomAD 3.1 complex indels: `cadd.v1.6.gnomad.genomes.v3.1.indels.complex.ht`
      was run on gnomAD v3.1 with CADD v1.6 in 2020. It contains 2,307 complex
      variants that do not fit Hail's criteria for an indel and thus exist in
      a separate table than the gnomAD 3.1 indels.
    - gnomAD v4 exomes indels: `cadd.v1.6.gnomad.exomes.v4.0.indels.new.tsv.bgz`
      (368M) was run on gnomAD v4 with CADD v1.6 in 2023. It contains
      32,561,253 indels that are new in gnomAD v4.
    - gnomAD v4 genomes indels: `cadd.v1.6.gnomad.genomes.v4.0.indels.new.tsv.bgz`
      (13M) was run on gnomAD v4 with CADD v1.6 in 2023. It contains 904,906
      indels that are new in gnomAD v4 genomes because of the addition of
      HGDP/TGP samples.

    .. note::

        ~1.9M indels were duplicated in gnomAD v3.0 and v4.0 or in gnomAD v3.1
        and v4.0. However, CADD only generates a score per loci. We keep only
        the latest prediction, v4.0, for these loci.

    The output generated a CADD HT with 9,110,177,520 rows.

    :return: Hail Table with CADD scores for GRCh38.
    """

    def _load_cadd_raw(cadd_tsv) -> hl.Table:
        """Load raw CADD data in TSV format into a locus/alleles-keyed Hail Table."""
        column_names = {
            "f0": "chr",
            "f1": "pos",
            "f2": "ref",
            "f3": "alt",
            "f4": "RawScore",
            "f5": "PHRED",
        }
        types = {"f0": hl.tstr, "f1": hl.tint32, "f4": hl.tfloat32, "f5": hl.tfloat32}
        ht = hl.import_table(
            cadd_tsv,
            types=types,
            no_header=True,
            force_bgz=True,
            comment="#",
            min_partitions=1000,
        )
        ht = ht.rename(column_names)
        # CADD TSVs may use contigs without the "chr" prefix; add it so the
        # contig names parse against the GRCh38 reference genome.
        chr = hl.if_else(ht.chr.startswith("chr"), ht.chr, hl.format("chr%s", ht.chr))
        ht = ht.annotate(
            locus=hl.locus(chr, ht.pos, reference_genome="GRCh38"),
            alleles=hl.array([ht.ref, ht.alt]),
        )
        ht = ht.select("locus", "alleles", "RawScore", "PHRED")
        ht = ht.key_by("locus", "alleles")
        # Checkpoint so each source is imported once before the unions below.
        ht = ht.checkpoint(new_temp_file("cadd", "ht"))
        return ht

    snvs = _load_cadd_raw(
        "gs://gnomad-insilico/cadd/cadd.v1.6.whole_genome_SNVs.tsv.bgz"
    )
    indel3_0 = _load_cadd_raw(
        "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.0.indel.tsv.bgz"
    )
    indel3_1 = hl.read_table(
        "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.1.indels.new.ht"
    )
    indel3_1_complex = hl.read_table(
        "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.1.indels.complex.ht"
    )
    indel4_e = _load_cadd_raw(
        "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.exomes.v4.0.indels.new.tsv.bgz"
    )
    indel4_g = _load_cadd_raw(
        "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v4.0.indels.new.tsv.bgz"
    )
    # Merge the CADD predictions run for v3 versions.
    indel3 = indel3_0.union(indel3_1, indel3_1_complex)
    # Merge the CADD predictions run for v4 versions.
    indel4 = indel4_e.union(indel4_g).distinct()
    # Drop v3 rows that also appear in v4 so the latest (v4.0) prediction wins;
    # this avoids duplicated indels in gnomAD v3 and v4.
    indel3 = indel3.anti_join(indel4)
    ht = snvs.union(indel3, indel4)
    ht = ht.select(cadd=hl.struct(phred=ht.PHRED, raw_score=ht.RawScore))
    ht = ht.annotate_globals(cadd_version="v1.6")
    return ht
def create_spliceai_grch38_ht() -> hl.Table:
    """
    Create a Hail Table with SpliceAI scores for GRCh38.

    SpliceAI scores are from the following resources:

    - Precomputed SNVs: spliceai_scores.masked.snv.hg38.vcf.bgz,
      downloaded from https://basespace.illumina.com/s/5u6ThOblecrh
    - Precomputed indels: spliceai_scores.masked.indel.hg38.vcf.bgz,
      downloaded from https://basespace.illumina.com/s/5u6ThOblecrh
    - gnomAD v3 indels: gnomad_v3_indel.spliceai_masked.vcf.bgz,
      computed on v3.1 indels by Illumina in 2020.
    - gnomAD v4 indels: gnomad_v4_new_indels.spliceai_masked.vcf.bgz,
      computed on v4 indels that are new compared to v3 indels by Illumina in
      February 2023.
    - gnomAD v3 and v4 unscored indels:
      spliceai_scores.masked.gnomad_v3_v4_unscored_indels.hg38.vcf.bgz,
      another set of indels that were not scored in v3 or v4 but computed by
      Illumina in September 2023.

    :return: Hail Table with SpliceAI scores for GRCh38.
    """
    snvs_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.snv.hg38.vcf.bgz"
    indels_path = (
        "gs://gnomad-insilico/spliceai/spliceai_scores.masked.indel.hg38.vcf.bgz"
    )
    v3_indels_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v3_indel.hg38.vcf.bgz"
    v4_indels_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v4_new_indels.hg38.vcf.bgz"
    v3_v4_unscored_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v3_v4_unscored_indels.hg38.vcf.bgz"
    header_file_path = "gs://gnomad-insilico/spliceai/spliceai.vcf.header"

    def _import_spliceai_vcf(path: str) -> hl.Table:
        """
        Import a SpliceAI VCF into a Hail Table.

        :param path: Path to SpliceAI VCF.
        :return: Hail Table with SpliceAI scores.
        """
        return hl.import_vcf(
            path,
            force_bgz=True,
            header_file=header_file_path,
            reference_genome="GRCh38",
            contig_recoding=NO_CHR_TO_CHR_CONTIG_RECODING,
            skip_invalid_loci=True,
            min_partitions=1000,
        ).rows()

    logger.info("Importing vcf of SpliceAI scores into HT...")
    spliceai_snvs = _import_spliceai_vcf(snvs_path)
    spliceai_indels = _import_spliceai_vcf(indels_path)
    v3_indels = _import_spliceai_vcf(v3_indels_path)
    v4_indels = _import_spliceai_vcf(v4_indels_path)
    v3_v4_unscored_indels = _import_spliceai_vcf(v3_v4_unscored_path)
    ht = spliceai_snvs.union(
        spliceai_indels, v3_indels, v4_indels, v3_v4_unscored_indels
    )
    # `explode` because some variants fall on multiple genes and have a score
    # per gene. All rows without a SpliceAI score (an empty array) are removed
    # by the explode.
    logger.info("Exploding SpliceAI scores...")
    ht = ht.explode(ht.info.SpliceAI)
    # The per-gene annotation is a "|"-delimited string; fields 2-5 are the
    # delta scores for the 4 splicing consequences: DS_AG|DS_AL|DS_DG|DS_DL.
    logger.info("Annotating SpliceAI scores...")
    delta_scores = ht.info.SpliceAI.split(delim="\\|")[2:6]
    ht = ht.annotate(delta_scores=delta_scores.map(hl.float32))
    logger.info(
        "Getting the max SpliceAI score across consequences for each variant per"
        " gene..."
    )
    ht = ht.select(ds_max=hl.max(ht.delta_scores))
    logger.info("Getting the max SpliceAI score for each variant across genes...")
    ht = ht.collect_by_key()
    ht = ht.select(spliceai_ds_max=hl.max(ht.values.ds_max))
    ht = ht.annotate_globals(spliceai_version="v1.3")
    return ht
def create_pangolin_grch38_ht() -> hl.Table:
    """
    Create a Hail Table with Pangolin score for splicing for GRCh38.

    .. note::

        The score was based on the splicing prediction tool Pangolin:
        Zeng, T., Li, Y.I. Predicting RNA splicing from DNA sequence using
        Pangolin. Genome Biol 23, 103 (2022).
        https://doi.org/10.1186/s13059-022-02664-4

        There's no precomputed score for all possible variants, the scores were
        generated for gnomAD v4 genomes (=v3 genomes) and v4 exomes variants in
        gene body only with code from developers at Invitae:
        https://github.com/invitae/pangolin. All v4 genomes variants (except ~20M
        bug-affected and ~3M new variants from HGDP/TGP samples, noted below)
        were run on Pangolin v1.3.12, the others were run on Pangolin v1.4.4.

    :return: Hail Table with Pangolin score for splicing for GRCh38.
    """
    v4_genomes = (
        "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.pangolin.vcf.bgz/*.gz"
    )
    # ~3 million new variants in gnomAD v4 genomes from the addition of HGDP/TGP
    # samples; they were run with the fixed code.
    v4_genomes_new = "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.new_variants.pangolin.vcf.bgz/*.gz"
    v4_exomes = "gs://gnomad-insilico/pangolin/gnomad.v4.0.exomes.pangolin.vcf.bgz/*.gz"
    # There was a bug in original Pangolin code, that would generate incorrect
    # scores when a variant falls on multiple genes on the same strand. This bug
    # impacted v4 genomes but was fixed before running v4 exomes; the affected
    # ~20M v4 genome variants were rerun.
    v4_bugfix = (
        "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.bugfix.pangolin.vcf.bgz/*.gz"
    )
    header_file_path = "gs://gnomad-insilico/pangolin/pangolin.vcf.header"

    def _import_pangolin_vcf(vcf_path: str) -> hl.Table:
        """
        Import a Pangolin VCF to a Hail Table.

        :param vcf_path: Path to Pangolin VCF.
        :return: Hail Table with Pangolin scores.
        """
        ht = hl.import_vcf(
            vcf_path,
            min_partitions=1000,
            force_bgz=True,
            reference_genome="GRCh38",
            skip_invalid_loci=True,
            header_file=header_file_path,
        ).rows()
        return ht.checkpoint(hl.utils.new_temp_file("pangolin", "ht"))

    ht_g = _import_pangolin_vcf(v4_genomes)
    ht_g_new = _import_pangolin_vcf(v4_genomes_new)
    ht_e = _import_pangolin_vcf(v4_exomes)
    ht_bugfix = _import_pangolin_vcf(v4_bugfix)
    # Replace the bug-affected genome variants with their rerun (bugfix) scores.
    ht_g_correct = ht_g.anti_join(ht_bugfix)
    ht_g = ht_g_correct.union(ht_bugfix)
    logger.info(
        f"Number of rows in genomes Pangolin HT: {ht_g.count()};\n Number of rows in"
        f" raw exomes Pangolin HT: {ht_e.count()};\n Number of rows in Pangolin HT for"
        f" new variants of genomes: {ht_g_new.count()}.\n"
    )
    ht = ht_g.union(ht_e, ht_g_new)

    logger.info("Exploding Pangolin scores...")
    # `explode` will eliminate rows with an empty array.
    # The Pangolin annotation is imported as an array of strings containing
    # one element with the following format:
    # gene1|pos_splice_gain:largest_increase|pos_splice_loss:largest_decrease|Warnings:||gene2...
    # for example:
    # Pangolin=ENSG00000121005.9|-86:0.25|38:-0.49|Warnings:||ENSG00000254238.1|-40:0.01|30:-0.17|Warnings:
    ht = ht.annotate(pangolin=ht.info.Pangolin[0].split(delim=r"\|\|"))
    ht = ht.explode(ht.pangolin)
    logger.info("Number of rows in exploded Pangolin Hail Table: %s", ht.count())

    logger.info("Annotating Pangolin scores...")
    # The Pangolin score is the delta score of splice gain and splice loss.
    # Splitting the per-gene string on ":" or "|" puts those values at indices
    # 2 and 4 (the original comment saying "second and fourth" was off by one).
    ht = ht.transmute(
        pangolin=hl.rbind(
            ht.pangolin.split(delim=r":|\|"),
            lambda fields: hl.struct(
                delta_scores=hl.array([hl.float(fields[2]), hl.float(fields[4])])
            ),
        )
    )
    # Using > instead of >= in case where delta score of splice gain and splice
    # loss are the same but different sign, we will keep the positive score of
    # splice gain.
    ht = ht.annotate(
        largest_ds_gene=hl.if_else(
            hl.abs(hl.min(ht.pangolin.delta_scores))
            > hl.abs(hl.max(ht.pangolin.delta_scores)),
            hl.min(ht.pangolin.delta_scores),
            hl.max(ht.pangolin.delta_scores),
        )
    )
    ht = ht.select(ht.largest_ds_gene)
    ht = ht.collect_by_key()
    logger.info(
        "Number of rows in Pangolin Hail Table after collect_by_key: %s", ht.count()
    )
    ht = ht.select(
        pangolin_largest_ds=hl.if_else(
            hl.abs(hl.min(ht.values.largest_ds_gene))
            > hl.abs(hl.max(ht.values.largest_ds_gene)),
            hl.min(ht.values.largest_ds_gene),
            hl.max(ht.values.largest_ds_gene),
        )
    )
    # TODO: get the version for genomes run in May 2023.
    ht = ht.annotate_globals(pangolin_version=["v1.3.12", "v1.4.4"])
    # Fix: the original passed unevaluated `hl.agg.count_where` expressions
    # straight to logger.info; they must be evaluated with `ht.aggregate` to
    # log actual counts.
    counts = ht.aggregate(
        hl.struct(
            gain=hl.agg.count_where(ht.pangolin_largest_ds > 0),
            loss=hl.agg.count_where(ht.pangolin_largest_ds < 0),
            no_consequence=hl.agg.count_where(ht.pangolin_largest_ds == 0),
        )
    )
    logger.info(
        "\nNumber of variants indicating splice gain: %s;\n"
        "Number of variants indicating splice loss: %s; \n"
        "Number of variants with no splicing consequence: %s \n",
        counts.gain,
        counts.loss,
        counts.no_consequence,
    )
    return ht
def create_revel_grch38_ht() -> hl.Table:
    """
    Create a Hail Table with REVEL scores for GRCh38.

    .. note::

        Starting with gnomAD v4, we use REVEL scores for only MANE Select and
        canonical transcripts. Even when a variant falls on multiple
        MANE/canonical transcripts of different genes, the scores are equal.

    REVEL scores were downloaded from:
    https://rothsj06.dmz.hpc.mssm.edu/revel-v1.3_all_chromosomes.zip
    size ~648M, ~82,100,677 variants

    REVEL's Ensembl ID is not from Ensembl 105, so we filter to transcripts
    that are in Ensembl 105. The Ensembl 105 ID file was downloaded from the
    Ensembl 105 archive. It contains the following columns:

    - Transcript stable ID
    - Ensembl Canonical
    - MANE Select

    This deprecates the `has_duplicate` field present in gnomAD v3.1.

    :return: Hail Table with REVEL scores for GRCh38.
    """
    revel_csv = "gs://gnomad-insilico/revel/revel-v1.3_all_chromosomes_with_transcript_ids.csv.bgz"
    ensembl_path = "gs://gnomad-insilico/ensembl/ensembl105id.grch38.tsv.bgz"
    ht = hl.import_table(
        revel_csv,
        delimiter=",",
        min_partitions=1000,
        types={"grch38_pos": hl.tstr, "REVEL": hl.tfloat64},
    )
    logger.info("Annotating REVEL table...")
    ht = ht.drop("hg19_pos", "aaref", "aaalt")
    # Drop variants that have no position in GRCh38 when lifted over from GRCh37.
    ht = ht.filter(ht.grch38_pos.contains("."), keep=False)
    ht = ht.transmute(chr="chr" + ht.chr)
    ht = ht.select(
        locus=hl.locus(ht.chr, hl.int(ht.grch38_pos), reference_genome="GRCh38"),
        alleles=hl.array([ht.ref, ht.alt]),
        REVEL=ht.REVEL,
        # The transcript ID column is a ";"-delimited list of transcripts.
        Transcript_stable_ID=ht.Ensembl_transcriptid.strip().split(";"),
    )
    ht = ht.explode("Transcript_stable_ID")
    ht = ht.key_by("Transcript_stable_ID")
    logger.info("Number of rows in REVEL table: %s", ht.count())

    logger.info("Import and process the ensembl ID file...")
    ensembl_ids_ht = hl.import_table(
        ensembl_path, min_partitions=200, impute=True, key="Transcript_stable_ID"
    )
    ensembl_ids_ht = ensembl_ids_ht.select("Ensembl_Canonical", "MANE_Select")
    logger.info("Annotating REVEL HT with canonical and MANE Select transcripts...")
    ht = ht.annotate(**ensembl_ids_ht[ht.key])
    logger.info(
        "Annotating REVEL scores for MANE Select transcripts and canonical"
        " transcripts..."
    )
    ht = ht.key_by("locus", "alleles")
    ht = ht.annotate(
        revel_mane=hl.or_missing(ht.MANE_Select != "", ht.REVEL),
        revel_canonical=hl.or_missing(ht.Ensembl_Canonical == "1", ht.REVEL),
    )
    ht = ht.checkpoint(
        "gs://gnomad-tmp-4day/revel-v1.3_in_Ensembl105.ht", overwrite=True
    )
    # Since the REVEL score for each variant is transcript-specific, we
    # prioritize the scores predicted on MANE Select and canonical transcripts,
    # and take the max if a variant falls on multiple MANE Select or canonical
    # transcripts. Normally, the score should be equal across MANE Select
    # and canonical.
    logger.info("Taking max REVEL scores for MANE Select transcripts...")
    mane_ht = ht.filter(hl.is_defined(ht.revel_mane), keep=True)
    max_revel_mane = mane_ht.group_by(*mane_ht.key).aggregate(
        revel_mane_max=hl.agg.max(mane_ht.revel_mane),
    )
    logger.info("Taking max REVEL scores for canonical transcripts...")
    # Canonical scores are only used where no MANE Select score exists.
    canonical_ht = ht.filter(
        (~hl.is_defined(ht.revel_mane)) & (hl.is_defined(ht.revel_canonical)),
        keep=True,
    )
    max_revel_canonical = canonical_ht.group_by(*canonical_ht.key).aggregate(
        revel_canonical_max=hl.agg.max(canonical_ht.revel_canonical),
    )
    logger.info(
        "Merge max REVEL scores for MANE Select transcripts and canonical transcripts"
        " to one HT..."
    )
    final_ht = max_revel_mane.join(max_revel_canonical, how="outer")
    final_ht = final_ht.select(
        revel_max=hl.or_else(final_ht.revel_mane_max, final_ht.revel_canonical_max)
    )
    logger.info("Number of rows in final REVEL HT: %s", final_ht.count())
    final_ht = final_ht.annotate_globals(revel_version="v1.3")
    return final_ht
def create_phylop_grch38_ht() -> hl.Table:
    """
    Convert PhyloP scores to a Hail Table.

    The BigWig format of PhyloP was downloaded from:
    https://cgl.gi.ucsc.edu/data/cactus/241-mammalian-2020v2-hub/Homo_sapiens/241-mammalian-2020v2.bigWig
    and converted to bedGraph format with bigWigToBedGraph from the kent packages
    of UCSC (https://hgdownload.cse.ucsc.edu/admin/exe/) with the following command:
    `./bigWigToBedGraph ~/Downloads/241-mammalian-2020v2.bigWig ~/Downloads/241-mammalian-2020v2.bedGraph`

    The bedGraph file is bgzipped before importing to Hail.

    .. note::

        Different to other in silico predictors, the PhyloP HT is keyed by locus
        only. Since the PhyloP scores have one value per position, we exploded
        the interval to store the HT by locus. As a result, we have PhyloP
        scores for 2,852,623,265 loci from 2,648,607,958 intervals.

    :return: Hail Table with PhyloP scores for GRCh38.
    """
    bg_path = "gs://gnomad-insilico/phylop/Human-GRCh38-Phylop-241-mammalian-2020v2.bedGraph.bgz"
    columns = ["chr", "start", "end", "phylop"]
    ht = hl.import_table(
        bg_path,
        min_partitions=1000,
        impute=True,
        no_header=True,
    ).rename({f"f{i}": c for i, c in enumerate(columns)})
    # We add 1 to both start and end because the input bedGraph is a 0-indexed,
    # end-exclusive interval, while hl.locus positions are 1-based.
    ht = ht.annotate(pos=hl.range(ht.start + 1, ht.end + 1))
    ht = ht.explode("pos")
    ht = ht.annotate(locus=hl.locus(ht.chr, ht.pos, reference_genome="GRCh38"))
    ht = ht.select("locus", "phylop")
    ht = ht.key_by("locus")
    ht = ht.annotate_globals(phylop_version="v2")
    return ht
def get_revel_for_unmatched_transcripts() -> None:
    """
    Create Tables with alternative REVEL scores for variants in v4.1 release.

    .. note::

        REVEL was computed using transcripts from Ensembl v64. In the gnomAD
        v4.0 and v4.1 release Tables, transcript information from Ensembl v105
        and variant information (locus and alleles combination) were used to
        ascertain variant REVEL scores for MANE select or canonical transcripts
        only. This means that variants within 2,414 MANE select transcripts in
        gnomAD v4.0 and v4.1 are missing REVEL scores because they are not
        present in Ensembl v64. To address this, we annotated the variants
        within the 2,414 genes with the maximum REVEL score found at the
        specific locus and allele, rather than the score for the MANE Select
        transcript.

        The exomes TSV adds REVEL scores to 1,936,321 out of 2,284,296 (87.77%)
        missense variants within the 2,414 genes. The genomes TSV adds REVEL
        scores to 528,204 out of 620,799 (85.08%) missense variants within the
        2,414 genes.
    """

    def _process_revel():
        """Import REVEL scores and reduce to the max score per locus/alleles."""
        revel_csv = "gs://gnomad-insilico/revel/revel-v1.3_all_chromosomes_with_transcript_ids.csv.bgz"
        ht = hl.import_table(
            revel_csv,
            delimiter=",",
            min_partitions=1000,
            types={"grch38_pos": hl.tstr, "REVEL": hl.tfloat64},
        )
        # Drop variants that have no position in GRCh38 when lifted over from
        # GRCh37.
        ht = ht.filter(ht.grch38_pos.contains("."), keep=False)
        ht = ht.select(
            locus=hl.locus(
                "chr" + ht.chr, hl.int(ht.grch38_pos), reference_genome="GRCh38"
            ),
            alleles=hl.array([ht.ref, ht.alt]),
            REVEL=ht.REVEL,
            Transcript_stable_ID=ht.Ensembl_transcriptid.strip().split(";"),
        )
        ht = ht.key_by("locus", "alleles")
        ht = ht.group_by("locus", "alleles").aggregate(REVEL_max=hl.agg.max(ht.REVEL))
        return ht

    def _filter_release_ht(data_type, genes_list):
        """Filter release sites to missense variants in MANE Select transcripts of the given genes."""
        ht = release_sites(data_type=data_type).ht()
        ht = process_consequences(ht, has_polyphen=False)
        ht = filter_vep_transcript_csqs(
            ht,
            synonymous=False,
            mane_select=True,
            genes=genes_list,
            csqs=["missense_variant"],
        )
        ht = ht.select(
            gene_id=ht.vep.transcript_consequences.gene_id,
            most_severe_consequence=ht.vep.transcript_consequences.most_severe_consequence,
        )
        return ht

    # Get the max REVEL score for each variant.
    revel = _process_revel()
    revel = revel.checkpoint(hl.utils.new_temp_file("revel_tmp", "ht"))
    # Get genes missing REVEL scores.
    genes = hl.import_table(
        "gs://gnomad-insilico/revel/Ensembl105MANE_without_REVEL.txt",
        impute=True,
        comment="#",
    )
    genes_list = genes.Gene_stable_ID.collect()
    for data_type in ["exomes", "genomes"]:
        # Filter release sites to only include genes with missing REVEL scores.
        ht = _filter_release_ht(data_type, genes_list)
        ht = ht.checkpoint(hl.utils.new_temp_file(f"{data_type}_tmp_filtered", "ht"))
        # Join REVEL scores with release sites.
        ht = ht.annotate(REVEL_max=revel[ht.key].REVEL_max)
        # Filter out variants without a REVEL score.
        ht = ht.filter(hl.is_defined(ht.REVEL_max))
        ht.export(
            "gs://gnomad-insilico/revel/gnomad.v4.1."
            f"{data_type}.revel_unmatched_transcripts.tsv.bgz"
        )
def main(args):
    """Generate Hail Tables with in silico predictors."""
    hl.init(
        default_reference="GRCh38",
        log="/insilico_predictors.log",
        tmp_dir="gs://gnomad-tmp-4day",
    )
    if args.cadd:
        logger.info("Creating CADD Hail Table for GRCh38...")
        ht = create_cadd_grch38_ht()
        ht.write(
            get_insilico_predictors(predictor="cadd").path,
            overwrite=args.overwrite,
        )
        logger.info("CADD Hail Table for GRCh38 created.")
    if args.spliceai:
        logger.info("Creating SpliceAI Hail Table for GRCh38...")
        ht = create_spliceai_grch38_ht()
        ht.write(
            get_insilico_predictors(predictor="spliceai").path,
            overwrite=args.overwrite,
        )
        logger.info("SpliceAI Hail Table for GRCh38 created.")
    if args.pangolin:
        logger.info("Creating Pangolin Hail Table for GRCh38...")
        ht = create_pangolin_grch38_ht()
        ht.write(
            get_insilico_predictors(predictor="pangolin").path,
            overwrite=args.overwrite,
        )
        logger.info("Pangolin Hail Table for GRCh38 created.")
    if args.revel:
        logger.info("Creating REVEL Hail Table for GRCh38...")
        ht = create_revel_grch38_ht()
        ht.write(
            get_insilico_predictors(predictor="revel").path,
            overwrite=args.overwrite,
        )
        logger.info("REVEL Hail Table for GRCh38 created.")
    if args.revel_unmatched_transcripts:
        logger.info(
            "Get REVEL score for variants in missing MANE transcripts in v4.1"
            " release..."
        )
        get_revel_for_unmatched_transcripts()
    if args.phylop:
        logger.info("Creating PhyloP Hail Table for GRCh38...")
        ht = create_phylop_grch38_ht()
        ht.write(
            get_insilico_predictors(predictor="phylop").path,
            overwrite=args.overwrite,
        )
        # Completion log added for consistency with the other predictors.
        logger.info("PhyloP Hail Table for GRCh38 created.")
def get_script_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--slack-channel", help="Slack channel to post results and notifications to."
    )
    # Simple boolean flags: (flag, help text) pairs registered in one loop.
    simple_flags = (
        ("--overwrite", "Overwrite data"),
        ("--cadd", "Create CADD HT"),
        ("--spliceai", "Create SpliceAI HT"),
        ("--pangolin", "Create Pangolin HT"),
        ("--revel", "Create REVEL HT."),
        ("--phylop", "Create PhyloP HT."),
    )
    for flag, help_text in simple_flags:
        parser.add_argument(flag, help=help_text, action="store_true")
    parser.add_argument(
        "--revel-unmatched-transcripts",
        help=(
            "Get alternative REVEL score for variants "
            "in MANE transcripts in v4.1 release."
        ),
        action="store_true",
    )
    return parser
# Script entry point: parse CLI flags and optionally wrap the run in Slack
# notifications so start/failure messages post to the given channel.
if __name__ == "__main__":
    parser = get_script_argument_parser()
    args = parser.parse_args()
    if args.slack_channel:
        with slack_notifications(slack_token, args.slack_channel):
            main(args)
    else:
        main(args)