Source code for gnomad_qc.v4.annotations.insilico_predictors

"""Script to generate Hail Tables with in silico predictors."""

import argparse
import logging

import hail as hl
from gnomad.resources.resource_utils import NO_CHR_TO_CHR_CONTIG_RECODING
from gnomad.utils.slack import slack_notifications
from gnomad.utils.vep import filter_vep_transcript_csqs, process_consequences
from hail.utils import new_temp_file

from gnomad_qc.slack_creds import slack_token
from gnomad_qc.v4.resources.annotations import get_insilico_predictors
from gnomad_qc.v4.resources.release import release_sites

logging.basicConfig(
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


[docs]def get_sift_polyphen_from_vep(ht: hl.Table) -> hl.Table: """ Get the max SIFT and PolyPhen scores from VEP 105 annotations. This retrieves the max of SIFT and PolyPhen scores for a variant's MANE Select transcript or, if MANE Select does not exist, canonical transcript. :param ht: VEP 105 annotated Hail Table. :return: Table annotated with max SIFT and PolyPhen scores. """ mane = filter_vep_transcript_csqs( ht, synonymous=False, canonical=False, mane_select=True ) canonical = filter_vep_transcript_csqs(ht, synonymous=False, canonical=True) ht = ht.annotate( sift_mane=mane[ht.key].vep.transcript_consequences.sift_score, polyphen_mane=mane[ht.key].vep.transcript_consequences.polyphen_score, sift_canonical=canonical[ht.key].vep.transcript_consequences.sift_score, polyphen_canonical=canonical[ht.key].vep.transcript_consequences.polyphen_score, ) ht = ht.select( sift_max=hl.or_else(hl.max(ht.sift_mane), hl.max(ht.sift_canonical)), polyphen_max=hl.or_else( hl.max(ht.polyphen_mane), hl.max(ht.polyphen_canonical) ), ) return ht
[docs]def create_cadd_grch38_ht() -> hl.Table: """ Create a Hail Table with CADD scores for GRCh38. The combined CADD scores in the returned table are from the following sources: - all SNVs: `cadd.v1.6.whole_genome_SNVs.tsv.bgz` (81G) downloaded from `https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz`. It contains 8,812,917,339 SNVs. - gnomad 3.0 indels: `cadd.v1.6.gnomad.genomes.v3.0.indel.tsv.bgz` (1.1G) downloaded from `https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/gnomad.genomes.r3.0.indel.tsv.gz`. It contains 100,546,109 indels from gnomaD v3.0. - gnomad 3.1 indels: `cadd.v1.6.gnomad.genomes.v3.1.indels.new.ht` was run on gnomAD v3.1 with CADD v1.6 in 2020. It contains 166,122,720 new indels from gnomAD v3.1 compared to v3.0. - gnomad 3.1 complex indels: `cadd.v1.6.gnomad.genomes.v3.1.indels.complex.ht` was run on gnomAD v3.1 with CADD v1.6 in 2020. It contains 2,307 complex variants that do not fit Hail's criteria for an indel and thus exist in a separate table than the gnomad 3.1 indels. - gnomAD v4 exomes indels: `cadd.v1.6.gnomad.exomes.v4.0.indels.new.tsv.bgz` (368M) was run on gnomAD v4 with CADD v1.6 in 2023. It contains 32,561, 253 indels that are new in gnomAD v4. - gnomAD v4 genomes indels: `cadd.v1.6.gnomad.genomes.v4.0.indels.new.tsv.bgz` (13M) was run on gnomAD v4 with CADD v1.6 in 2023. It contains 904,906 indels that are new in gnomAD v4 genomes because of the addition of HGDP/TGP samples. .. note:: ~1,9M indels were duplicated in gnomAD v3.0 and v4.0 or in gnomAD v3.1 and v4.0. However, CADD only generates a score per loci. We keep only the latest prediction, v4.0, for these loci. The output generated a CADD HT with 9,110,177,520 rows. :return: Hail Table with CADD scores for GRCh38. """ def _load_cadd_raw(cadd_tsv) -> hl.Table: """Functions to load CADD raw data in TSV format to Hail Table.""" column_names = { "f0": "chr", "f1": "pos", "f2": "ref", "f3": "alt", "f4": "RawScore", "f5": "PHRED", } types = {"f0": hl.tstr, "f1": hl.tint32, "f4": hl.tfloat32, "f5": hl.tfloat32} ht = hl.import_table( cadd_tsv, types=types, no_header=True, force_bgz=True, comment="#", min_partitions=1000, ) ht = ht.rename(column_names) chr = hl.if_else(ht.chr.startswith("chr"), ht.chr, hl.format("chr%s", ht.chr)) ht = ht.annotate( locus=hl.locus(chr, ht.pos, reference_genome="GRCh38"), alleles=hl.array([ht.ref, ht.alt]), ) ht = ht.select("locus", "alleles", "RawScore", "PHRED") ht = ht.key_by("locus", "alleles") ht = ht.checkpoint(new_temp_file("cadd", "ht")) return ht snvs = _load_cadd_raw( "gs://gnomad-insilico/cadd/cadd.v1.6.whole_genome_SNVs.tsv.bgz" ) indel3_0 = _load_cadd_raw( "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.0.indel.tsv.bgz" ) indel3_1 = hl.read_table( "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.1.indels.new.ht" ) indel3_1_complex = hl.read_table( "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v3.1.indels.complex.ht" ) indel4_e = _load_cadd_raw( "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.exomes.v4.0.indels.new.tsv.bgz" ) indel4_g = _load_cadd_raw( "gs://gnomad-insilico/cadd/cadd.v1.6.gnomad.genomes.v4.0.indels.new.tsv.bgz" ) # Merge the CADD predictions run for v3 versions. indel3 = indel3_0.union(indel3_1, indel3_1_complex) # Merge the CADD predictions run for v4 versions. indel4 = indel4_e.union(indel4_g).distinct() # This will avoid duplicated indels in gnomAD v3 and v4. indel3 = indel3.anti_join(indel4) ht = snvs.union(indel3, indel4) ht = ht.select(cadd=hl.struct(phred=ht.PHRED, raw_score=ht.RawScore)) ht = ht.annotate_globals(cadd_version="v1.6") return ht
[docs]def create_spliceai_grch38_ht() -> hl.Table: """ Create a Hail Table with SpliceAI scores for GRCh38. SpliceAI scores are from the following resources: - Precomputed SNVs: spliceai_scores.masked.snv.hg38.vcf.bgz, downloaded from https://basespace.illumina.com/s/5u6ThOblecrh - Precomputed indels: spliceai_scores.masked.indel.hg38.vcf.bgz, downloaded from https://basespace.illumina.com/s/5u6ThOblecrh - gnomAD v3 indels: gnomad_v3_indel.spliceai_masked.vcf.bgz, computed on v3.1 indels by Illumina in 2020. - gnomAD v4 indels: gnomad_v4_new_indels.spliceai_masked.vcf.bgz, computed on v4 indels that are new compared to v3 indels by Illumina in February 2023. - gnomAD v3 and v4 unscored indels: spliceai_scores.masked.gnomad_v3_v4_unscored_indels.hg38.vcf.bgz, another set of indels were not scored in v3 or v4 but computed by Illumina in September 2023. :return: Hail Table with SpliceAI scores for GRCh38. """ snvs_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.snv.hg38.vcf.bgz" indels_path = ( "gs://gnomad-insilico/spliceai/spliceai_scores.masked.indel.hg38.vcf.bgz" ) v3_indels_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v3_indel.hg38.vcf.bgz" v4_indels_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v4_new_indels.hg38.vcf.bgz" v3_v4_unscored_path = "gs://gnomad-insilico/spliceai/spliceai_scores.masked.gnomad_v3_v4_unscored_indels.hg38.vcf.bgz" header_file_path = "gs://gnomad-insilico/spliceai/spliceai.vcf.header" def import_spliceai_vcf(path: str) -> hl.Table: """ Import SpliceAI VCF into Hail Table. :param str path: Path to SpliceAI VCF. :return: Hail MatrixTable with SpliceAI scores. """ ht = hl.import_vcf( path, force_bgz=True, header_file=header_file_path, reference_genome="GRCh38", contig_recoding=NO_CHR_TO_CHR_CONTIG_RECODING, skip_invalid_loci=True, min_partitions=1000, ).rows() return ht logger.info("Importing vcf of SpliceAI scores into HT...") spliceai_snvs = import_spliceai_vcf(snvs_path) spliceai_indels = import_spliceai_vcf(indels_path) v3_indels = import_spliceai_vcf(v3_indels_path) v4_indels = import_spliceai_vcf(v4_indels_path) v3_v4_unscored_indels = import_spliceai_vcf(v3_v4_unscored_path) ht = spliceai_snvs.union( spliceai_indels, v3_indels, v4_indels, v3_v4_unscored_indels ) # `explode` because some variants fall on multiple genes and have a score per gene. # All rows without a SpliceAI score, an empty array, are removed through explode. logger.info("Exploding SpliceAI scores...") ht = ht.explode(ht.info.SpliceAI) # delta_score array for 4 splicing consequences: DS_AG|DS_AL|DS_DG|DS_DL logger.info("Annotating SpliceAI scores...") delta_scores = ht.info.SpliceAI.split(delim="\\|")[2:6] ht = ht.annotate(delta_scores=hl.map(lambda x: hl.float32(x), delta_scores)) logger.info( "Getting the max SpliceAI score across consequences for each variant per" " gene..." ) ht = ht.select(ds_max=hl.max(ht.delta_scores)) logger.info("Getting the max SpliceAI score for each variant across genes...") ht = ht.collect_by_key() ht = ht.select(spliceai_ds_max=hl.max(ht.values.ds_max)) ht = ht.annotate_globals(spliceai_version="v1.3") return ht
[docs]def create_pangolin_grch38_ht() -> hl.Table: """ Create a Hail Table with Pangolin score for splicing for GRCh38. .. note:: The score was based on the splicing prediction tool Pangolin: Zeng, T., Li, Y.I. Predicting RNA splicing from DNA sequence using Pangolin. Genome Biol 23, 103 (2022). https://doi.org/10.1186/s13059-022-02664-4 There's no precomputed score for all possible variants, the scores were generated for gnomAD v4 genomes (=v3 genomes) and v4 exomes variants in gene body only with code from developers at Invitae: https://github.com/invitae/pangolin. All v4 genomes variants (except ~20M bug-affected and ~3M new variants from HGDP/TGP samples, noted below) were run on Pangolin v1.3.12, the others were run on Pangolin v1.4.4. :return: Hail Table with Pangolin score for splicing for GRCh38. """ v4_genomes = ( "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.pangolin.vcf.bgz/*.gz" ) # ~3 million new variants in gnomAD v4 genomes from the addition of HGDP/TGP # samples, they were run with the fixed code. v4_genomes_new = "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.new_variants.pangolin.vcf.bgz/*.gz" v4_exomes = "gs://gnomad-insilico/pangolin/gnomad.v4.0.exomes.pangolin.vcf.bgz/*.gz" # There was a bug in original Pangolin code, that would generate incorrect # scores when a variant falls on multiple genes on the same strand. This bug # impacted v4 genomes but was fixed before running v4 exomes, the affected # ~20M v4 genome variants were rerun. v4_bugfix = ( "gs://gnomad-insilico/pangolin/gnomad.v4.0.genomes.bugfix.pangolin.vcf.bgz/*.gz" ) header_file_path = "gs://gnomad-insilico/pangolin/pangolin.vcf.header" def import_pangolin_vcf(vcf_path: str) -> hl.Table: """ Import Pangolin VCF to Hail Table. :param vcf_path: Path to Pangolin VCF. :return: Hail Table with Pangolin scores. """ ht = hl.import_vcf( vcf_path, min_partitions=1000, force_bgz=True, reference_genome="GRCh38", skip_invalid_loci=True, header_file=header_file_path, ).rows() ht = ht.checkpoint(hl.utils.new_temp_file("pangolin", "ht")) return ht ht_g = import_pangolin_vcf(v4_genomes) ht_g_new = import_pangolin_vcf(v4_genomes_new) ht_e = import_pangolin_vcf(v4_exomes) ht_bugfix = import_pangolin_vcf(v4_bugfix) ht_g_correct = ht_g.anti_join(ht_bugfix) ht_g = ht_g_correct.union(ht_bugfix) logger.info( f"Number of rows in genomes Pangolin HT: {ht_g.count()};\n Number of rows in" f" raw exomes Pangolin HT: {ht_e.count()};\n Number of rows in Pangolin HT for" f" new variants of genomes: {ht_g_new.count()}.\n" ) ht = ht_g.union(ht_e, ht_g_new) logger.info("Exploding Pangolin scores...") # `explode` will eliminate rows with empty array # The Pangolin annotation is imported as an array of strings containing # one element with the following format: # gene1|pos_splice_gain:largest_increase|pos_splice_loss:largest_decrease|Warnings:||gene2... # for example: # Pangolin=ENSG00000121005.9|-86:0.25|38:-0.49|Warnings:||ENSG00000254238.1|-40:0.01|30:-0.17|Warnings: ht = ht.annotate(pangolin=ht.info.Pangolin[0].split(delim="\|\|")) ht = ht.explode(ht.pangolin) logger.info("Number of rows in exploded Pangolin Hail Table: %s", ht.count()) logger.info("Annotating Pangolin scores...") # The Pangolin score is the delta score of splice gain and splice loss, # which is the second and fourth element in the array after splitting. ht = ht.transmute( pangolin=hl.struct( delta_scores=( hl.empty_array(hl.tfloat64).append( hl.float(ht.pangolin.split(delim=":|\\|")[2]) ) ).append(hl.float(ht.pangolin.split(delim=":|\\|")[4])), ) ) # Using > instead of >= in case where delta score of splice gain and splice # loss are the same but different sign, we will keep the positive score of # splice gain ht = ht.annotate( largest_ds_gene=hl.if_else( hl.abs(hl.min(ht.pangolin.delta_scores)) > hl.abs(hl.max(ht.pangolin.delta_scores)), hl.min(ht.pangolin.delta_scores), hl.max(ht.pangolin.delta_scores), ) ) ht = ht.select(ht.largest_ds_gene) ht = ht.collect_by_key() logger.info( "Number of rows in Pangolin Hail Table after collect_by_key: %s", ht.count() ) ht = ht.select( pangolin_largest_ds=hl.if_else( hl.abs(hl.min(ht.values.largest_ds_gene)) > hl.abs(hl.max(ht.values.largest_ds_gene)), hl.min(ht.values.largest_ds_gene), hl.max(ht.values.largest_ds_gene), ) ) # TODO: get the version for genomes run in May 2023. ht = ht.annotate_globals(pangolin_version=["v1.3.12", "v1.4.4"]) logger.info( "\nNumber of variants indicating splice gain: %s;\n" "Number of variants indicating splice loss: %s; \n" "Number of variants with no splicing consequence: %s \n", hl.agg.count_where(ht.pangolin_largest_ds > 0), hl.agg.count_where(ht.pangolin_largest_ds < 0), hl.agg.count_where(ht.pangolin_largest_ds == 0), ) return ht
[docs]def create_revel_grch38_ht() -> hl.Table: """ Create a Hail Table with REVEL scores for GRCh38. .. note:: Starting with gnomAD v4, we use REVEL scores for only MANE Select and canonical transcripts. Even when a variant falls on multiple MANE/canonical transcripts of different genes, the scores are equal. REVEL scores were downloaded from: https://rothsj06.dmz.hpc.mssm.edu/revel-v1.3_all_chromosomes.zip size ~648M, ~82,100,677 variants REVEL's Ensembl ID is not from Ensembl 105, so we filter to transcripts that are in Ensembl 105. The Ensembl 105 ID file was downloaded from Ensembl 105 archive. It contains the following columns: - Transcript stable ID - Ensembl Canonical - MANE Select This deprecates the `has_duplicate` field present in gnomAD v3.1. :return: Hail Table with REVEL scores for GRCh38. """ revel_csv = "gs://gnomad-insilico/revel/revel-v1.3_all_chromosomes_with_transcript_ids.csv.bgz" ensembl_path = "gs://gnomad-insilico/ensembl/ensembl105id.grch38.tsv.bgz" ht = hl.import_table( revel_csv, delimiter=",", min_partitions=1000, types={"grch38_pos": hl.tstr, "REVEL": hl.tfloat64}, ) logger.info("Annotating REVEL table...") ht = ht.drop("hg19_pos", "aaref", "aaalt") # drop variants that have no position in GRCh38 when lifted over from GRCh37 ht = ht.filter(ht.grch38_pos.contains("."), keep=False) ht = ht.transmute(chr="chr" + ht.chr) ht = ht.select( locus=hl.locus(ht.chr, hl.int(ht.grch38_pos), reference_genome="GRCh38"), alleles=hl.array([ht.ref, ht.alt]), REVEL=ht.REVEL, Transcript_stable_ID=ht.Ensembl_transcriptid.strip().split(";"), ) ht = ht.explode("Transcript_stable_ID") ht = ht.key_by("Transcript_stable_ID") logger.info("Number of rows in REVEL table: %s", ht.count()) logger.info("Import and process the ensembl ID file...") ensembl_ids_ht = hl.import_table( ensembl_path, min_partitions=200, impute=True, key="Transcript_stable_ID" ) ensembl_ids_ht = ensembl_ids_ht.select("Ensembl_Canonical", "MANE_Select") logger.info("Annotating REVEL HT with canonical and MANE Select transcripts...") ht = ht.annotate(**ensembl_ids_ht[ht.key]) logger.info( "Annotating REVEL scores for MANE Select transcripts and canonical" " transcripts..." ) ht = ht.key_by("locus", "alleles") ht = ht.annotate( revel_mane=hl.or_missing(ht.MANE_Select != "", ht.REVEL), revel_canonical=hl.or_missing(ht.Ensembl_Canonical == "1", ht.REVEL), ) ht = ht.checkpoint( "gs://gnomad-tmp-4day/revel-v1.3_in_Ensembl105.ht", overwrite=True ) # Since the REVEL score for each variant is transcript-specific, we # prioritize the scores predicted on MANE Select and canonical transcripts, # and take the max if a variants falls on multiple MANE Select or canonical # transcripts. Normally, the score should be equal across MANE Select # and canonical. logger.info("Taking max REVEL scores for MANE Select transcripts...") mane_ht = ht.filter(hl.is_defined(ht.revel_mane), keep=True) max_revel_mane = mane_ht.group_by(*mane_ht.key).aggregate( revel_mane_max=hl.agg.max(mane_ht.revel_mane), ) logger.info("Taking max REVEL scores for canonical transcripts...") canonical_ht = ht.filter( (~hl.is_defined(ht.revel_mane)) & (hl.is_defined(ht.revel_canonical)), keep=True ) max_revel_canonical = canonical_ht.group_by(*canonical_ht.key).aggregate( revel_canonical_max=hl.agg.max(canonical_ht.revel_canonical), ) logger.info( "Merge max REVEL scores for MANE Select transcripts and canonical transcripts" " to one HT..." ) final_ht = max_revel_mane.join(max_revel_canonical, how="outer") final_ht = final_ht.select( revel_max=hl.or_else(final_ht.revel_mane_max, final_ht.revel_canonical_max) ) logger.info("Number of rows in final REVEL HT: %s", final_ht.count()) final_ht = final_ht.annotate_globals(revel_version="v1.3") return final_ht
[docs]def create_phylop_grch38_ht() -> hl.Table: """ Convert PhyloP scores to Hail Table. BigWig format of Phylop was download from here: https://cgl.gi.ucsc.edu/data/cactus/241-mammalian-2020v2-hub/Homo_sapiens/241-mammalian-2020v2.bigWig and converted it to bedGraph format with bigWigToBedGraph from the kent packages of UCSC (https://hgdownload.cse.ucsc.edu/admin/exe/) with the following command: `./bigWigToBedGraph ~/Downloads/241-mammalian-2020v2.bigWig ~/Downloads/241-mammalian-2020v2.bedGraph` The bedGraph file is bigzipped before importing to Hail. .. note:: Different to other in silico predictors, the Phylop HT is keyed by locus only. Since the PhyloP scores have one value per position, we exploded the interval to store the HT by locus. In result, we have Phylop scores for 2,852,623,265 locus from 2,648,607,958 intervals. :return: Hail Table with Phylop Scores for GRCh38 """ bg_path = "gs://gnomad-insilico/phylop/Human-GRCh38-Phylop-241-mammalian-2020v2.bedGraph.bgz" columns = ["chr", "start", "end", "phylop"] ht = hl.import_table( bg_path, min_partitions=1000, impute=True, no_header=True, ).rename({f"f{i}": c for i, c in enumerate(columns)}) # We add 1 to both start and end because input bedGraph is 0-indexed interval and # the interval is end exclusive ht = ht.annotate(pos=hl.range(ht.start + 1, ht.end + 1)) ht = ht.explode("pos") ht = ht.annotate(locus=hl.locus(ht.chr, ht.pos, reference_genome="GRCh38")) ht = ht.select("locus", "phylop") ht = ht.key_by("locus") ht = ht.annotate_globals(phylop_version="v2") return ht
[docs]def get_revel_for_unmatched_transcripts() -> None: """ Create Tables with alternative REVEL scores for variants in v4.1 release. ..note: REVEL was computed using transcripts from Ensembl v64. In the gnomAD v4.0 and v4.1 release Tables, transcript information from Ensembl v105 and variant information (locus and alleles combination) were used to ascertain variant REVEL scores for MANE select or canonical transcripts only. This means that variants within 2,414 MANE select transcripts in gnomAD v4.0 and v4.1 are missing REVEL scores because they are not present in Ensembl v64. To address this, we annotated the variants within the 2,414 genes with the maximum REVEL score found at the specific locus and allele, rather than the score for the MANE Select transcript. The exomes TSV adds REVEL scores to 1,936,321 out of 2,284,296 (87.77%) missense variants within the 2,414 genes. The genomes TSV adds REVEL scores to 528,204 out of 620,799 ( 85.08%) missense variants within the 2,414 genes. """ def _process_revel(): """Process REVEL scores.""" revel_csv = "gs://gnomad-insilico/revel/revel-v1.3_all_chromosomes_with_transcript_ids.csv.bgz" ht = hl.import_table( revel_csv, delimiter=",", min_partitions=1000, types={"grch38_pos": hl.tstr, "REVEL": hl.tfloat64}, ) # drop variants that have no position in GRCh38 when lifted over from GRCh37 ht = ht.filter(ht.grch38_pos.contains("."), keep=False) ht = ht.select( locus=hl.locus( "chr" + ht.chr, hl.int(ht.grch38_pos), reference_genome="GRCh38" ), alleles=hl.array([ht.ref, ht.alt]), REVEL=ht.REVEL, Transcript_stable_ID=ht.Ensembl_transcriptid.strip().split(";"), ) ht = ht.key_by("locus", "alleles") ht = ht.group_by("locus", "alleles").aggregate(REVEL_max=hl.agg.max(ht.REVEL)) return ht def _filter_release_ht(data_type, genes_list): """Filter release sites to only include genes with missing revel scores.""" ht = release_sites(data_type=data_type).ht() ht = process_consequences(ht, has_polyphen=False) ht = filter_vep_transcript_csqs( ht, synonymous=False, mane_select=True, genes=genes_list, csqs=["missense_variant"], ) ht = ht.select( gene_id=ht.vep.transcript_consequences.gene_id, most_severe_consequence=ht.vep.transcript_consequences.most_severe_consequence, ) return ht # Get the max REVEL score for each variant revel = _process_revel() revel = revel.checkpoint(hl.utils.new_temp_file("revel_tmp", "ht")) # Get genes missing revel scores genes = hl.import_table( "gs://gnomad-insilico/revel/Ensembl105MANE_without_REVEL.txt", impute=True, comment="#", ) genes_list = genes.Gene_stable_ID.collect() for data_type in ["exomes", "genomes"]: # Filter release sites to only include genes with missing revel ht = _filter_release_ht(data_type, genes_list) ht = ht.checkpoint(hl.utils.new_temp_file(f"{data_type}_tmp_filtered", "ht")) # Join REVEL scores with release sites ht = ht.annotate(REVEL_max=revel[ht.key].REVEL_max) # Filter out variants without a REVEL score ht = ht.filter(hl.is_defined(ht.REVEL_max)) ht.export( "gs://gnomad-insilico/revel/gnomad.v4.1." f"{data_type}.revel_unmatched_transcripts.tsv.bgz" )
[docs]def main(args): """Generate Hail Tables with in silico predictors.""" hl.init( default_reference="GRCh38", log="/insilico_predictors.log", tmp_dir="gs://gnomad-tmp-4day", ) if args.cadd: logger.info("Creating CADD Hail Table for GRCh38...") ht = create_cadd_grch38_ht() ht.write( get_insilico_predictors(predictor="cadd").path, overwrite=args.overwrite, ) logger.info("CADD Hail Table for GRCh38 created.") if args.spliceai: logger.info("Creating SpliceAI Hail Table for GRCh38...") ht = create_spliceai_grch38_ht() ht.write( get_insilico_predictors(predictor="spliceai").path, overwrite=args.overwrite, ) logger.info("SpliceAI Hail Table for GRCh38 created.") if args.pangolin: logger.info("Creating Pangolin Hail Table for GRCh38...") ht = create_pangolin_grch38_ht() ht.write( get_insilico_predictors(predictor="pangolin").path, overwrite=args.overwrite, ) logger.info("Pangolin Hail Table for GRCh38 created.") if args.revel: logger.info("Creating REVEL Hail Table for GRCh38...") ht = create_revel_grch38_ht() ht.write( get_insilico_predictors(predictor="revel").path, overwrite=args.overwrite, ) logger.info("REVEL Hail Table for GRCh38 created.") if args.revel_unmatched_transcripts: logger.info( "Get REVEL score for variants in missing MANE transcripts in v4.1" " release..." ) get_revel_for_unmatched_transcripts() if args.phylop: logger.info("Creating PhyloP Hail Table for GRCh38...") ht = create_phylop_grch38_ht() ht.write( get_insilico_predictors(predictor="phylop").path, overwrite=args.overwrite, )
def get_script_argument_parser() -> argparse.ArgumentParser: """Get script argument parser.""" parser = argparse.ArgumentParser() parser.add_argument( "--slack-channel", help="Slack channel to post results and notifications to." ) parser.add_argument("--overwrite", help="Overwrite data", action="store_true") parser.add_argument("--cadd", help="Create CADD HT", action="store_true") parser.add_argument("--spliceai", help="Create SpliceAI HT", action="store_true") parser.add_argument("--pangolin", help="Create Pangolin HT", action="store_true") parser.add_argument("--revel", help="Create REVEL HT.", action="store_true") parser.add_argument("--phylop", help="Create PhyloP HT.", action="store_true") parser.add_argument( "--revel-unmatched-transcripts", help=( "Get alternative REVEL score for variants " "in MANE transcripts in v4.1 release." ), action="store_true", ) return parser if __name__ == "__main__": parser = get_script_argument_parser() args = parser.parse_args() if args.slack_channel: with slack_notifications(slack_token, args.slack_channel): main(args) else: main(args)