"""Functions to filter gnomAD sites HT by VEP annotations."""
from typing import List, Optional
import hail as hl
from gnomad.utils.filtering import filter_gencode_ht
from gnomad.utils.vep import (
LOF_CSQ_SET,
filter_vep_transcript_csqs,
filter_vep_transcript_csqs_expr,
)
from gnomad_toolbox.load_data import _get_dataset, get_compatible_dataset_versions
[docs]def filter_by_consequence_category(
plof: bool = False,
missense: bool = False,
synonymous: bool = False,
other: bool = False,
pass_filters: bool = True,
**kwargs,
) -> hl.Table:
"""
Filter gnomAD variants based on VEP consequence.
https://gnomad.broadinstitute.org/help/consequence-category-filter
The [VEP](https://useast.ensembl.org/info/docs/tools/vep/index.html) consequences included in each category are:
pLoF:
- transcript_ablation
- splice_acceptor_variant
- splice_donor_variant
- stop_gained
- frameshift_variant
Missense / Inframe indel:
- stop_lost
- start_lost
- inframe_insertion
- inframe_deletion
- missense_variant
Synonymous:
- synonymous_variant
Other:
- All other consequences not included in the above categories.
:param plof: Whether to include pLoF variants.
:param missense: Whether to include missense variants.
:param synonymous: Whether to include synonymous variants.
:param other: Whether to include other variants.
:param pass_filters: Boolean if the variants pass the filters.
:param kwargs: Arguments to pass to `_get_dataset`.
:return: Table with variants with the specified consequences.
"""
if not any([plof, missense, synonymous, other]):
raise ValueError(
"At least one of plof, missense, synonymous, or other must be True."
)
# Load the Hail Table if not provided
ht = _get_dataset(dataset="variant", **kwargs)
lof_csqs = list(LOF_CSQ_SET) + ["transcript_ablation"]
missense_csqs = [
"missense_variant",
"inframe_insertion",
"inframe_deletion",
"stop_lost",
"start_lost",
]
synonymous_csqs = ["synonymous_variant"]
other_csqs = lof_csqs + missense_csqs + synonymous_csqs
csqs = (
(lof_csqs if plof else [])
+ (missense_csqs if missense else [])
+ (synonymous_csqs if synonymous else [])
)
filter_expr = None
if csqs:
filter_expr = filter_vep_transcript_csqs_expr(ht.vep, csqs=csqs)
if other:
other_expr = filter_vep_transcript_csqs_expr(
ht.vep, csqs=other_csqs, keep_csqs=False
)
filter_expr = other_expr if filter_expr is None else (filter_expr | other_expr)
if pass_filters:
pass_expr = hl.len(ht.filters) == 0
filter_expr = pass_expr if filter_expr is None else (filter_expr & pass_expr)
return ht.filter(filter_expr)
[docs]def get_gene_intervals(
gene_symbol: str, gencode_version: Optional[str] = None
) -> List[hl.utils.Interval]:
"""
Get the GENCODE genomic intervals for a given gene symbol.
:param gene_symbol: Gene symbol.
:param gencode_version: Optional GENCODE version. If not provided, uses the gencode
version associated with the gnomAD session.
:return: List of GENCODE intervals for the specified gene.
"""
# Load the Hail Table if not provided.
ht = _get_dataset(dataset="gencode", version=gencode_version)
gene_symbol = gene_symbol.upper()
intervals = filter_gencode_ht(gencode_ht=ht, feature="gene", genes=gene_symbol)
intervals = intervals.interval.collect()
if not intervals:
raise ValueError(f"No interval found for gene: {gene_symbol}")
return intervals
[docs]def filter_to_high_confidence_loftee(
gene_symbol: Optional[str] = None,
no_lof_flags: bool = False,
mane_select_only: bool = False,
canonical_only: bool = False,
version: Optional[str] = None,
**kwargs,
) -> hl.Table:
"""
Filter gnomAD variants to high-confidence LOFTEE variants for a gene.
:param gene_symbol: Optional gene symbol to filter by.
:param no_lof_flags: Whether to exclude variants with LOFTEE flags. Default is
False.
:param mane_select_only: Whether to include only MANE Select transcripts. Default
is False.
:param canonical_only: Whether to include only canonical transcripts. Default is
False.
:param version: Optional version of the dataset to use.
:param kwargs: Additional arguments to pass to `_get_dataset`.
:return: Table with high-confidence LOFTEE variants.
"""
# Load the Hail Table if not provided.
ht = _get_dataset(dataset="variant", version=version, **kwargs)
gene_symbol = gene_symbol.upper() if gene_symbol else None
if gene_symbol:
gencode_version = get_compatible_dataset_versions("gencode", version)
ht = hl.filter_intervals(
ht, get_gene_intervals(gene_symbol, gencode_version=gencode_version)
)
return filter_vep_transcript_csqs(
ht,
synonymous=False,
canonical=canonical_only,
mane_select=mane_select_only,
genes=[gene_symbol],
match_by_gene_symbol=True,
loftee_labels=["HC"],
no_lof_flags=no_lof_flags,
)