Source code for gnomad_toolbox.analysis.general

"""Set of general functions for gnomAD analysis."""

from typing import Dict, List, Optional

import hail as hl
from gnomad.assessment.summary_stats import freq_bin_expr

from gnomad_toolbox.load_data import _get_dataset


[docs]def get_variant_count_by_freq_bin( af_cutoffs: Optional[List[float]] = None, singletons: bool = False, doubletons: bool = False, pass_only: bool = True, **kwargs, ) -> Dict[str, int]: """ Count variants by frequency bin. By default, this function counts PASS variants that are AC0, AF < 0.01%, and AF 0.01% - 0.1%. The function can also include counts of singletons and doubletons, with or without passing filters. .. note:: This function works for gnomAD exomes and genomes data types, not yet for gnomAD joint data type, since the HT schema is slightly different. :param af_cutoffs: List of allele frequencies cutoffs. :param singletons: Include singletons. :param doubletons: Include doubletons. :param pass_only: Include only PASS variants. :param kwargs: Keyword arguments to pass to `_get_dataset`. Includes 'ht', 'data_type', and 'version'. :return: Dictionary with counts. """ # Initialize af_cutoffs if not provided if af_cutoffs is None: af_cutoffs = [0.001, 0.01] # Load the Hail Table if not provided ht = _get_dataset(dataset="variant", **kwargs) # Filter to PASS variants. if pass_only: ht = ht.filter(hl.len(ht.filters) == 0) # Initialize allele count cutoffs with AC0. ac_cutoffs = [(0, "AC0")] if singletons: ac_cutoffs.append((1, "singletons")) if doubletons: ac_cutoffs.append((2, "doubletons")) freq_expr = freq_bin_expr( ht.freq, ac_cutoffs=ac_cutoffs, af_cutoffs=af_cutoffs, upper_af=None ) return ht.aggregate(hl.agg.counter(freq_expr))