Source code for gnomad.utils.reference_genome

# noqa: D100

import logging
from typing import List, Optional, Union

import hail as hl

logging.basicConfig(
    format="%(asctime)s (%(name)s %(lineno)s): %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


[docs]def get_reference_ht( ref: hl.ReferenceGenome, contigs: Optional[List[str]] = None, excluded_intervals: Optional[List[hl.Interval]] = None, add_all_substitutions: bool = False, filter_n: bool = True, ) -> hl.Table: """ Create a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome. .. note:: If the `contigs` argument is not provided, all contigs (including obscure ones) will be added to the table. This can be slow as contigs are added one by one. :param ref: Input reference genome :param contigs: An optional list of contigs that the Table should include :param excluded_intervals: An optional list of intervals to exclude :param add_all_substitutions: If set, then all possible substitutions are added in the alleles array :param filter_n: If set, bases where the reference is unknown (n) are filtered. :return: """ if not ref.has_sequence(): add_reference_sequence(ref) if not contigs: contigs = ref.contigs if add_all_substitutions: SUBSTITUTIONS_TABLE = hl.literal( { "a": ["c", "g", "t"], "c": ["a", "g", "t"], "g": ["a", "c", "t"], "t": ["a", "c", "g"], } ) context = [] for contig in contigs: n_partitions = max(1, int(ref.contig_length(contig) / 5000000)) logger.info( "Creating reference contig %s with %d partitions.", contig, n_partitions ) _context = hl.utils.range_table( ref.contig_length(contig), n_partitions=n_partitions ) locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref) ref_allele_expr = locus_expr.sequence_context().lower() if add_all_substitutions: alleles_expr = hl.array([ref_allele_expr]).extend( SUBSTITUTIONS_TABLE.get(ref_allele_expr, hl.empty_array(hl.tstr)) ) else: alleles_expr = [ref_allele_expr] _context = ( _context.select(locus=locus_expr, alleles=alleles_expr) .key_by("locus", "alleles") .drop("idx") ) if excluded_intervals is not None: _context = hl.filter_intervals(_context, excluded_intervals, keep=False) if filter_n: _context = _context.filter(_context.alleles[0] == "n", keep=False) context.append(_context) return context.pop().union(*context)
[docs]def add_reference_sequence(ref: hl.ReferenceGenome) -> hl.ReferenceGenome: """ Add the fasta sequence to a Hail reference genome. Only GRCh37 and GRCh38 references are supported. :param ref: Input reference genome. :return: """ if not ref.has_sequence(): if ref.name == "GRCh38": ref.add_sequence( "gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz", "gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai", ) elif ref.name == "GRCh37": ref.add_sequence( "gs://hail-common/references/human_g1k_v37.fasta.gz", "gs://hail-common/references/human_g1k_v37.fasta.fai", ) else: raise NotImplementedError( f"No known location for the fasta/fai files for genome {ref.name}. Only" " GRCh37 and GRCh38 are supported at this time." ) else: logger.info( "Reference genome sequence already present. Ignoring" " add_reference_sequence." ) return ref
[docs]def get_reference_genome( locus: Union[hl.expr.LocusExpression, hl.expr.IntervalExpression], add_sequence: bool = False, ) -> hl.ReferenceGenome: """ Return the reference genome associated with the input Locus expression. :param locus: Input locus :param add_sequence: If set, the fasta sequence is added to the reference genome :return: Reference genome """ if isinstance(locus, hl.expr.LocusExpression): ref = locus.dtype.reference_genome else: assert isinstance(locus, hl.expr.IntervalExpression) ref = locus.dtype.point_type.reference_genome if add_sequence: ref = add_reference_sequence(ref) return ref