Source code for gnomad.sample_qc.pipeline

# noqa: D100

import functools
import logging
import operator
from typing import List, Optional, Union

import hail as hl

from gnomad.sample_qc.sex import (
    gaussian_mixture_model_karyotype_assignment,
    get_chr_x_hom_alt_cutoffs,
    get_ploidy_cutoffs,
    get_sex_expr,
)
from gnomad.utils.annotations import (
    bi_allelic_expr,
    bi_allelic_site_inbreeding_expr,
    get_adj_expr,
)
from gnomad.utils.filtering import filter_low_conf_regions, filter_to_adj
from gnomad.utils.reference_genome import get_reference_genome
from gnomad.utils.sparse_mt import impute_sex_ploidy

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


[docs]def filter_rows_for_qc( mt: hl.MatrixTable, min_af: Optional[float] = 0.001, min_callrate: Optional[float] = 0.99, min_inbreeding_coeff_threshold: Optional[float] = -0.8, min_hardy_weinberg_threshold: Optional[float] = 1e-8, apply_hard_filters: bool = True, bi_allelic_only: bool = True, snv_only: bool = True, ) -> hl.MatrixTable: """ Annotate rows with `sites_callrate`, `site_inbreeding_coeff` and `af`, then apply thresholds. AF and callrate thresholds are taken from gnomAD QC; inbreeding coeff, MQ, FS and QD filters are taken from GATK best practices. .. note:: This function expect the typical ``info`` annotation of type struct with fields ``MQ``, ``FS`` and ``QD`` if applying hard filters. :param mt: Input MT :param min_af: Minimum site AF to keep. Not applied if set to ``None``. :param min_callrate: Minimum site call rate to keep. Not applied if set to ``None``. :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``. :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``. :param apply_hard_filters: Whether to apply standard GAKT default site hard filters: QD >= 2, FS <= 60 and MQ >= 30. :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too. :param snv_only: Whether to only keep SNVs or include other variant types. :return: annotated and filtered table """ annotation_expr = {} if min_af is not None: annotation_expr["af"] = hl.agg.mean(mt.GT.n_alt_alleles()) / 2 if min_callrate is not None: annotation_expr["site_callrate"] = hl.agg.fraction(hl.is_defined(mt.GT)) if min_inbreeding_coeff_threshold is not None: annotation_expr["site_inbreeding_coeff"] = bi_allelic_site_inbreeding_expr( mt.GT ) if min_hardy_weinberg_threshold is not None: annotation_expr["hwe"] = hl.agg.hardy_weinberg_test(mt.GT) if annotation_expr: mt = mt.annotate_rows(**annotation_expr) filter_expr = [] if min_af is not None: filter_expr.append((mt.af > min_af)) if min_callrate is not None: filter_expr.append((mt.site_callrate > min_callrate)) if min_inbreeding_coeff_threshold is not None: filter_expr.append((mt.site_inbreeding_coeff > min_inbreeding_coeff_threshold)) if min_hardy_weinberg_threshold is not None: filter_expr.append((mt.hwe.p_value > min_hardy_weinberg_threshold)) if snv_only: filter_expr.append(hl.is_snp(mt.alleles[0], mt.alleles[1])) if bi_allelic_only: filter_expr.append(bi_allelic_expr(mt)) if apply_hard_filters: if "info" in mt.row_value: if "QD" in mt.info: filter_expr.append((mt.info.QD >= 2)) else: logger.warning( "Could not apply QD hard filter, as `info.QD` not found in schema." ) if "FS" in mt.info: filter_expr.append((mt.info.FS <= 60)) else: logger.warning( "Could not apply FS hard filter, as `info.FS` not found in schema." ) if "MQ" in mt.info: filter_expr.append((mt.info.MQ >= 30)) else: logger.warning( "Could not apply MQ hard filter, as `info.MQ` not found in schema." ) else: logger.warning( "Could not apply hard filters as `info` not found in schema." ) return mt.filter_rows(functools.reduce(operator.iand, filter_expr))
[docs]def get_qc_mt( mt: hl.MatrixTable, bi_allelic_only: bool = True, snv_only: bool = True, adj_only: bool = True, min_af: Optional[float] = 0.001, min_callrate: Optional[float] = 0.99, min_inbreeding_coeff_threshold: Optional[float] = -0.8, min_hardy_weinberg_threshold: Optional[float] = 1e-8, apply_hard_filters: bool = True, ld_r2: Optional[float] = 0.1, filter_lcr: bool = True, filter_decoy: bool = True, filter_segdup: bool = True, filter_exome_low_coverage_regions: bool = False, high_conf_regions: Optional[List[str]] = None, checkpoint_path: Optional[str] = None, n_partitions: Optional[int] = None, block_size: Optional[int] = None, ) -> hl.MatrixTable: """ Create a QC-ready MT. Has options to filter to the following: - Variants outside known problematic regions - Bi-allelic sites only - SNVs only - Variants passing hard thresholds - Variants passing the set call rate and MAF thresholds - Genotypes passing on gnomAD ADJ criteria (GQ>=20, DP>=10, AB>0.2 for hets) In addition, the MT will be LD-pruned if `ld_r2` is set. :param mt: Input MT. :param bi_allelic_only: Whether to only keep bi-allelic sites or include multi-allelic sites too. :param snv_only: Whether to only keep SNVs or include other variant types. :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation. :param min_af: Minimum allele frequency to keep. Not applied if set to ``None``. :param min_callrate: Minimum call rate to keep. Not applied if set to ``None``. :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``. :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``. :param apply_hard_filters: Whether to apply standard GAKT default site hard filters: QD >= 2, FS <= 60 and MQ >= 30. :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning). :param filter_lcr: Filter LCR regions. :param filter_decoy: Filter decoy regions. :param filter_segdup: Filter segmental duplication regions. :param filter_exome_low_coverage_regions: If set, only high coverage exome regions (computed from gnomAD are kept). :param high_conf_regions: If given, the data will be filtered to only include variants in those regions. :param checkpoint_path: If given, the QC MT will be checkpointed to the specified path before running LD pruning. If not specified, persist will be used instead. :param n_partitions: If given, the QC MT will be repartitioned to the specified number of partitions before running LD pruning. `checkpoint_path` must also be specified as the MT will first be written to the `checkpoint_path` before being reread with the new number of partitions. :param block_size: If given, set the block size to this value when LD pruning. :return: Filtered MT. """ logger.info("Creating QC MatrixTable") if ld_r2 is not None: logger.warning( "The LD-prune step of this function requires non-preemptible workers only!" ) if n_partitions and not checkpoint_path: raise ValueError("checkpoint_path must be supplied if repartitioning!") qc_mt = filter_low_conf_regions( mt, filter_lcr=filter_lcr, filter_decoy=filter_decoy, filter_segdup=filter_segdup, filter_exome_low_coverage_regions=filter_exome_low_coverage_regions, high_conf_regions=high_conf_regions, ) if adj_only: qc_mt = filter_to_adj( qc_mt ) # TODO: Make sure that this works fine before call rate filtering qc_mt = filter_rows_for_qc( qc_mt, min_af, min_callrate, min_inbreeding_coeff_threshold, min_hardy_weinberg_threshold, apply_hard_filters, bi_allelic_only, snv_only, ) if ld_r2 is not None: if checkpoint_path: if n_partitions: logger.info("Checkpointing and repartitioning the MT and LD pruning") qc_mt.write(checkpoint_path, overwrite=True) qc_mt = hl.read_matrix_table( checkpoint_path, _n_partitions=n_partitions ) else: logger.info("Checkpointing the MT and LD pruning") qc_mt = qc_mt.checkpoint(checkpoint_path, overwrite=True) else: logger.info("Persisting the MT and LD pruning") qc_mt = qc_mt.persist() unfiltered_qc_mt = qc_mt.unfilter_entries() pruned_ht = hl.ld_prune(unfiltered_qc_mt.GT, r2=ld_r2, block_size=block_size) qc_mt = qc_mt.filter_rows(hl.is_defined(pruned_ht[qc_mt.row_key])) qc_mt = qc_mt.annotate_globals( qc_mt_params=hl.struct( bi_allelic_only=bi_allelic_only, snv_only=snv_only, adj_only=adj_only, min_af=min_af if min_af is not None else hl.null(hl.tfloat32), min_callrate=( min_callrate if min_callrate is not None else hl.null(hl.tfloat32) ), inbreeding_coeff_threshold=( min_inbreeding_coeff_threshold if min_inbreeding_coeff_threshold is not None else hl.null(hl.tfloat32) ), min_hardy_weinberg_threshold=( min_hardy_weinberg_threshold if min_hardy_weinberg_threshold is not None else hl.null(hl.tfloat32) ), apply_hard_filters=apply_hard_filters, ld_r2=ld_r2 if ld_r2 is not None else hl.null(hl.tfloat32), filter_exome_low_coverage_regions=filter_exome_low_coverage_regions, high_conf_regions=( high_conf_regions if high_conf_regions is not None else hl.null(hl.tarray(hl.tstr)) ), ) ) return qc_mt.annotate_cols(sample_callrate=hl.agg.fraction(hl.is_defined(qc_mt.GT)))
[docs]def infer_sex_karyotype( ploidy_ht: hl.Table, f_stat_cutoff: float = 0.5, use_gaussian_mixture_model: bool = False, normal_ploidy_cutoff: int = 5, aneuploidy_cutoff: int = 6, chr_x_frac_hom_alt_expr: Optional[hl.expr.NumericExpression] = None, normal_chr_x_hom_alt_cutoff: int = 5, ) -> hl.Table: """ Create a Table with X_karyotype, Y_karyotype, and sex_karyotype. This function uses `get_ploidy_cutoffs` to determine X and Y ploidy cutoffs and then `get_sex_expr` to get karyotype annotations from those cutoffs. By default `f_stat_cutoff` will be used to roughly split samples into 'XX' and 'XY' for use in `get_ploidy_cutoffs`. If `use_gaussian_mixture_model` is True a gaussian mixture model will be used to split samples into 'XX' and 'XY' instead of f-stat. :param ploidy_ht: Input Table with chromosome X and chromosome Y ploidy values and optionally f-stat. :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY are above cutoff. Default is 0.5. :param use_gaussian_mixture_model: Use gaussian mixture model to split samples into 'XX' and 'XY' instead of f-stat. :param normal_ploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for XX, XY karyotypes. :param aneuploidy_cutoff: Number of standard deviations to use when determining sex chromosome ploidy cutoffs for aneuploidies. :param chr_x_frac_hom_alt_expr: Fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X. :param normal_chr_x_hom_alt_cutoff: Number of standard deviations to use when determining cutoffs for the fraction of homozygous alternate genotypes (hom-alt/(hom-alt + het)) on chromosome X for for XX and XY karyotypes. Only used if `chr_x_frac_hom_alt_expr` is supplied. :return: Table of samples imputed sex karyotype. """ logger.info("Inferring sex karyotype") if chr_x_frac_hom_alt_expr is not None: ploidy_ht = ploidy_ht.annotate(_chr_x_frac_hom_alt=chr_x_frac_hom_alt_expr) if use_gaussian_mixture_model: logger.info("Using Gaussian Mixture Model for karyotype assignment") gmm_sex_ht = gaussian_mixture_model_karyotype_assignment(ploidy_ht) x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs( gmm_sex_ht, group_by_expr=gmm_sex_ht.gmm_karyotype, normal_ploidy_cutoff=normal_ploidy_cutoff, aneuploidy_cutoff=aneuploidy_cutoff, ) ploidy_ht = ploidy_ht.annotate( gmm_karyotype=gmm_sex_ht[ploidy_ht.key].gmm_karyotype ) group_by_expr = ploidy_ht.gmm_karyotype f_stat_cutoff = None else: logger.info("Using f-stat for karyotype assignment") x_ploidy_cutoffs, y_ploidy_cutoffs = get_ploidy_cutoffs( ploidy_ht, f_stat_cutoff=f_stat_cutoff, normal_ploidy_cutoff=normal_ploidy_cutoff, aneuploidy_cutoff=aneuploidy_cutoff, ) group_by_expr = None if chr_x_frac_hom_alt_expr is not None: logger.info( "Including cutoffs for the fraction of homozygous alternate genotypes" " (hom-alt/(hom-alt + het)) on chromosome X. Using %d standard deviations" " to determine cutoffs.", normal_chr_x_hom_alt_cutoff, ) chr_x_frac_hom_alt_expr = ploidy_ht._chr_x_frac_hom_alt chr_x_frac_hom_alt_cutoffs = get_chr_x_hom_alt_cutoffs( ploidy_ht, chr_x_frac_hom_alt_expr, f_stat_cutoff=f_stat_cutoff, group_by_expr=group_by_expr, cutoff_stdev=normal_chr_x_hom_alt_cutoff, ) else: chr_x_frac_hom_alt_cutoffs = None karyotype_ht = ploidy_ht.select( **get_sex_expr( ploidy_ht.chrX_ploidy, ploidy_ht.chrY_ploidy, x_ploidy_cutoffs, y_ploidy_cutoffs, chr_x_frac_hom_alt_expr=chr_x_frac_hom_alt_expr, chr_x_frac_hom_alt_cutoffs=chr_x_frac_hom_alt_cutoffs, ) ) karyotype_ht = karyotype_ht.annotate_globals( use_gaussian_mixture_model=use_gaussian_mixture_model, normal_ploidy_cutoff=normal_ploidy_cutoff, aneuploidy_cutoff=aneuploidy_cutoff, x_ploidy_cutoffs=hl.struct( upper_cutoff_X=x_ploidy_cutoffs[0], lower_cutoff_XX=x_ploidy_cutoffs[1][0], upper_cutoff_XX=x_ploidy_cutoffs[1][1], lower_cutoff_XXX=x_ploidy_cutoffs[2], ), y_ploidy_cutoffs=hl.struct( lower_cutoff_Y=y_ploidy_cutoffs[0][0], upper_cutoff_Y=y_ploidy_cutoffs[0][1], lower_cutoff_YY=y_ploidy_cutoffs[1], ), ) if chr_x_frac_hom_alt_expr is not None: karyotype_ht = karyotype_ht.annotate_globals( x_frac_hom_alt_cutoffs=hl.struct( lower_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][0], upper_cutoff_more_than_one_X=chr_x_frac_hom_alt_cutoffs[0][1], lower_cutoff_single_X=chr_x_frac_hom_alt_cutoffs[1], ) ) if use_gaussian_mixture_model: karyotype_ht = karyotype_ht.annotate( gmm_sex_karyotype=ploidy_ht[karyotype_ht.key].gmm_karyotype ) else: karyotype_ht = karyotype_ht.annotate_globals(f_stat_cutoff=f_stat_cutoff) return karyotype_ht
[docs]def annotate_sex( mtds: Union[hl.MatrixTable, hl.vds.VariantDataset], is_sparse: bool = True, excluded_intervals: Optional[hl.Table] = None, included_intervals: Optional[hl.Table] = None, normalization_contig: str = "chr20", sites_ht: Optional[hl.Table] = None, aaf_expr: Optional[str] = None, gt_expr: str = "GT", f_stat_cutoff: float = 0.5, aaf_threshold: float = 0.001, variants_only_x_ploidy: bool = False, variants_only_y_ploidy: bool = False, variants_filter_lcr: bool = True, variants_filter_segdup: bool = True, variants_filter_decoy: bool = False, variants_snv_only: bool = False, coverage_mt: Optional[hl.MatrixTable] = None, compute_x_frac_variants_hom_alt: bool = False, compute_fstat: bool = True, infer_karyotype: bool = True, use_gaussian_mixture_model: bool = False, ) -> hl.Table: """ Impute sample sex based on X-chromosome heterozygosity and sex chromosome ploidy. Return Table with the following fields: - s (str): Sample - `normalization_contig`_mean_dp (float32): Sample's mean coverage over the specified `normalization_contig`. - chrX_mean_dp (float32): Sample's mean coverage over chromosome X. - chrY_mean_dp (float32): Sample's mean coverage over chromosome Y. - chrX_ploidy (float32): Sample's imputed ploidy over chromosome X. - chrY_ploidy (float32): Sample's imputed ploidy over chromosome Y. If `compute_fstat`: - f_stat (float64): Sample f-stat. Calculated using hl.impute_sex. - n_called (int64): Number of variants with a genotype call. Calculated using hl.impute_sex. - expected_homs (float64): Expected number of homozygotes. Calculated using hl.impute_sex. - observed_homs (int64): Observed number of homozygotes. Calculated using hl.impute_sex. If `infer_karyotype`: - X_karyotype (str): Sample's chromosome X karyotype. - Y_karyotype (str): Sample's chromosome Y karyotype. - sex_karyotype (str): Sample's sex karyotype. .. note:: In order to infer sex karyotype (`infer_karyotype`=True), one of `compute_fstat` or `use_gaussian_mixture_model` must be set to True. :param mtds: Input MatrixTable or VariantDataset. :param is_sparse: Whether input MatrixTable is in sparse data format. Default is True. :param excluded_intervals: Optional table of intervals to exclude from the computation. This option is currently not implemented for imputing sex chromosome ploidy on a VDS. :param included_intervals: Optional table of intervals to use in the computation. REQUIRED for exomes. :param normalization_contig: Which chromosome to use to normalize sex chromosome coverage. Used in determining sex chromosome ploidies. Default is "chr20". :param sites_ht: Optional Table of sites and alternate allele frequencies for filtering the input MatrixTable prior to imputing sex. :param aaf_expr: Optional. Name of field in input MatrixTable with alternate allele frequency. :param gt_expr: Name of entry field storing the genotype. Default is 'GT'. :param f_stat_cutoff: f-stat to roughly divide 'XX' from 'XY' samples. Assumes XX samples are below cutoff and XY samples are above cutoff. Default is 0.5. :param aaf_threshold: Minimum alternate allele frequency to be used in f-stat calculations. Default is 0.001. :param variants_only_x_ploidy: Whether to use depth of only variant data for the x ploidy estimation. :param variants_only_y_ploidy: Whether to use depth of only variant data for the y ploidy estimation. :param variants_filter_lcr: Whether to filter out variants in LCR regions for variants only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is True. :param variants_filter_segdup: Whether to filter out variants in segdup regions for variants only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is True. :param variants_filter_decoy: Whether to filter out variants in decoy regions for variants only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is False. Note: this option doesn't exist for GRCh38. :param variants_snv_only: Whether to filter to only single nucleotide variants for variants only ploidy estimation and fraction of homozygous alternate variants on chromosome X. Default is False. :param coverage_mt: Optional precomputed coverage MatrixTable to use in reference based VDS ploidy estimation. :param compute_x_frac_variants_hom_alt: Whether to return an annotation for the fraction of homozygous alternate variants on chromosome X. Default is False. :param compute_fstat: Whether to compute f-stat. Default is True. :param infer_karyotype: Whether to infer sex karyotypes. Default is True. :param use_gaussian_mixture_model: Whether to use gaussian mixture model to split samples into 'XX' and 'XY' instead of f-stat. Default is False. :return: Table of samples and their imputed sex karyotypes. """ logger.info("Imputing sex chromosome ploidies...") if infer_karyotype and not (compute_fstat or use_gaussian_mixture_model): raise ValueError( "In order to infer sex karyotype (infer_karyotype=True), one of" " 'compute_fstat' or 'use_gaussian_mixture_model' must be set to True!" ) is_vds = isinstance(mtds, hl.vds.VariantDataset) if is_vds: if excluded_intervals is not None: raise NotImplementedError( "The use of the parameter 'excluded_intervals' is currently not" " implemented for imputing sex chromosome ploidy on a VDS!" ) if included_intervals is None: raise NotImplementedError( "The current implementation for imputing sex chromosome ploidy on a VDS" " requires a list of 'included_intervals'!" ) mt = mtds.variant_data else: if not is_sparse: raise NotImplementedError( "Imputing sex ploidy does not exist yet for dense data." ) mt = mtds # Determine the contigs that are needed for variant only and reference # block only sex ploidy imputation rg = get_reference_genome(mt.locus) if normalization_contig not in rg.contigs: raise ValueError( f"Normalization contig {normalization_contig} is not found in reference" f" genome {rg.name}!" ) x_contigs = set(rg.x_contigs) y_contigs = set(rg.y_contigs) if variants_only_x_ploidy: var_keep_contigs = x_contigs | {normalization_contig} ref_keep_contigs = set() else: ref_keep_contigs = x_contigs | {normalization_contig} var_keep_contigs = set() if variants_only_y_ploidy: var_keep_contigs = {normalization_contig} | var_keep_contigs | y_contigs else: ref_keep_contigs = {normalization_contig} | ref_keep_contigs | y_contigs ref_keep_locus_intervals = [ hl.parse_locus_interval(contig, reference_genome=rg.name) for contig in ref_keep_contigs ] var_keep_locus_intervals = [ hl.parse_locus_interval(contig, reference_genome=rg.name) for contig in var_keep_contigs ] x_locus_intervals = [ hl.parse_locus_interval(contig, reference_genome=rg.name) for contig in x_contigs ] if ref_keep_contigs: logger.info( "Imputing sex chromosome ploidy using only reference block depth" " information on the following contigs: %s", ref_keep_contigs, ) if is_vds: if coverage_mt is not None: ploidy_ht = hl.vds.impute_sex_chr_ploidy_from_interval_coverage( coverage_mt.filter_rows( hl.is_defined(included_intervals[coverage_mt.row_key]) & hl.literal(ref_keep_contigs).contains( coverage_mt.interval.start.contig ) ), normalization_contig=normalization_contig, ) else: ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( hl.vds.filter_intervals(mtds, ref_keep_locus_intervals), calling_intervals=included_intervals, normalization_contig=normalization_contig, use_variant_dataset=False, ) ploidy_ht = ploidy_ht.rename( { "x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy", "x_mean_dp": "chrX_mean_dp", "y_mean_dp": "chrY_mean_dp", } ) else: ploidy_ht = impute_sex_ploidy( hl.filter_intervals(mt, ref_keep_locus_intervals), excluded_intervals, included_intervals, normalization_contig, use_only_variants=False, ) if variants_only_x_ploidy: ploidy_ht = ploidy_ht.drop("chrX_ploidy", "chrX_mean_dp") if variants_only_y_ploidy: ploidy_ht = ploidy_ht.drop("chrY_ploidy", "chrY_mean_dp") add_globals = hl.struct() if compute_x_frac_variants_hom_alt or var_keep_contigs: logger.info( "Filtering variants for variant only sex chromosome ploidy imputation" " and/or computation of the fraction of homozygous alternate variants on" " chromosome X", ) filtered_mt = hl.filter_intervals( mt, var_keep_locus_intervals + x_locus_intervals ) if variants_filter_lcr or variants_filter_segdup or variants_filter_decoy: logger.info( "Filtering out variants in: %s", ("segmental duplications, " if variants_filter_segdup else "") + ("low confidence regions, " if variants_filter_lcr else "") + (" decoy regions" if variants_filter_decoy else ""), ) filtered_mt = filter_low_conf_regions( filtered_mt, filter_lcr=variants_filter_lcr, filter_decoy=variants_filter_decoy, filter_segdup=variants_filter_segdup, ) if variants_snv_only: logger.info("Filtering to SNVs") filtered_mt = filtered_mt.filter_rows( hl.is_snp(filtered_mt.alleles[0], filtered_mt.alleles[1]) ) add_globals = add_globals.annotate( variants_filter_lcr=variants_filter_lcr, variants_segdup=variants_filter_segdup, variants_filter_decoy=variants_filter_decoy, variants_snv_only=variants_snv_only, ) if var_keep_contigs: logger.info( "Imputing sex chromosome ploidy using only variant depth information on the" " following contigs: %s", var_keep_contigs, ) var_filtered_mt = hl.filter_intervals(filtered_mt, var_keep_locus_intervals) if is_vds: var_ploidy_ht = hl.vds.impute_sex_chromosome_ploidy( hl.vds.VariantDataset(mtds.reference_data, var_filtered_mt), calling_intervals=included_intervals, normalization_contig=normalization_contig, use_variant_dataset=True, ) var_ploidy_ht = var_ploidy_ht.rename( { "autosomal_mean_dp": f"var_data_{normalization_contig}_mean_dp", "x_ploidy": "chrX_ploidy", "y_ploidy": "chrY_ploidy", "x_mean_dp": "chrX_mean_dp", "y_mean_dp": "chrY_mean_dp", } ) else: var_ploidy_ht = impute_sex_ploidy( var_filtered_mt, excluded_intervals, included_intervals, normalization_contig, use_only_variants=True, ) var_ploidy_ht = var_ploidy_ht.rename( { f"{normalization_contig}_mean_dp": ( f"var_data_{normalization_contig}_mean_dp" ) } ) if ref_keep_contigs: ploidy_ht = var_ploidy_ht.annotate(**ploidy_ht[var_ploidy_ht.key]) else: ploidy_ht = var_ploidy_ht ploidy_ht = ploidy_ht.annotate_globals( normalization_contig=normalization_contig, variants_only_x_ploidy=variants_only_x_ploidy, variants_only_y_ploidy=variants_only_y_ploidy, **add_globals, ) if compute_x_frac_variants_hom_alt: logger.info( "Computing fraction of variants that are homozygous alternate on" " chromosome X" ) filtered_mt = hl.filter_intervals(filtered_mt, x_locus_intervals) filtered_mt = filtered_mt.filter_rows( hl.is_defined(included_intervals[filtered_mt.locus]) ) filtered_mt = filtered_mt.annotate_entries( adj=get_adj_expr( filtered_mt.LGT, filtered_mt.GQ, filtered_mt.DP, filtered_mt.LAD ) ) frac_hom_alt_ht = filtered_mt.select_cols( chrx_frac_hom_alt=hl.agg.count_where(filtered_mt.LGT.is_hom_var()) / hl.agg.count_where(hl.is_defined(filtered_mt.LGT)), chrx_frac_hom_alt_adj=hl.agg.filter( filtered_mt.adj, hl.agg.count_where(filtered_mt.LGT.is_hom_var()) / hl.agg.count_where(hl.is_defined(filtered_mt.LGT)), ), ).cols() ploidy_ht = ploidy_ht.annotate(**frac_hom_alt_ht[ploidy_ht.key]) if compute_fstat: logger.info("Filtering mt to biallelic SNPs in X contigs: %s", x_contigs) if "was_split" in list(mt.row): mt = mt.filter_rows( (~mt.was_split) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) else: mt = mt.filter_rows( (hl.len(mt.alleles) == 2) & hl.is_snp(mt.alleles[0], mt.alleles[1]) ) mt = hl.filter_intervals(mt, x_locus_intervals) if sites_ht is not None: if aaf_expr is None: logger.warning( "sites_ht was provided, but aaf_expr is missing. Assuming name of" " field with alternate allele frequency is 'AF'." ) aaf_expr = "AF" logger.info("Filtering to provided sites") mt = mt.annotate_rows(**sites_ht[mt.row_key]) mt = mt.filter_rows(hl.is_defined(mt[aaf_expr])) logger.info("Calculating inbreeding coefficient on chrX") sex_ht = hl.impute_sex( mt[gt_expr], aaf_threshold=aaf_threshold, male_threshold=f_stat_cutoff, female_threshold=f_stat_cutoff, aaf=aaf_expr, ) logger.info("Annotating sex chromosome ploidy HT with impute_sex HT") ploidy_ht = ploidy_ht.annotate(**sex_ht[ploidy_ht.key]) ploidy_ht = ploidy_ht.annotate_globals(f_stat_cutoff=f_stat_cutoff) if infer_karyotype: karyotype_ht = infer_sex_karyotype( ploidy_ht, f_stat_cutoff, use_gaussian_mixture_model ) ploidy_ht = ploidy_ht.annotate(**karyotype_ht[ploidy_ht.key]) ploidy_ht = ploidy_ht.annotate_globals(**karyotype_ht.index_globals()) return ploidy_ht