# noqa: D100
import hail as hl
from hail.linalg import BlockMatrix
from gnomad.resources.grch37.gnomad import public_release
from gnomad.resources.grch37.gnomad_ld import ld_index, ld_matrix
[docs]def get_r_human_readable(
pop: str, var1: str, var2: str, ref_genome: str = "GRCh37"
): # noqa: D103
bm = ld_matrix(pop).bm()
ht = ld_index(pop).ht()
chrom, pos, ref, alt = var1.split("-")
var1 = (hl.parse_locus(f"{chrom}:{pos}", ref_genome), [ref, alt])
chrom, pos, ref, alt = var2.split("-")
var2 = (hl.parse_locus(f"{chrom}:{pos}", ref_genome), [ref, alt])
return get_r_for_pair_of_variants(bm, ht, var1, var2)
# TODO: find LD proxies
[docs]def get_r_for_pair_of_variants(
bm: BlockMatrix,
ld_index: hl.Table,
var1: (hl.tlocus, hl.tarray(hl.tstr)),
var2: (hl.tlocus, hl.tarray(hl.tstr)),
):
"""
Get `r` value (LD) for pair of variants `var1` and `var2`.
.. code-block:: python
bm = get_ld_matrix('nfe')
ld_index = get_ld_index('nfe')
var1 = (hl.parse_locus('1:10146', 'GRCh37'), ['AC', 'A'])
var2 = (hl.parse_locus('1:10151', 'GRCh37'), ['TA', 'T'])
get_r_for_pair_of_variants(bm, ld_index, var1, var2)
# 0.01789767935482124
:param bm: Input BlockMatrix
:param ld_index: Corresponding index table
:param var1: Tuple of locus and alleles
:param var2: Tuple of locus and alleles
:return: Correlation (r) between two variants
"""
idx1 = ld_index.filter(
(ld_index.locus == var1[0]) & (ld_index.alleles == var1[1])
).idx.collect()[0]
idx2 = ld_index.filter(
(ld_index.locus == var2[0]) & (ld_index.alleles == var2[1])
).idx.collect()[0]
if idx1 > idx2:
temp = idx1
idx1 = idx2
idx2 = temp
return bm[idx1, idx2]
[docs]def get_r_within_gene_in_pop(pop: str, gene: str):
"""
Get LD information (`r`) for all pairs of variants within `gene` for a given `pop`.
Warning: this returns a table quadratic in number of variants. Exercise caution with large genes.
:param pop: Population for which to get LD information
:param gene: Gene symbol as string
:return: Table with pairs of variants
"""
return get_r_within_gene(
ld_matrix(pop).bm(), ld_index(pop).ht(), gene, None, "GRCh37"
)
[docs]def get_r_within_gene(
bm: BlockMatrix,
ld_index: hl.Table,
gene: str,
vep_ht: hl.Table = None,
reference_genome: str = None,
):
"""
Get LD information (`r`) for all pairs of variants within `gene`.
Warning: this returns a table quadratic in number of variants. Exercise caution with large genes.
:param bm: Input Block Matrix
:param ld_index: Corresponding index table
:param gene: Gene symbol as string
:param vep_ht: Table with VEP annotations (if None, gets from get_gnomad_public_data())
:param reference_genome: Reference genome to pass to get_gene_intervals for fast filtering to gene
:return: Table with pairs of variants
"""
if vep_ht is None:
vep_ht = public_release("exomes").ht()
if reference_genome is None:
reference_genome = hl.default_reference().name
intervals = hl.experimental.get_gene_intervals(
gene_symbols=[gene], reference_genome=reference_genome
)
ld_index = hl.filter_intervals(ld_index, intervals)
ld_index = ld_index.annotate(vep=vep_ht[ld_index.key].vep)
ld_index = ld_index.filter(
hl.any(lambda tc: tc.gene_symbol == gene, ld_index.vep.transcript_consequences)
)
indices_to_keep = ld_index.idx.collect()
filt_bm = bm.filter(indices_to_keep, indices_to_keep)
ht = filt_bm.entries()
ld_index = ld_index.add_index("new_idx").key_by("new_idx")
return ht.transmute(r=ht.entry, i_variant=ld_index[ht.i], j_variant=ld_index[ht.j])