Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
variant.h
Go to the documentation of this file.
1 #ifndef gamgee__variant__guard
2 #define gamgee__variant__guard
3 
4 #include "variant_header.h"
5 #include "individual_field.h"
7 #include "shared_field.h"
8 #include "variant_filters.h"
9 #include "genotype.h"
10 
11 #include "../utils/variant_utils.h"
12 
13 #include "htslib/sam.h"
14 #include "boost/dynamic_bitset.hpp"
15 
16 #include <string>
17 #include <memory>
18 #include <vector>
19 
20 namespace gamgee {
24 enum class DiploidPLGenotype { HOM_REF = 0, HET = 1, HOM_VAR = 2};
25 
29 class Variant {
30  public:
31  Variant() = default;
32  explicit Variant(const std::shared_ptr<bcf_hdr_t>& header, const std::shared_ptr<bcf1_t>& body) noexcept;
33  Variant(const Variant& other);
34  Variant& operator=(const Variant& other);
35  Variant(Variant&& other) = default;
36  Variant& operator=(Variant&& other) = default;
37 
44  // Avoid a deep copy here by constructing using VariantHeader's internal shared header pointer
45  return VariantHeader{m_header.m_header};
46  }
47 
48  bool missing() const { return m_body == nullptr; }
49 
50  uint32_t chromosome() const {return uint32_t(m_body->rid);}
51  std::string chromosome_name() const {return header().chromosomes()[chromosome()];}
52  uint32_t alignment_start() const {return uint32_t(m_body->pos+1);}
53  uint32_t alignment_stop() const {return uint32_t(m_body->pos + m_body->rlen);}
54  float qual() const {return m_body->qual;}
55  uint32_t n_samples() const {return uint32_t(m_body->n_sample);}
56  uint32_t n_alleles() const {return uint32_t(m_body->n_allele);}
57 
58  std::string id() const;
59  std::string ref() const;
60  std::vector<std::string> alt() const;
61 
62  // filter field getter
63  VariantFilters filters() const;
64  bool has_filter(const std::string& filter) const;
65 
66  // individual field getters (a.k.a "format fields")
80 
81  // shared field getters (a.k.a "info fields")
82  bool boolean_shared_field(const std::string& tag) const;
83  SharedField<int32_t> integer_shared_field(const std::string& tag) const;
84  SharedField<float> float_shared_field(const std::string& tag) const;
85  SharedField<std::string> string_shared_field(const std::string& tag) const;
86  SharedField<int32_t> shared_field_as_integer(const std::string& tag) const;
87  SharedField<float> shared_field_as_float(const std::string& tag) const;
88  SharedField<std::string> shared_field_as_string(const std::string& tag) const;
89  bool boolean_shared_field(const int32_t index) const;
90  SharedField<int32_t> integer_shared_field(const int32_t index) const;
91  SharedField<float> float_shared_field(const int32_t index) const;
92  SharedField<std::string> string_shared_field(const int32_t index) const;
93  SharedField<int32_t> shared_field_as_integer(const int32_t index) const;
94  SharedField<float> shared_field_as_float(const int32_t index) const;
95  SharedField<std::string> shared_field_as_string(const int32_t index) const;
96 
147  template <class VALUE, template<class> class ITER>
148  static boost::dynamic_bitset<> select_if(
149  const ITER<VALUE>& first,
150  const ITER<VALUE>& last,
151  const std::function<bool (const decltype(*first)& value)> pred) // decltype(*first) will always yield an l-value reference to VALUE (VALUE&). I just added the & here to be explicit about my intention of capturing the parameter by reference.
152  {
153  const auto n_samples = last - first;
154  auto selected_samples = boost::dynamic_bitset<>(n_samples);
155  auto it = first;
156  for (auto i = 0; i < n_samples; i++) {
157  selected_samples[i] = pred(*it++);
158  }
159  return selected_samples;
160  }
161 
172  AlleleMask allele_mask() const;
173 
174  private:
175  VariantHeader m_header;
176  std::shared_ptr<bcf1_t> m_body;
177 
178  bcf_fmt_t* find_individual_field(const std::string& tag) const { return bcf_get_fmt(m_header.m_header.get(), m_body.get(), tag.c_str()); }
179  bcf_info_t* find_shared_field(const std::string& tag) const { return bcf_get_info(m_header.m_header.get(), m_body.get(), tag.c_str()); }
180  bcf_fmt_t* find_individual_field(const uint32_t index) const { return bcf_get_fmt_id(m_body.get(), index); }
181  bcf_info_t* find_shared_field(const uint32_t index) const { return bcf_get_info_id(m_body.get(), index); }
182  bool check_field(const int32_t type_field, const int32_t type_value, const int32_t index) const;
183  inline AlleleType allele_type_from_difference(const int diff) const;
184 
185  template<class FIELD_TYPE, class INDEX_OR_TAG> SharedField<FIELD_TYPE> shared_field_as(const INDEX_OR_TAG& p) const;
186  template<class FIELD_TYPE, class INDEX_OR_TAG> IndividualField<IndividualFieldValue<FIELD_TYPE>> individual_field_as(const INDEX_OR_TAG& p) const;
187 
188  friend class VariantWriter;
189  friend class VariantBuilder;
190 
191  // TODO: remove this friendship and these mutators after Issue #320 is resolved
192 
194 
195  inline void set_alignment_start(const int32_t start) { m_body->pos = start - 1; }
196  inline void set_alignment_stop(const int32_t end) { m_body->rlen = end - m_body->pos; }
197 
198  inline void set_reference_allele(const char* ref, const int32_t ref_length)
199  {
200  bcf_unpack(m_body.get(), BCF_UN_STR);
201  //Try to avoid calling bcf_update_alleles as it causes significant overhead in
202  //terms of memory re-allocation etc.
203  //If enough space is available, over-write the current reference allele value
204  //Seems like an un-necessary hack, but trust me every single such overhead counts
205  if(m_body->rlen >= ref_length)
206  {
207  memcpy(m_body->d.allele[0], ref, ref_length); //memcpy is slightly faster than strcpy
208  m_body->d.allele[0][ref_length] = '\0'; //append null - why not include in memcpy? see set_reference_allele with char option
209  m_body->d.shared_dirty |= BCF1_DIRTY_ALS;
210  }
211  else
212  {
213  //expensive - causes significant reallocs within htslib
214  //hacks to interpret const ptrs to non-const. Again, saves some overhead
215  //No need to free d.allele[0] because it points to shared string within bcf1_1.shared
216  m_body->d.allele[0] = const_cast<char*>(ref);
217  //Re-use same d.allele char** as update_alleles writes to different region of memory anyway
218  bcf_update_alleles(const_cast<const bcf_hdr_t*>(m_header.m_header.get()), m_body.get(), const_cast<const char**>(m_body->d.allele), m_body->n_allele);
219  }
220  }
221  inline void set_reference_allele(const char* ref) { set_reference_allele(ref, static_cast<int32_t>(strlen(ref))); }
222  inline void set_reference_allele(const char ref_base) { set_reference_allele(&ref_base, 1); }
223 };
224 
225 } // end of namespace
226 
227 #endif /* defined(gamgee__variant__guard) */
bool boolean_shared_field(const std::string &tag) const
whether or not the tag is present
Definition: variant.cpp:190
Variant & operator=(const Variant &other)
deep copy assignment of a Variant and it's header. Shared pointers maintain state to all other associ...
Definition: variant.cpp:80
bcf_fmt_t * bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
Definition: vcf.c:2979
DiploidPLGenotype
simple enum to keep the indices of the genotypes in the PL field of diploid individuals ...
Definition: variant.h:24
AlleleMask allele_mask() const
computes the allele types for all allels (including the reference allele)
Definition: variant.cpp:117
VariantHeader header() const
returns the header for this variant
Definition: variant.h:43
IndividualField< IndividualFieldValue< std::string > > individual_field_as_string(const std::string &tag) const
same as string_individual_field but will attempt to convert underlying data to string if possible...
Definition: variant.cpp:152
std::string chromosome_name() const
returns the name of the chromosome by querying the header.
Definition: variant.h:51
std::vector< std::string > chromosomes() const
returns the number of samples in the header
Definition: variant_header.cpp:96
std::string id() const
returns the variant id field (typically dbsnp id)
Definition: variant.cpp:92
uint32_t n_samples() const
returns the number of samples in this Variant record
Definition: variant.h:55
VariantFilters filters() const
returns a vector-like object with all the filters for this record
Definition: variant.cpp:108
IndividualField< IndividualFieldValue< int32_t > > integer_individual_field(const std::string &tag) const
returns a random access object with all the values in a given individual field tag in integer format ...
Definition: variant.cpp:132
int bcf_unpack(bcf1_t *b, int which)
Definition: vcf.c:1945
SharedField< int32_t > shared_field_as_integer(const std::string &tag) const
same as integer_shared_field but will attempt to convert underlying data to integer if possible...
Definition: variant.cpp:210
uint32_t alignment_start() const
returns a 1-based alignment start position (as you would see in a VCF file).
Definition: variant.h:52
bool missing() const
returns true if this is a default-constructed Variant object with no data
Definition: variant.h:48
bool has_filter(const std::string &filter) const
checks for the existence of a filter in this record
Definition: variant.cpp:113
std::string ref() const
returns the ref allele in this Variant record
Definition: variant.cpp:97
static boost::dynamic_bitset select_if(const ITER< VALUE > &first, const ITER< VALUE > &last, const std::function< bool(const decltype(*first)&value)> pred)
functional-style set logic operations for variant field vectors
Definition: variant.h:148
utility class to write out a VCF/BCF file to any stream
Definition: variant_writer.h:21
SharedField< std::string > string_shared_field(const std::string &tag) const
returns a random access object with all the values in a given shared field tag in string format for a...
Definition: variant.cpp:206
#define BCF1_DIRTY_ALS
Definition: vcf.h:160
Definition: vcf.h:136
uint32_t alignment_stop() const
returns a 1-based alignment stop position, as you would see in a VCF INFO END tag, or the end position of the reference allele if there is no END tag.
Definition: variant.h:53
bcf_info_t * bcf_get_info_id(bcf1_t *line, const int id)
Definition: vcf.c:3004
IndividualField< IndividualFieldValue< std::string > > string_individual_field(const std::string &tag) const
returns a random access object with all the values in a given individual field tag in string format f...
Definition: variant.cpp:140
A class template to hold the values of a specific Variant's shared field.
Definition: shared_field.h:57
Definition: vcf.h:144
std::vector< std::string > alt() const
returns the vectors of alt alleles in this Variant record
Definition: variant.cpp:102
bcf_info_t * bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
Definition: vcf.c:2986
VariantBuilder: construct Variant records from scratch (and, coming soon, from existing Variant recor...
Definition: variant_builder.h:164
IndividualField< IndividualFieldValue< int32_t > > individual_field_as_integer(const std::string &tag) const
same as integer_individual_field but will attempt to convert underlying data to integer if possible...
Definition: variant.cpp:144
#define BCF_UN_STR
Definition: vcf.h:334
AlleleType
Definition: variant_utils.h:22
uint32_t n_alleles() const
returns the number of alleles in this Variant record including the reference allele ...
Definition: variant.h:56
IndividualField< Genotype > genotypes() const
special getter for the Genotype (GT) field. Returns a random access object with all the values in a g...
Definition: variant.cpp:252
A class template to hold the values of a specific Variant's format field for all samples.
Definition: individual_field.h:65
Definition: exceptions.h:9
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
Definition: vcf.c:2922
SharedField< std::string > shared_field_as_string(const std::string &tag) const
same as string_shared_field but will attempt to convert underlying data to string if possible...
Definition: variant.cpp:218
SharedField< float > float_shared_field(const std::string &tag) const
returns a random access object with all the values in a given shared field tag in float format for al...
Definition: variant.cpp:202
SharedField< int32_t > integer_shared_field(const std::string &tag) const
returns a random access object with all the values in a given shared field tag in integer format cont...
Definition: variant.cpp:198
class to manipulate filter field objects without making copies.
Definition: variant_filters.h:23
Utility class to manipulate a Variant record.
Definition: variant.h:29
SharedField< float > shared_field_as_float(const std::string &tag) const
same as float_shared_field but will attempt to convert underlying data to float if possible...
Definition: variant.cpp:214
IndividualField< IndividualFieldValue< float > > float_individual_field(const std::string &tag) const
returns a random access object with all the values in a given individual field tag in float format fo...
Definition: variant.cpp:136
Variant()=default
initializes a null Variant
Utility class to handle reference blocks while iterating over multiple variant files.
Definition: reference_block_splitting_variant_iterator.h:16
Utility class to hold a variant header.
Definition: variant_header.h:52
std::vector< AlleleType > AlleleMask
Definition: variant_utils.h:24
uint32_t chromosome() const
returns the integer representation of the chromosome. Notice that chromosomes are listed in index ord...
Definition: variant.h:50
IndividualField< IndividualFieldValue< float > > individual_field_as_float(const std::string &tag) const
same as float_individual_field but will attempt to convert underlying data to float if possible...
Definition: variant.cpp:148
bcf_fmt_t * bcf_get_fmt_id(bcf1_t *line, const int id)
Definition: vcf.c:2993
float qual() const
returns the Phred scaled site qual (probability that the site is not reference). See VCF spec...
Definition: variant.h:54