Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
variant_builder_individual_region.h
Go to the documentation of this file.
1 #ifndef gamgee__variant_builder_individual_region__guard
2 #define gamgee__variant_builder_individual_region__guard
3 
4 #include "variant.h"
6 
7 #include "htslib/kstring.h"
8 #include "htslib/vcf.h"
9 
10 #include <vector>
11 #include <string>
12 #include <stdexcept>
13 
14 namespace gamgee {
15 
24  public:
// Builds an individual-region helper from a variant header and an initial
// validation setting. The definition lives in the .cpp file; presumably `header`
// initializes m_header and `enable_validation` initializes m_enable_validation
// -- TODO confirm against the definition.
25  explicit VariantBuilderIndividualRegion(const VariantHeader& header, const bool enable_validation);
26 
32 
33  void set_enable_validation(const bool enable_validation) { m_enable_validation = enable_validation; }
34 
35  int32_t gt_index() const { return m_gt_field_index; }
36  uint32_t num_present_fields() const { return m_num_present_fields; }
37  bool modified() const { return m_num_present_fields > 0; }
38 
39  template<class FIELD_ID_TYPE, class BULK_FIELD_VALUES_TYPE>
40  void bulk_set_genotype_field(const FIELD_ID_TYPE& field_id, BULK_FIELD_VALUES_TYPE&& field_values) {
41  // Set boolean parameter in bulk_set_field() to true to indicate that we should allow GT to be set
42  bulk_set_field(field_id, std::forward<BULK_FIELD_VALUES_TYPE>(field_values), BCF_HT_INT, m_int_fields, true);
43  }
44 
45  template<class FIELD_ID_TYPE, class BULK_FIELD_VALUES_TYPE>
46  void bulk_set_integer_field(const FIELD_ID_TYPE& field_id, BULK_FIELD_VALUES_TYPE&& field_values) {
47  // Final boolean parameter to bulk_set_field() is kept as its default false here to disallow GT to be set as a regular int field
48  bulk_set_field(field_id, std::forward<BULK_FIELD_VALUES_TYPE>(field_values), BCF_HT_INT, m_int_fields);
49  }
50 
51  template<class FIELD_ID_TYPE, class BULK_FIELD_VALUES_TYPE>
52  void bulk_set_float_field(const FIELD_ID_TYPE& field_id, BULK_FIELD_VALUES_TYPE&& field_values) {
53  bulk_set_field(field_id, std::forward<BULK_FIELD_VALUES_TYPE>(field_values), BCF_HT_REAL, m_float_fields);
54  }
55 
56  template<class FIELD_ID_TYPE, class BULK_FIELD_VALUES_TYPE>
57  void bulk_set_string_field(const FIELD_ID_TYPE& field_id, BULK_FIELD_VALUES_TYPE&& field_values) {
58  bulk_set_field(field_id, std::forward<BULK_FIELD_VALUES_TYPE>(field_values), BCF_HT_STR, m_string_fields);
59  }
60 
61  template<class FIELD_ID_TYPE, class SAMPLE_ID_TYPE, class FIELD_VALUE_TYPE>
62  void set_genotype_field_by_sample(const FIELD_ID_TYPE& field_id, const SAMPLE_ID_TYPE& sample_id, const FIELD_VALUE_TYPE* field_values, const uint32_t num_field_values) {
63  // Set boolean parameter in set_field_by_sample() to true to indicate that we should allow GT to be set
64  set_field_by_sample(field_id, sample_id, field_values, num_field_values, BCF_HT_INT, m_int_fields, true);
65  }
66 
67  template<class FIELD_ID_TYPE, class SAMPLE_ID_TYPE, class FIELD_VALUE_TYPE>
68  void set_integer_field_by_sample(const FIELD_ID_TYPE& field_id, const SAMPLE_ID_TYPE& sample_id, const FIELD_VALUE_TYPE* field_values, const uint32_t num_field_values) {
69  // Final boolean parameter to set_field_by_sample() is kept as its default false here to disallow GT to be set as a regular int field
70  set_field_by_sample(field_id, sample_id, field_values, num_field_values, BCF_HT_INT, m_int_fields);
71  }
72 
73  template<class FIELD_ID_TYPE, class SAMPLE_ID_TYPE, class FIELD_VALUE_TYPE>
74  void set_float_field_by_sample(const FIELD_ID_TYPE& field_id, const SAMPLE_ID_TYPE& sample_id, const FIELD_VALUE_TYPE* field_values, const uint32_t num_field_values) {
75  set_field_by_sample(field_id, sample_id, field_values, num_field_values, BCF_HT_REAL, m_float_fields);
76  }
77 
78  template<class FIELD_ID_TYPE, class SAMPLE_ID_TYPE, class FIELD_VALUE_TYPE>
79  void set_string_field_by_sample(const FIELD_ID_TYPE& field_id, const SAMPLE_ID_TYPE& sample_id, const FIELD_VALUE_TYPE* field_values, const uint32_t num_field_values) {
80  set_field_by_sample(field_id, sample_id, field_values, num_field_values, BCF_HT_STR, m_string_fields);
81  }
82 
83  template<class FIELD_ID_TYPE>
84  void remove_individual_field(const FIELD_ID_TYPE& field_id) {
85  const auto field_idx = field_index(field_id);
86  if ( m_enable_validation ) {
87  validate_individual_field_existence(field_idx);
88  }
89  remove_field(field_idx);
90  }
91 
// Produces a slight overestimate of the total size of the encoded data for this
// individual region (defined in the .cpp file).
92  uint32_t estimate_total_size() const;
// Encodes all individual fields into the provided byte buffer (defined in the .cpp file).
93  void encode_into(kstring_t* buffer) const;
// Resets the individual region to a pristine state with no field data
// (defined in the .cpp file).
94  void clear();
95 
96  private:
// Header used to resolve string field / sample ids and to obtain the sample count
97  VariantHeader m_header;
// Maps a field index to a position within the per-type field vectors below
98  std::vector<int32_t> m_field_lookup_table;
// Value reported by gt_index(); presumably the index of the GT field -- TODO confirm
99  int32_t m_gt_field_index;
// Count of fields currently present; maintained by update_present_field_count()
100  uint32_t m_num_present_fields;
// Storage for fields set with type BCF_HT_INT (including genotype values)
101  std::vector<VariantBuilderIndividualField<int32_t, int32_t>> m_int_fields;
// Storage for fields set with type BCF_HT_REAL
102  std::vector<VariantBuilderIndividualField<float, float>> m_float_fields;
// Storage for fields set with type BCF_HT_STR
103  std::vector<VariantBuilderIndividualField<char, std::string>> m_string_fields;
// When true, setters and removers validate their arguments before acting
104  bool m_enable_validation;
105 
// Per-type "short value" thresholds. Their values and use are defined outside this
// header; presumably handed to the per-type field objects -- TODO confirm in the .cpp.
112  static const uint32_t int_field_short_value_threshold;
113  static const uint32_t float_field_short_value_threshold;
114  static const uint32_t string_field_short_value_threshold;
115 
// Defined in the .cpp file; presumably populates m_field_lookup_table (and the
// per-type field vectors) from m_header -- TODO confirm.
116  void build_lookup_tables();
117 
118  template<class FIELD_ID_TYPE, class BULK_FIELD_VALUES_TYPE, class FIELD_TYPE>
119  void bulk_set_field(const FIELD_ID_TYPE& field_id, BULK_FIELD_VALUES_TYPE&& field_values, const int32_t provided_type, std::vector<FIELD_TYPE>& fields_of_type, const bool allow_gt = false) {
120  const auto field_idx = field_index(field_id);
121  if ( m_enable_validation ) {
122  validate_individual_field(field_idx, provided_type, allow_gt);
123  validate_multi_sample_vector_length(field_values);
124  }
125 
126  auto& field = fields_of_type[m_field_lookup_table[field_idx]];
127  const auto field_was_already_present = field.present();
128 
129  // Need to use std::forward() here so that we can handle both the move and the copy use cases
130  field.set_entire_field(std::forward<BULK_FIELD_VALUES_TYPE>(field_values));
131 
132  // Field will not necessarily be present after setting (eg., field_values might have been something like { {}, {} }).
133  // If the field is missing after giving it the user's value then treat it as an explicit user request to
134  // remove the entire field.
135  // Note that it's too expensive to check up front whether the value is something like { {}. {} }, which is why
136  // we check afterwards instead.
137  if ( ! field.present() ) {
138  field.remove();
139  }
140  update_present_field_count(field_was_already_present, field.present());
141  }
142 
143  template<class FIELD_ID_TYPE, class SAMPLE_ID_TYPE, class FIELD_VALUE_TYPE, class FIELD_TYPE>
144  void set_field_by_sample(const FIELD_ID_TYPE& field_id, const SAMPLE_ID_TYPE& sample_id, const FIELD_VALUE_TYPE* field_values, const uint32_t num_field_values, const int32_t provided_type, std::vector<FIELD_TYPE>& fields_of_type, const bool allow_gt = false) {
145  const auto field_idx = field_index(field_id);
146  const auto sample_idx = sample_index(sample_id);
147  if ( m_enable_validation ) {
148  validate_individual_field(field_idx, sample_idx, provided_type, allow_gt);
149  }
150 
151  auto& field = fields_of_type[m_field_lookup_table[field_idx]];
152  const auto field_was_already_present = field.present();
153 
154  field.set_sample_field_value(sample_idx, field_values, num_field_values);
155 
156  // Field will not necessarily be present after setting (eg., field_values might have been empty).
157  // Note that unlike in the bulk setting case, we don't treat empty values here as a user request to
158  // remove the field, since we're only dealing with a single sample's data.
159  update_present_field_count(field_was_already_present, field.present());
160  }
161 
// Removes the field with the given index; called by remove_individual_field() after
// optional validation. Defined in the .cpp file.
162  void remove_field(const int32_t field_index);
163 
164  // These overloads only exist to allow us to unify the string id / integer id cases in
165  // the templated functions above
166  int32_t field_index(const std::string& field_id) const { return m_header.field_index(field_id); }
167  int32_t field_index(const uint32_t field_id) const { return int32_t(field_id); }
168  int32_t sample_index(const std::string& sample_id) const { return m_header.sample_index(sample_id); }
169  int32_t sample_index(const uint32_t sample_id) const { return int32_t(sample_id); }
170 
// Validation helpers, defined in the .cpp file. They are only invoked when
// m_enable_validation is true.
//
// NOTE(review): provided_type is declared uint32_t here, while bulk_set_field() and
// set_field_by_sample() pass a const int32_t -- the value is implicitly converted.
//
// Field-level check used by the bulk setters.
171  void validate_individual_field(const int32_t field_index, const uint32_t provided_type, const bool allow_gt) const;
// Field-and-sample check used by the per-sample setters.
172  void validate_individual_field(const int32_t field_index, const int32_t sample_index, const uint32_t provided_type, const bool allow_gt) const;
// Check used by remove_individual_field() before a field is removed.
173  void validate_individual_field_existence(const int32_t field_index) const;
174 
175  template<class ELEMENT_TYPE>
176  void validate_multi_sample_vector_length(const std::vector<std::vector<ELEMENT_TYPE>>& vec) const {
177  // Empty vectors explicitly allowed; non-empty vectors must have a size EQUAL to the # of samples
178  if ( vec.size() != m_header.n_samples() && ! vec.empty() ) {
179  throw std::invalid_argument(std::string{"Number of elements in non-empty vector of vectors for individual field ("} + std::to_string(vec.size()) + ") not equal to the number of samples (" + std::to_string(m_header.n_samples()) + ")");
180  }
181  }
182 
183  template<class ELEMENT_TYPE>
184  void validate_multi_sample_vector_length(const std::vector<ELEMENT_TYPE>& vec) const {
185  const auto num_samples = m_header.n_samples();
186 
187  // Empty vectors explicitly allowed; non-empty vectors must have a size DIVISIBLE by # of samples
188  if ( vec.size() % num_samples != 0 ) {
189  throw std::invalid_argument(std::string{"Number of elements in flattened vector for individual field ("} + std::to_string(vec.size()) + ") not divisible by number of samples (" + std::to_string(num_samples) + ")");
190  }
191  }
192 
193  // need to specialize for the vector of string case
194  void validate_multi_sample_vector_length(const std::vector<std::string>& vec) const {
195  // Empty vectors explicitly allowed; non-empty vectors must have a size EQUAL to the # of samples
196  if ( vec.size() != m_header.n_samples() && ! vec.empty() ) {
197  throw std::invalid_argument(std::string{"Number of elements in non-empty vector for individual field ("} + std::to_string(vec.size()) + ") not equal to the number of samples (" + std::to_string(m_header.n_samples()) + ")");
198  }
199  }
200 
201  void update_present_field_count(const bool field_was_already_present, const bool field_currently_present) {
202  if ( ! field_was_already_present && field_currently_present ) {
203  ++m_num_present_fields;
204  }
205  else if ( field_was_already_present && ! field_currently_present ) {
206  --m_num_present_fields;
207  }
208  }
209 };
210 
211 }
212 
213 #endif /* gamgee__variant_builder_individual_region__guard */
void set_genotype_field_by_sample(const FIELD_ID_TYPE &field_id, const SAMPLE_ID_TYPE &sample_id, const FIELD_VALUE_TYPE *field_values, const uint32_t num_field_values)
Definition: variant_builder_individual_region.h:62
#define BCF_HT_REAL
Definition: vcf.h:55
int32_t gt_index() const
Definition: variant_builder_individual_region.h:35
void bulk_set_integer_field(const FIELD_ID_TYPE &field_id, BULK_FIELD_VALUES_TYPE &&field_values)
Definition: variant_builder_individual_region.h:46
void set_string_field_by_sample(const FIELD_ID_TYPE &field_id, const SAMPLE_ID_TYPE &sample_id, const FIELD_VALUE_TYPE *field_values, const uint32_t num_field_values)
Definition: variant_builder_individual_region.h:79
void remove_individual_field(const FIELD_ID_TYPE &field_id)
Definition: variant_builder_individual_region.h:84
void bulk_set_genotype_field(const FIELD_ID_TYPE &field_id, BULK_FIELD_VALUES_TYPE &&field_values)
Definition: variant_builder_individual_region.h:40
void bulk_set_float_field(const FIELD_ID_TYPE &field_id, BULK_FIELD_VALUES_TYPE &&field_values)
Definition: variant_builder_individual_region.h:52
void bulk_set_string_field(const FIELD_ID_TYPE &field_id, BULK_FIELD_VALUES_TYPE &&field_values)
Definition: variant_builder_individual_region.h:57
int32_t field_index(const std::string &tag) const
looks up the index of a particular filter, shared or individual field tag, enabling subsequent O(1) r...
Definition: variant_header.h:219
Definition: bgzf.h:69
void set_integer_field_by_sample(const FIELD_ID_TYPE &field_id, const SAMPLE_ID_TYPE &sample_id, const FIELD_VALUE_TYPE *field_values, const uint32_t num_field_values)
Definition: variant_builder_individual_region.h:68
void set_enable_validation(const bool enable_validation)
Definition: variant_builder_individual_region.h:33
uint32_t num_present_fields() const
Definition: variant_builder_individual_region.h:36
bool modified() const
Definition: variant_builder_individual_region.h:37
#define BCF_HT_STR
Definition: vcf.h:56
void set_float_field_by_sample(const FIELD_ID_TYPE &field_id, const SAMPLE_ID_TYPE &sample_id, const FIELD_VALUE_TYPE *field_values, const uint32_t num_field_values)
Definition: variant_builder_individual_region.h:74
Definition: exceptions.h:9
Helper class for VariantBuilder to manage the fields belonging to the individual region of Variant re...
Definition: variant_builder_individual_region.h:23
VariantBuilderIndividualRegion & operator=(VariantBuilderIndividualRegion &&other)=default
void encode_into(kstring_t *buffer) const
Encode all individual fields into the provided byte buffer in the proper order and format for final i...
Definition: variant_builder_individual_region.cpp:103
uint32_t n_samples() const
Definition: variant_header.h:72
Utility class to hold a variant header.
Definition: variant_header.h:52
VariantBuilderIndividualRegion(const VariantHeader &header, const bool enable_validation)
Definition: variant_builder_individual_region.cpp:16
#define BCF_HT_INT
Definition: vcf.h:54
int32_t sample_index(const std::string &sample) const
looks up the index of a particular sample, enabling subsequent O(1) random-access lookups for that sa...
Definition: variant_header.h:229
uint32_t estimate_total_size() const
Produce a slight overestimate of the total size of the encoded data for this individual region...
Definition: variant_builder_individual_region.cpp:83
void clear()
Reset the individual region to a pristine state with no field data.
Definition: variant_builder_individual_region.cpp:133