Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
variant_builder_individual_field.h
Go to the documentation of this file.
1 #ifndef gamgee__variant_builder_individual_field__guard
2 #define gamgee__variant_builder_individual_field__guard
3 
4 #include "htslib/kstring.h"
5 #include "htslib/vcf.h"
6 
7 #include "../missing.h"
8 #include "../utils/hts_memory.h"
9 #include "../utils/short_value_optimized_storage.h"
10 
11 #include <string>
12 #include <vector>
13 #include <algorithm>
14 #include <stdexcept>
15 
16 namespace gamgee {
17 
35 template<class ENCODED_TYPE, class BULK_CHANGE_TYPE>
37  public:
38  explicit VariantBuilderIndividualField(const uint32_t num_samples, const uint32_t field_index, const int32_t field_type,
39  const ENCODED_TYPE missing_value, const ENCODED_TYPE end_of_vector_value,
40  const uint32_t short_value_upper_bound) :
41  m_field_index {field_index},
42  m_field_type {field_type},
43  m_num_samples {num_samples},
44  m_max_sample_value_length { 0 },
45  m_missing_value {missing_value},
46  m_end_of_vector_value {end_of_vector_value},
47  m_flattened_bulk_changes{},
48  m_nested_bulk_changes{},
49  m_per_sample_changes {num_samples, short_value_upper_bound},
50  m_removed { false }
51  {}
52 
58 
64  void set_entire_field(std::vector<BULK_CHANGE_TYPE>&& bulk_changes) {
65  m_flattened_bulk_changes = std::move(bulk_changes);
66  if ( ! m_nested_bulk_changes.empty() ) m_nested_bulk_changes.clear();
67  m_max_sample_value_length = max_sample_value_length(m_flattened_bulk_changes);
68  m_removed = false;
69  }
70 
71  void set_entire_field(const std::vector<BULK_CHANGE_TYPE>& bulk_changes) {
72  m_flattened_bulk_changes = bulk_changes;
73  if ( ! m_nested_bulk_changes.empty() ) m_nested_bulk_changes.clear();
74  m_max_sample_value_length = max_sample_value_length(m_flattened_bulk_changes);
75  m_removed = false;
76  }
77 
78  void set_entire_field(std::vector<std::vector<BULK_CHANGE_TYPE>>&& bulk_changes) {
79  m_nested_bulk_changes = std::move(bulk_changes);
80  if ( ! m_flattened_bulk_changes.empty() ) m_flattened_bulk_changes.clear();
81  m_max_sample_value_length = max_sample_value_length(m_nested_bulk_changes);
82  m_removed = false;
83  }
84  void set_entire_field(const std::vector<std::vector<BULK_CHANGE_TYPE>>& bulk_changes) {
85  m_nested_bulk_changes = bulk_changes;
86  if ( ! m_flattened_bulk_changes.empty() ) m_flattened_bulk_changes.clear();
87  m_max_sample_value_length = max_sample_value_length(m_nested_bulk_changes);
88  m_removed = false;
89  }
90 
94  void set_sample_field_value(const uint32_t sample_index, const ENCODED_TYPE* values, const uint32_t num_values) {
95  m_per_sample_changes.set(sample_index, values, num_values);
96  m_max_sample_value_length = m_per_sample_changes.max_value_length();
97  m_removed = false;
98  }
99 
100  uint32_t field_index() const { return m_field_index; }
101  int32_t field_type() const { return m_field_type; }
102 
103  void remove() { clear(); m_removed = true; }
104  bool removed() const { return m_removed; }
105  bool missing() const { return m_max_sample_value_length == 0u; }
106  bool present() const { return ! missing() && ! removed(); }
107 
108  bool has_bulk_changes() const { return ! m_flattened_bulk_changes.empty() || ! m_nested_bulk_changes.empty(); }
109  bool has_per_sample_changes() const { return m_per_sample_changes.num_values() > 0; }
110 
114  void clear() {
115  if ( has_bulk_changes() ) {
116  m_flattened_bulk_changes.clear();
117  m_nested_bulk_changes.clear();
118  }
119 
120  if ( has_per_sample_changes() ) {
121  m_per_sample_changes.clear();
122  }
123  m_max_sample_value_length = 0;
124 
125  m_removed = false;
126  }
127 
132  uint32_t estimated_encoded_size() const {
133  // max possible size required by encoded field index + typing byte
134  constexpr uint32_t max_metadata_overhead = 11;
135 
136  // note that for ints we (typically) overestimate by assuming the worst-case of int32
137  return present() ? (m_max_sample_value_length * sizeof(ENCODED_TYPE) * m_num_samples) + max_metadata_overhead : 0;
138  }
139 
143  void encode_into(kstring_t* destination) const {
144  if ( ! present() ) {
145  return;
146  }
147 
148  // Branch based on how the un-encoded field values are being stored
150  throw std::logic_error("Cannot set an individual field both in bulk and by sample");
151  }
152  else if ( ! m_flattened_bulk_changes.empty() ) {
153  encode_into(destination, m_flattened_bulk_changes);
154  }
155  else if ( ! m_nested_bulk_changes.empty() ) {
156  encode_into(destination, m_nested_bulk_changes);
157  }
158  else if ( has_per_sample_changes() ) {
159  encode_into(destination, m_per_sample_changes);
160  }
161  else {
162  throw std::logic_error(std::string{"Encoding requested, but nothing to encode for individual field: "} + std::to_string(m_field_index));
163  }
164  }
165 
166  private:
167  uint32_t m_field_index;
168  int32_t m_field_type;
169  uint32_t m_num_samples;
170  uint32_t m_max_sample_value_length;
171  ENCODED_TYPE m_missing_value;
172  ENCODED_TYPE m_end_of_vector_value;
173  std::vector<BULK_CHANGE_TYPE> m_flattened_bulk_changes;
174  std::vector<std::vector<BULK_CHANGE_TYPE>> m_nested_bulk_changes;
176  bool m_removed;
177 
186  void encode_into(kstring_t* destination, const std::vector<int32_t>& values) const {
187  bcf_enc_int1(destination, m_field_index);
188 
189  // Since the user has provided us with a pre-flattened/padded set of values, we can just pass
190  // it to htslib directly for encoding.
191  // N.B. VariantBuilder has already ensured that values.size() is divisible by num_samples (unless validation is turned off)
192  bcf_enc_vint(destination, values.size(), const_cast<int32_t*>(&(values[0])), values.size() / m_num_samples);
193  }
194 
195  void encode_into(kstring_t* destination, const std::vector<std::vector<int32_t>>& values) const {
196  auto min_value = INT32_MAX;
197  auto max_value = INT32_MIN + 1;
198 
199  // We could just create a flattened vector and then pass it directly to htslib to spare us all this
200  // low-level work, but it would be less efficient
201 
202  // First find the min/max values
203  for ( const auto& sample_values : values ) {
204  if ( sample_values.size() > 0 ) find_min_max_int_values(&(sample_values[0]), sample_values.size(), min_value, max_value);
205  }
206 
207  // Then determine the resulting encoded type (int8, int16, or int32)
208  const auto encoded_type = utils::int_encoded_type(min_value, max_value);
209 
210  // Encode the field index and type descriptor
211  bcf_enc_int1(destination, m_field_index);
212  bcf_enc_size(destination, m_max_sample_value_length, encoded_type);
213 
214  // Encode values for each sample, padding as necessary to the max length
215  for ( const auto& sample_values : values ) {
216  // OK to pass in nullptr to the encoder if a sample has no values
217  encode_and_pad_int_values_as_type(destination, sample_values.empty() ? nullptr : &(sample_values[0]), sample_values.size(), m_max_sample_value_length, encoded_type);
218  }
219  }
220 
221  void encode_into(kstring_t* destination, const utils::ShortValueOptimizedStorage<int32_t>& values) const {
222  auto max_length = values.max_value_length();
223  auto min_value = INT32_MAX;
224  auto max_value = INT32_MIN + 1;
225 
226  // We could just create a flattened vector and then pass it directly to htslib to spare us all this
227  // low-level work, but it would be less efficient
228 
229  // First find the min/max values
230  for ( auto i = 0u; i < values.capacity(); ++i ) {
231  auto sample_value = values.get(i);
232  if ( sample_value.second > 0 ) find_min_max_int_values(sample_value.first, sample_value.second, min_value, max_value);
233  }
234 
235  // Then determine the resulting encoded type (int8, int16, or int32)
236  const auto encoded_type = utils::int_encoded_type(min_value, max_value);
237 
238  // Encode field index and type descriptor
239  bcf_enc_int1(destination, m_field_index);
240  bcf_enc_size(destination, max_length, encoded_type);
241 
242  // Encode values for each sample, padding as necessary to the max length
243  for ( auto i = 0u; i < values.capacity(); ++i ) {
244  auto sample_value = values.get(i);
245  // OK to pass in nullptr to the encoder if a sample has no values
246  encode_and_pad_int_values_as_type(destination, sample_value.second == 0 ? nullptr : sample_value.first, sample_value.second, max_length, encoded_type);
247  }
248  }
249 
250  void encode_into(kstring_t* destination, const std::vector<float>& values) const {
251  bcf_enc_int1(destination, m_field_index);
252 
253  // Since the user has provided us with a pre-flattened/padded set of values, we can just pass
254  // it to htslib directly for encoding.
255  // N.B. VariantBuilder has already ensured that values.size() is divisible by num_samples (unless validation is turned off)
256  bcf_enc_size(destination, values.size() / m_num_samples, BCF_BT_FLOAT);
257  kputsn(reinterpret_cast<const char*>(&(values[0])), values.size() * sizeof(float), destination);
258  }
259 
260  void encode_into(kstring_t* destination, const std::vector<std::vector<float>>& values) const {
261  // Encode field index and type descriptor
262  bcf_enc_int1(destination, m_field_index);
263  bcf_enc_size(destination, m_max_sample_value_length, BCF_BT_FLOAT);
264 
265  // Then encode values for each sample, padding as necessary to the max length
266  for ( auto& sample_values : values ) {
267  encode_and_pad_sample_values(destination, &(sample_values[0]), sample_values.size(), m_max_sample_value_length);
268  }
269  }
270 
271  void encode_into(kstring_t* destination, const utils::ShortValueOptimizedStorage<float>& values) const {
272  const auto max_length = values.max_value_length();
273 
274  // Encode field index and type descriptor
275  bcf_enc_int1(destination, m_field_index);
276  bcf_enc_size(destination, max_length, BCF_BT_FLOAT);
277 
278  // Encode per-sample values, padding as necessary to the max length
279  for ( auto i = 0u; i < values.capacity(); ++i ) {
280  const auto sample_values = values.get(i);
281  encode_and_pad_sample_values(destination, sample_values.first, sample_values.second, max_length);
282  }
283  }
284 
285  void encode_into(kstring_t* destination, const std::vector<std::string>& values) const {
286  // Encode field index and type descriptor
287  bcf_enc_int1(destination, m_field_index);
288  bcf_enc_size(destination, m_max_sample_value_length, BCF_BT_CHAR);
289 
290  // Encode per-sample values, padding as necessary to the max length
291  for ( const auto& str : values ) {
292  encode_and_pad_sample_values(destination, str.c_str(), str.length(), m_max_sample_value_length);
293  }
294  }
295 
296  void encode_into(kstring_t* destination, const std::vector<std::vector<std::string>>& values) const {
297  // this one is not possible, but is needed in order to allow the string instantiation of the template to compile
298  throw std::logic_error("nested vectors of strings not supported");
299  }
300 
301  void encode_into(kstring_t* destination, const utils::ShortValueOptimizedStorage<char>& values) const {
302  const auto max_length = values.max_value_length();
303 
304  // Encode field index and type descriptor
305  bcf_enc_int1(destination, m_field_index);
306  bcf_enc_size(destination, max_length, BCF_BT_CHAR);
307 
308  // Encode per-sample values, padding as necessary to the max length
309  for ( auto i = 0u; i < values.capacity(); ++i ) {
310  const auto sample_values = values.get(i);
311  encode_and_pad_sample_values(destination, sample_values.first, sample_values.second, max_length);
312  }
313  }
314 
319  void encode_and_pad_sample_values(kstring_t* destination, const ENCODED_TYPE* sample_values, const uint32_t num_values, const uint32_t field_width) const {
320  // This is how much padding we usually need (exception is the missing sample case handled below)
321  auto pad_size = field_width - num_values;
322 
323  // If this sample has values, copy them first
324  if ( num_values > 0 ) {
325  kputsn(reinterpret_cast<const char*>(sample_values), num_values * sizeof(ENCODED_TYPE), destination);
326  }
327  else {
328  // If there are no values for this sample, we need to write a single missing value,
329  // then pad with field_width - 1 end of vector values
330  kputsn(reinterpret_cast<const char*>(&m_missing_value), sizeof(ENCODED_TYPE), destination);
331  pad_size = field_width - 1;
332  }
333 
334  // Pad with appropriate number of end of vector values
335  pad_field(destination, reinterpret_cast<const char*>(&m_end_of_vector_value), sizeof(ENCODED_TYPE), pad_size);
336  }
337 
341  void encode_and_pad_int_values_as_type(kstring_t* destination, const int32_t* values, const uint32_t num_values, const uint32_t field_width, const uint8_t target_type) const {
342  switch ( target_type ) {
343  case BCF_BT_INT8:
344  transcode_to_int8(destination, values, num_values, field_width);
345  break;
346  case BCF_BT_INT16:
347  transcode_to_int16(destination, values, num_values, field_width);
348  break;
349  case BCF_BT_INT32:
350  encode_and_pad_sample_values(destination, values, num_values, field_width);
351  break;
352  default:
353  throw std::logic_error("Invalid target type in encode_and_pad_int_values_as_type()");
354  }
355  }
356 
361  void transcode_to_int8(kstring_t* destination, const int32_t* values, const uint32_t num_values, const uint32_t field_width) const {
362  // missing value will vary depending on whether we're in the GT field or not
363  const int8_t int8_missing = m_missing_value == bcf_int32_missing ? bcf_int8_missing : int8_t(m_missing_value);
364  const int8_t int8_vector_end = bcf_int8_vector_end; // need this to be an lvalue rather than a macro so we can take its address
365  auto pad_size = field_width - num_values;
366 
367  // If there are values for this sample, convert them one at a time to int8
368  if ( num_values > 0 ) {
369  for ( auto i = 0u; i < num_values; ++i ) {
370  if ( values[i] == m_end_of_vector_value ) kputc(int8_vector_end, destination);
371  else if ( values[i] == m_missing_value ) kputc(int8_missing, destination);
372  else kputc( values[i], destination);
373  }
374  }
375  else {
376  // If there are no values for this sample, we'll write an int8_missing followed by field_width - 1 int8_vector_ends
377  kputc(int8_missing, destination);
378  pad_size = field_width - 1;
379  }
380 
381  // Pad field with appropriate number of vector end values
382  pad_field(destination, reinterpret_cast<const char*>(&int8_vector_end), 1, pad_size);
383  }
384 
389  void transcode_to_int16(kstring_t* destination, const int32_t* values, const uint32_t num_values, const uint32_t field_width) const {
390  // missing value will vary depending on whether we're in the GT field or not
391  const int16_t int16_missing = m_missing_value == bcf_int32_missing ? bcf_int16_missing : int16_t(m_missing_value);
392  const int16_t int16_vector_end = bcf_int16_vector_end; // need this to be an lvalue rather than a macro so we can take its address
393  auto pad_size = field_width - num_values;
394  int16_t int16_val = 0;
395 
396  // If there are values for this sample, convert them one at a time to int16
397  if ( num_values > 0 ) {
398  for ( auto i = 0u; i < num_values; ++i ) {
399  if ( values[i] == m_end_of_vector_value ) int16_val = int16_vector_end;
400  else if ( values[i] == m_missing_value ) int16_val = int16_missing;
401  else int16_val = int16_t(values[i]);
402  kputsn(reinterpret_cast<const char*>(&int16_val), 2, destination);
403  }
404  }
405  else {
406  // If there are no values for this sample, we'll write an int16_missing followed by field_width - 1 int16_vector_ends
407  kputsn(reinterpret_cast<const char*>(&int16_missing), 2, destination);
408  pad_size = field_width - 1;
409  }
410 
411  // Pad field with appropriate number of vector end values
412  pad_field(destination, reinterpret_cast<const char*>(&int16_vector_end), 2, pad_size);
413  }
414 
415  void pad_field(kstring_t* destination, const char* padding, const uint32_t padding_bytes, const uint32_t pad_size) const {
416  for ( auto i = 0u; i < pad_size; ++i ) {
417  kputsn(padding, padding_bytes, destination);
418  }
419  }
420 
425  void find_min_max_int_values(const int32_t* values, const uint32_t num_values, int32_t& min, int32_t& max) const {
426  // N.B. can't use std::minmax_element() here since we need to explicitly exclude the missing and vector end values
427  std::for_each(values, values + num_values, [&min, &max] (int32_t val) {
428  if ( val != bcf_int32_missing && val != bcf_int32_vector_end ) {
429  if ( val > max ) max = val;
430  if ( val < min ) min = val;
431  }
432  });
433  }
434 
439  template<class T>
440  uint32_t max_sample_value_length(const std::vector<T>& sample_data) const {
441  return sample_data.size() / m_num_samples;
442  }
443 
447  uint32_t max_sample_value_length(const std::vector<std::string>& sample_data) const {
448  auto max_length = 0u;
449  std::for_each(sample_data.begin(), sample_data.end(), [&max_length] (const std::string& str) {
450  if ( str.length() > max_length ) max_length = str.length();
451  });
452  return max_length;
453  }
454 
458  template<class T>
459  uint32_t max_sample_value_length(const std::vector<std::vector<T>>& sample_data) const {
460  auto max_length = 0u;
461  std::for_each(sample_data.begin(), sample_data.end(), [&max_length] (const std::vector<T>& sample_values) {
462  if ( sample_values.size() > max_length ) max_length = sample_values.size();
463  });
464  return max_length;
465  }
466 };
467 
468 }
469 
470 #endif /* gamgee__variant_builder_individual_field__guard */
#define BCF_BT_FLOAT
Definition: vcf.h:123
void clear(const uint32_t index)
Clear the value at a specific index.
Definition: short_value_optimized_storage.h:128
uint32_t field_index() const
Definition: variant_builder_individual_field.h:100
#define BCF_BT_INT32
Definition: vcf.h:122
bool has_bulk_changes() const
Definition: variant_builder_individual_field.h:108
#define bcf_int8_vector_end
Definition: vcf.h:750
#define bcf_int32_missing
Definition: vcf.h:756
Helper class for VariantBuilder to manage the storage and encoding of a single multi-sample individua...
Definition: variant_builder_individual_field.h:36
bool present() const
Definition: variant_builder_individual_field.h:106
bool has_per_sample_changes() const
Definition: variant_builder_individual_field.h:109
void set_entire_field(const std::vector< BULK_CHANGE_TYPE > &bulk_changes)
Definition: variant_builder_individual_field.h:71
void encode_into(kstring_t *destination) const
Encode this field's data into the provided buffer. If field has no data or was removed, do nothing.
Definition: variant_builder_individual_field.h:143
void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
Definition: vcf.c:1371
VariantBuilderIndividualField & operator=(VariantBuilderIndividualField &&other)=default
#define BCF_BT_INT8
Definition: vcf.h:120
void set_entire_field(const std::vector< std::vector< BULK_CHANGE_TYPE >> &bulk_changes)
Definition: variant_builder_individual_field.h:84
void set_entire_field(std::vector< BULK_CHANGE_TYPE > &&bulk_changes)
Definition: variant_builder_individual_field.h:64
int32_t field_type() const
Definition: variant_builder_individual_field.h:101
#define bcf_int16_vector_end
Definition: vcf.h:751
Definition: bgzf.h:69
bool removed() const
Definition: variant_builder_individual_field.h:104
uint32_t max_value_length() const
Returns the length of the longest value in the container.
Definition: short_value_optimized_storage.h:67
#define bcf_int32_vector_end
Definition: vcf.h:752
void set_entire_field(std::vector< std::vector< BULK_CHANGE_TYPE >> &&bulk_changes)
Definition: variant_builder_individual_field.h:78
uint32_t estimated_encoded_size() const
Provide an estimate (typically an overestimate) of the number of bytes this field will require when e...
Definition: variant_builder_individual_field.h:132
VariantBuilderIndividualField(const uint32_t num_samples, const uint32_t field_index, const int32_t field_type, const ENCODED_TYPE missing_value, const ENCODED_TYPE end_of_vector_value, const uint32_t short_value_upper_bound)
Definition: variant_builder_individual_field.h:38
#define str(x)
Definition: sam.c:66
Definition: exceptions.h:9
void set(const uint32_t index, const std::vector< ELEMENT_TYPE > &values)
Set the value at the specified index by vector.
Definition: short_value_optimized_storage.h:95
uint8_t int_encoded_type(const int32_t min_val, const int32_t max_val)
Given a min and max value, determines whether int8, int16, or int32 BCF encoding is required...
Definition: hts_memory.cpp:161
#define BCF_BT_INT16
Definition: vcf.h:121
#define bcf_int16_missing
Definition: vcf.h:755
uint32_t num_values() const
Returns the number of values in the container.
Definition: short_value_optimized_storage.h:57
void clear()
Reset this field to a pristine state with no data.
Definition: variant_builder_individual_field.h:114
void set_sample_field_value(const uint32_t sample_index, const ENCODED_TYPE *values, const uint32_t num_values)
Stores a value for just a single sample efficiently using the ShortValueOptimizedStorage layer...
Definition: variant_builder_individual_field.h:94
bool missing() const
Definition: variant_builder_individual_field.h:105
#define BCF_BT_CHAR
Definition: vcf.h:124
#define bcf_int8_missing
Definition: vcf.h:754