1 #ifndef gamgee__variant_builder_individual_field__guard
2 #define gamgee__variant_builder_individual_field__guard
7 #include "../missing.h"
8 #include "../utils/hts_memory.h"
9 #include "../utils/short_value_optimized_storage.h"
35 template<
class ENCODED_TYPE,
class BULK_CHANGE_TYPE>
39 const ENCODED_TYPE missing_value,
const ENCODED_TYPE end_of_vector_value,
40 const uint32_t short_value_upper_bound) :
41 m_field_index {field_index},
43 m_num_samples {num_samples},
44 m_max_sample_value_length { 0 },
45 m_missing_value {missing_value},
46 m_end_of_vector_value {end_of_vector_value},
47 m_flattened_bulk_changes{},
48 m_nested_bulk_changes{},
49 m_per_sample_changes {num_samples, short_value_upper_bound},
65 m_flattened_bulk_changes = std::move(bulk_changes);
66 if ( ! m_nested_bulk_changes.empty() ) m_nested_bulk_changes.clear();
67 m_max_sample_value_length = max_sample_value_length(m_flattened_bulk_changes);
72 m_flattened_bulk_changes = bulk_changes;
73 if ( ! m_nested_bulk_changes.empty() ) m_nested_bulk_changes.clear();
74 m_max_sample_value_length = max_sample_value_length(m_flattened_bulk_changes);
79 m_nested_bulk_changes = std::move(bulk_changes);
80 if ( ! m_flattened_bulk_changes.empty() ) m_flattened_bulk_changes.clear();
81 m_max_sample_value_length = max_sample_value_length(m_nested_bulk_changes);
84 void set_entire_field(
const std::vector<std::vector<BULK_CHANGE_TYPE>>& bulk_changes) {
85 m_nested_bulk_changes = bulk_changes;
86 if ( ! m_flattened_bulk_changes.empty() ) m_flattened_bulk_changes.clear();
87 m_max_sample_value_length = max_sample_value_length(m_nested_bulk_changes);
95 m_per_sample_changes.
set(sample_index, values, num_values);
103 void remove() {
clear(); m_removed =
true; }
105 bool missing()
const {
return m_max_sample_value_length == 0u; }
108 bool has_bulk_changes()
const {
return ! m_flattened_bulk_changes.empty() || ! m_nested_bulk_changes.empty(); }
116 m_flattened_bulk_changes.clear();
117 m_nested_bulk_changes.clear();
121 m_per_sample_changes.
clear();
123 m_max_sample_value_length = 0;
134 constexpr uint32_t max_metadata_overhead = 11;
137 return present() ? (m_max_sample_value_length *
sizeof(ENCODED_TYPE) * m_num_samples) + max_metadata_overhead : 0;
150 throw std::logic_error(
"Cannot set an individual field both in bulk and by sample");
152 else if ( ! m_flattened_bulk_changes.empty() ) {
153 encode_into(destination, m_flattened_bulk_changes);
155 else if ( ! m_nested_bulk_changes.empty() ) {
162 throw std::logic_error(std::string{
"Encoding requested, but nothing to encode for individual field: "} + std::to_string(m_field_index));
167 uint32_t m_field_index;
168 int32_t m_field_type;
169 uint32_t m_num_samples;
170 uint32_t m_max_sample_value_length;
171 ENCODED_TYPE m_missing_value;
172 ENCODED_TYPE m_end_of_vector_value;
173 std::vector<BULK_CHANGE_TYPE> m_flattened_bulk_changes;
174 std::vector<std::vector<BULK_CHANGE_TYPE>> m_nested_bulk_changes;
187 bcf_enc_int1(destination, m_field_index);
192 bcf_enc_vint(destination, values.size(),
const_cast<int32_t*
>(&(values[0])), values.size() / m_num_samples);
195 void encode_into(
kstring_t* destination,
const std::vector<std::vector<int32_t>>& values)
const {
196 auto min_value = INT32_MAX;
197 auto max_value = INT32_MIN + 1;
203 for (
const auto& sample_values : values ) {
204 if ( sample_values.size() > 0 ) find_min_max_int_values(&(sample_values[0]), sample_values.size(), min_value, max_value);
211 bcf_enc_int1(destination, m_field_index);
212 bcf_enc_size(destination, m_max_sample_value_length, encoded_type);
215 for (
const auto& sample_values : values ) {
217 encode_and_pad_int_values_as_type(destination, sample_values.empty() ?
nullptr : &(sample_values[0]), sample_values.size(), m_max_sample_value_length, encoded_type);
221 void encode_into(
kstring_t* destination,
const utils::ShortValueOptimizedStorage<int32_t>& values)
const {
222 auto max_length = values.max_value_length();
223 auto min_value = INT32_MAX;
224 auto max_value = INT32_MIN + 1;
230 for (
auto i = 0u; i < values.capacity(); ++i ) {
231 auto sample_value = values.get(i);
232 if ( sample_value.second > 0 ) find_min_max_int_values(sample_value.first, sample_value.second, min_value, max_value);
239 bcf_enc_int1(destination, m_field_index);
240 bcf_enc_size(destination, max_length, encoded_type);
243 for (
auto i = 0u; i < values.capacity(); ++i ) {
244 auto sample_value = values.get(i);
246 encode_and_pad_int_values_as_type(destination, sample_value.second == 0 ?
nullptr : sample_value.first, sample_value.second, max_length, encoded_type);
251 bcf_enc_int1(destination, m_field_index);
256 bcf_enc_size(destination, values.size() / m_num_samples,
BCF_BT_FLOAT);
257 kputsn(reinterpret_cast<const char*>(&(values[0])), values.size() *
sizeof(float), destination);
262 bcf_enc_int1(destination, m_field_index);
263 bcf_enc_size(destination, m_max_sample_value_length,
BCF_BT_FLOAT);
266 for (
auto& sample_values : values ) {
267 encode_and_pad_sample_values(destination, &(sample_values[0]), sample_values.size(), m_max_sample_value_length);
271 void encode_into(
kstring_t* destination,
const utils::ShortValueOptimizedStorage<float>& values)
const {
272 const auto max_length = values.max_value_length();
275 bcf_enc_int1(destination, m_field_index);
279 for (
auto i = 0u; i < values.capacity(); ++i ) {
280 const auto sample_values = values.get(i);
281 encode_and_pad_sample_values(destination, sample_values.first, sample_values.second, max_length);
287 bcf_enc_int1(destination, m_field_index);
288 bcf_enc_size(destination, m_max_sample_value_length,
BCF_BT_CHAR);
291 for (
const auto&
str : values ) {
292 encode_and_pad_sample_values(destination,
str.c_str(),
str.length(), m_max_sample_value_length);
296 void encode_into(
kstring_t* destination,
const std::vector<std::vector<std::string>>& values)
const {
298 throw std::logic_error(
"nested vectors of strings not supported");
301 void encode_into(
kstring_t* destination,
const utils::ShortValueOptimizedStorage<char>& values)
const {
302 const auto max_length = values.max_value_length();
305 bcf_enc_int1(destination, m_field_index);
306 bcf_enc_size(destination, max_length,
BCF_BT_CHAR);
309 for (
auto i = 0u; i < values.capacity(); ++i ) {
310 const auto sample_values = values.get(i);
311 encode_and_pad_sample_values(destination, sample_values.first, sample_values.second, max_length);
319 void encode_and_pad_sample_values(
kstring_t* destination,
const ENCODED_TYPE* sample_values,
const uint32_t num_values,
const uint32_t field_width)
const {
321 auto pad_size = field_width - num_values;
324 if ( num_values > 0 ) {
325 kputsn(reinterpret_cast<const char*>(sample_values), num_values *
sizeof(ENCODED_TYPE), destination);
330 kputsn(reinterpret_cast<const char*>(&m_missing_value),
sizeof(ENCODED_TYPE), destination);
331 pad_size = field_width - 1;
335 pad_field(destination, reinterpret_cast<const char*>(&m_end_of_vector_value),
sizeof(ENCODED_TYPE), pad_size);
341 void encode_and_pad_int_values_as_type(
kstring_t* destination,
const int32_t* values,
const uint32_t num_values,
const uint32_t field_width,
const uint8_t target_type)
const {
342 switch ( target_type ) {
344 transcode_to_int8(destination, values, num_values, field_width);
347 transcode_to_int16(destination, values, num_values, field_width);
350 encode_and_pad_sample_values(destination, values, num_values, field_width);
353 throw std::logic_error(
"Invalid target type in encode_and_pad_int_values_as_type()");
361 void transcode_to_int8(
kstring_t* destination,
const int32_t* values,
const uint32_t num_values,
const uint32_t field_width)
const {
365 auto pad_size = field_width - num_values;
368 if ( num_values > 0 ) {
369 for (
auto i = 0u; i < num_values; ++i ) {
370 if ( values[i] == m_end_of_vector_value ) kputc(int8_vector_end, destination);
371 else if ( values[i] == m_missing_value ) kputc(int8_missing, destination);
372 else kputc( values[i], destination);
377 kputc(int8_missing, destination);
378 pad_size = field_width - 1;
382 pad_field(destination, reinterpret_cast<const char*>(&int8_vector_end), 1, pad_size);
389 void transcode_to_int16(
kstring_t* destination,
const int32_t* values,
const uint32_t num_values,
const uint32_t field_width)
const {
393 auto pad_size = field_width - num_values;
394 int16_t int16_val = 0;
397 if ( num_values > 0 ) {
398 for (
auto i = 0u; i < num_values; ++i ) {
399 if ( values[i] == m_end_of_vector_value ) int16_val = int16_vector_end;
400 else if ( values[i] == m_missing_value ) int16_val = int16_missing;
401 else int16_val = int16_t(values[i]);
402 kputsn(reinterpret_cast<const char*>(&int16_val), 2, destination);
407 kputsn(reinterpret_cast<const char*>(&int16_missing), 2, destination);
408 pad_size = field_width - 1;
412 pad_field(destination, reinterpret_cast<const char*>(&int16_vector_end), 2, pad_size);
415 void pad_field(
kstring_t* destination,
const char* padding,
const uint32_t padding_bytes,
const uint32_t pad_size)
const {
416 for (
auto i = 0u; i < pad_size; ++i ) {
417 kputsn(padding, padding_bytes, destination);
425 void find_min_max_int_values(
const int32_t* values,
const uint32_t num_values, int32_t& min, int32_t& max)
const {
427 std::for_each(values, values + num_values, [&min, &max] (int32_t val) {
429 if ( val > max ) max = val;
430 if ( val < min ) min = val;
440 uint32_t max_sample_value_length(
const std::vector<T>& sample_data)
const {
441 return sample_data.size() / m_num_samples;
447 uint32_t max_sample_value_length(
const std::vector<std::string>& sample_data)
const {
448 auto max_length = 0u;
449 std::for_each(sample_data.begin(), sample_data.end(), [&max_length] (
const std::string&
str) {
450 if (
str.length() > max_length ) max_length =
str.length();
459 uint32_t max_sample_value_length(
const std::vector<std::vector<T>>& sample_data)
const {
460 auto max_length = 0u;
461 std::for_each(sample_data.begin(), sample_data.end(), [&max_length] (
const std::vector<T>& sample_values) {
462 if ( sample_values.size() > max_length ) max_length = sample_values.size();
#define BCF_BT_FLOAT
Definition: vcf.h:123
void clear(const uint32_t index)
Clear the value at a specific index.
Definition: short_value_optimized_storage.h:128
uint32_t field_index() const
Definition: variant_builder_individual_field.h:100
#define BCF_BT_INT32
Definition: vcf.h:122
bool has_bulk_changes() const
Definition: variant_builder_individual_field.h:108
#define bcf_int8_vector_end
Definition: vcf.h:750
#define bcf_int32_missing
Definition: vcf.h:756
Helper class for VariantBuilder to manage the storage and encoding of a single multi-sample individual field.
Definition: variant_builder_individual_field.h:36
bool present() const
Definition: variant_builder_individual_field.h:106
bool has_per_sample_changes() const
Definition: variant_builder_individual_field.h:109
void set_entire_field(const std::vector< BULK_CHANGE_TYPE > &bulk_changes)
Definition: variant_builder_individual_field.h:71
void encode_into(kstring_t *destination) const
Encode this field's data into the provided buffer. If field has no data or was removed, do nothing.
Definition: variant_builder_individual_field.h:143
void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
Definition: vcf.c:1371
VariantBuilderIndividualField & operator=(VariantBuilderIndividualField &&other)=default
#define BCF_BT_INT8
Definition: vcf.h:120
void set_entire_field(const std::vector< std::vector< BULK_CHANGE_TYPE >> &bulk_changes)
Definition: variant_builder_individual_field.h:84
void set_entire_field(std::vector< BULK_CHANGE_TYPE > &&bulk_changes)
Definition: variant_builder_individual_field.h:64
int32_t field_type() const
Definition: variant_builder_individual_field.h:101
#define bcf_int16_vector_end
Definition: vcf.h:751
bool removed() const
Definition: variant_builder_individual_field.h:104
uint32_t max_value_length() const
Returns the length of the longest value in the container.
Definition: short_value_optimized_storage.h:67
#define bcf_int32_vector_end
Definition: vcf.h:752
void set_entire_field(std::vector< std::vector< BULK_CHANGE_TYPE >> &&bulk_changes)
Definition: variant_builder_individual_field.h:78
uint32_t estimated_encoded_size() const
Provide an estimate (typically an overestimate) of the number of bytes this field will require when encoded.
Definition: variant_builder_individual_field.h:132
~VariantBuilderIndividualField()=default
VariantBuilderIndividualField(const uint32_t num_samples, const uint32_t field_index, const int32_t field_type, const ENCODED_TYPE missing_value, const ENCODED_TYPE end_of_vector_value, const uint32_t short_value_upper_bound)
Definition: variant_builder_individual_field.h:38
#define str(x)
Definition: sam.c:66
Definition: exceptions.h:9
void set(const uint32_t index, const std::vector< ELEMENT_TYPE > &values)
Set the value at the specified index by vector.
Definition: short_value_optimized_storage.h:95
uint8_t int_encoded_type(const int32_t min_val, const int32_t max_val)
Given a min and max value, determines whether int8, int16, or int32 BCF encoding is required...
Definition: hts_memory.cpp:161
#define BCF_BT_INT16
Definition: vcf.h:121
#define bcf_int16_missing
Definition: vcf.h:755
uint32_t num_values() const
Returns the number of values in the container.
Definition: short_value_optimized_storage.h:57
void clear()
Reset this field to a pristine state with no data.
Definition: variant_builder_individual_field.h:114
void set_sample_field_value(const uint32_t sample_index, const ENCODED_TYPE *values, const uint32_t num_values)
Stores a value for just a single sample efficiently using the ShortValueOptimizedStorage layer...
Definition: variant_builder_individual_field.h:94
bool missing() const
Definition: variant_builder_individual_field.h:105
#define BCF_BT_CHAR
Definition: vcf.h:124
#define bcf_int8_missing
Definition: vcf.h:754