Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
variant_header_merger.h
Go to the documentation of this file.
1 #ifndef __gamgee_variant_header_merger__
2 #define __gamgee_variant_header_merger__
3 
4 #include <boost/shared_ptr.hpp>
5 #include <string>
6 #include <unordered_map>
7 
8 #include "../utils/merged_vcf_lut.h"
9 
10 namespace gamgee
11 {
12  /*
13  * VariantHeaderMerger class
14  * @brief Class for merging VCF headers and maintaining field and sample mappings
15  * Class which:
16  * (a) Merges multiple input VCF headers to obtain a merged VCF header
17  * (b) Creates and updates mappings between input VCFs and merged VCF header fields and samples.
18  *
19  * Provides functions to add new VCF headers for merging
20  * Example usage:
21  * shared_ptr<bcf_hdr_t> vcf_hdr;
22  * vector<shared_ptr<bcf_hdr_t>> hdr_vec1;
23  * vector<shared_ptr<bcf_hdr_t>> hdr_vec2;
24  *
25  * Start with empty object, add headers later
26  * VariantHeaderMerger X;
27  * X.add_header(vcf_hdr);
28  * X.add_headers(hdr_vec1);
29  * const shared_ptr<bcf_hdr_t>& merged_hdr = X.get_raw_merged_header();
30  *
31  * Start with 1, add more later
32  * VariantHeaderMerger Y { vcf_hdr };
33  * Y.add_headers(hdr_vec1);
34  *
35  * Start with many, add more later
36  * VariantHeaderMerger Z { hdr_vec1 };
37  * Z.add_headers(hdr_vec2);
38  *
39  * The class contains two LUTs - m_header_fields_LUT and m_samples_LUT of type MergedVCFLUTBase<> for storing mapping for
40  * header fields (FMT, FLT, INFO) and samples respectively. The class is templated to select the 'best' memory layout.
41  *
42  * Each of the two MergedVCFLUTBase<> objects (m_header_fields_LUT, m_samples_LUT) contains two matrices (vector<vector<int>>):
43  * m_inputs_2_merged_lut and m_merged_2_inputs_lut. The first stores the mapping from input VCF fields to the merged VCF fields while
44  * the second stores the mapping in the opposite direction.
45  * You can lay out each matrix in one of the following 2 ways:
46  * (a) matrix[i][j] corresponds to input VCF i and field j
47  * (b) matrix[i][j] corresponds to field i and input VCF j
48  * Option (a) is optimal where you are looking at all the fields of a VCF in quick succession,
49  * while (b) is optimal when you are looking at all VCFs for a particular field.
50  * The 2 boolean template parameters for MergedVCFLUTBase<inputs_2_merged_LUT_is_input_ordered, merged_2_inputs_LUT_is_input_ordered>
51  * control the layout of the two matrices. If the parameter inputs_2_merged_LUT_is_input_ordered is true, then layout (a) is selected for
52  * m_inputs_2_merged_lut, else layout (b)
53  *
54  * VariantHeaderMerger has 4 boolean parameters - two for the m_header_fields_LUT (fields_*_LUT_ordering) and two for the
55  * m_samples_LUT (samples_*_LUT_ordering)
56  * fields_forward_LUT_ordering: controls the layout of the input VCF fields to merged VCF fields matrix in m_header_fields_LUT
57  * fields_reverse_LUT_ordering: controls the layout of the merged VCF fields to input VCF fields matrix in m_header_fields_LUT
58  * samples_forward_LUT_ordering: controls the layout of the input VCF samples to merged VCF samples matrix in m_samples_LUT
59  * samples_reverse_LUT_ordering: controls the layout of the merged VCF samples to input VCF samples matrix in m_samples_LUT
60  *
61  * We create type aliases InputOrderedVariantHeaderMerger and FieldOrderedVariantHeaderMerger for specialized template
62  * instantiations of VariantHeaderMerger.
63  * (a) InputOrderedVariantHeaderMerger - fast traversal of all field lookups for a single input VCF
64  * (b) FieldOrderedVariantHeaderMerger - fast traversal of all input VCFs for a single field lookup
65  *
66  * Example usage:
67  * vector<VariantHeader> hdr_vec;
68  * VariantHeaderMerger Z { hdr_vec };
69  * auto input_vcf_idx = 0u;
70  * auto input_PL_idx = hdr_vec[input_vcf_idx].field_index("PL", BCF_HL_FMT);
71  * auto merged_PL_idx = Z.get_merged_header_idx_for_input(input_vcf_idx, input_PL_idx);
72  * auto refind_input_PL_idx = Z.get_input_header_idx_for_merged(input_vcf_idx, merged_PL_idx);
73  * assert(refind_input_PL_idx == input_PL_idx);
74  */
75 
76  template<bool fields_forward_LUT_ordering, bool fields_reverse_LUT_ordering, bool samples_forward_LUT_ordering, bool samples_reverse_LUT_ordering>
77  class VariantHeaderMerger
78  {
79  private:
80  static const auto m_DEFAULT_INIT_NUM_INPUT_VCFS = 10u;
81  static const auto m_DEFAULT_INIT_NUM_FIELDS = 30u;
82  static const auto m_DEFAULT_INIT_NUM_SAMPLES = 10u;
83  public:
88  : m_header_fields_LUT{ m_DEFAULT_INIT_NUM_INPUT_VCFS, m_DEFAULT_INIT_NUM_FIELDS },
89  m_samples_LUT { m_DEFAULT_INIT_NUM_INPUT_VCFS, m_DEFAULT_INIT_NUM_SAMPLES },
90  m_merged_field_idx_enum_lut { 1u, m_DEFAULT_INIT_NUM_FIELDS }
91  {
92  reset();
93  m_num_input_vcfs_allocated = m_DEFAULT_INIT_NUM_INPUT_VCFS;
94  m_num_merged_fields_allocated = m_DEFAULT_INIT_NUM_FIELDS;
95  m_num_merged_samples_allocated = m_DEFAULT_INIT_NUM_SAMPLES;
96  m_num_enums_allocated = m_DEFAULT_INIT_NUM_FIELDS;
97  }
102  VariantHeaderMerger(const std::shared_ptr<bcf_hdr_t>& input_vcf_header)
104  {
105  add_header(input_vcf_header);
106  }
/*
 * @brief Constructor with a single input VCF header as input.
 * Does no work of its own: it delegates to another constructor of this class,
 * passing input_vcf_header.m_header (presumably the std::shared_ptr<bcf_hdr_t>
 * overload - confirm against the declaration of VariantHeader::m_header).
 * @param input_vcf_header header whose m_header member is forwarded
 */
111  VariantHeaderMerger(const VariantHeader& input_vcf_header)
112  : VariantHeaderMerger(input_vcf_header.m_header)
113  { }
/*
 * @brief Constructor with a vector of input VCF headers to be merged.
 * The field and sample LUTs are constructed with input_vcf_headers.size() as their
 * first argument and the default field/sample counts as their second; the enum LUT
 * is constructed with 1u and the default field count.
 * @param input_vcf_headers headers that are passed on to add_headers()
 */
118  VariantHeaderMerger(const std::vector<std::shared_ptr<bcf_hdr_t>>& input_vcf_headers)
119  : m_header_fields_LUT{ static_cast<unsigned>(input_vcf_headers.size()), m_DEFAULT_INIT_NUM_FIELDS },
120  m_samples_LUT { static_cast<unsigned>(input_vcf_headers.size()), m_DEFAULT_INIT_NUM_SAMPLES },
121  m_merged_field_idx_enum_lut { 1u, m_DEFAULT_INIT_NUM_FIELDS }
122  {
123  reset(); // clears the containers and zeroes every m_num_*_allocated counter
// The counters are therefore re-assigned below, to the same values that were
// handed to the LUT constructors in the initializer list.
124  m_num_input_vcfs_allocated = input_vcf_headers.size(); // size_t narrowed to unsigned implicitly here
125  m_num_merged_fields_allocated = m_DEFAULT_INIT_NUM_FIELDS;
126  m_num_merged_samples_allocated = m_DEFAULT_INIT_NUM_SAMPLES;
127  m_num_enums_allocated = m_DEFAULT_INIT_NUM_FIELDS;
128  add_headers(input_vcf_headers); // must come last: runs after the counters are restored
129  }
/*
 * @brief Constructor with a vector of input VCF headers to be merged.
 * Same sequence as the std::shared_ptr<bcf_hdr_t> vector overload, but takes
 * VariantHeader objects: the field and sample LUTs are constructed with
 * input_vcf_headers.size() as their first argument, the enum LUT with 1u.
 * @param input_vcf_headers headers that are passed on to add_headers()
 */
134  VariantHeaderMerger(const std::vector<VariantHeader>& input_vcf_headers)
135  : m_header_fields_LUT{ static_cast<unsigned>(input_vcf_headers.size()), m_DEFAULT_INIT_NUM_FIELDS },
136  m_samples_LUT { static_cast<unsigned>(input_vcf_headers.size()), m_DEFAULT_INIT_NUM_SAMPLES },
137  m_merged_field_idx_enum_lut { 1u, m_DEFAULT_INIT_NUM_FIELDS }
138  {
139  reset(); // clears the containers and zeroes every m_num_*_allocated counter
// The counters are therefore re-assigned below, to the same values that were
// handed to the LUT constructors in the initializer list.
140  m_num_input_vcfs_allocated = input_vcf_headers.size(); // size_t narrowed to unsigned implicitly here
141  m_num_merged_fields_allocated = m_DEFAULT_INIT_NUM_FIELDS;
142  m_num_merged_samples_allocated = m_DEFAULT_INIT_NUM_SAMPLES;
143  m_num_enums_allocated = m_DEFAULT_INIT_NUM_FIELDS;
144  add_headers(input_vcf_headers); // must come last: runs after the counters are restored
145  }
146  /*
147  * @brief No anticipated use of a deep copy for VariantHeaderMerger
148  */
149  VariantHeaderMerger(const VariantHeaderMerger&) = delete;
151  /*
152  * @brief default move constructor for VariantHeaderMerger
153  */
156 
157  ~VariantHeaderMerger() = default;
/*
 * @brief Resets all mappings, but does not de-allocate LUT memory.
 * Empties the stored input headers and the sample-name map, drops the merged
 * header pointer, zeroes the four allocation counters and calls reset_luts()
 * on each of the three LUTs.
 * NOTE(review): the m_num_*_allocated counters are zeroed here even though the
 * LUT memory is kept; the vector constructors re-assign them right after calling
 * reset(). Confirm that resize_luts_if_needed() copes when reset() is called
 * directly by a user.
 */
161  void reset()
162  {
163  m_input_vcf_headers.clear();
164  m_sample2idx_merged.clear();
165  m_merged_vcf_header_ptr = nullptr; // releases this object's reference to the merged header
166  m_num_merged_fields_allocated = 0u;
167  m_num_merged_samples_allocated = 0u;
168  m_num_input_vcfs_allocated = 0u;
169  m_num_enums_allocated = 0u;
170  m_header_fields_LUT.reset_luts();
171  m_samples_LUT.reset_luts();
172  m_merged_field_idx_enum_lut.reset_luts();
173  }
/*
 * @brief Resets all mappings and de-allocates LUT memory.
 * Calls reset() first, then clear() on each of the three LUTs.
 */
177  void clear()
178  {
179  reset(); // mappings, stored headers and allocation counters
180  m_header_fields_LUT.clear();
181  m_samples_LUT.clear();
182  m_merged_field_idx_enum_lut.clear();
183  }
188  void add_header(const std::shared_ptr<bcf_hdr_t>& hdr);
193  void add_header(const VariantHeader& hdr);
198  void add_headers(const std::vector<std::shared_ptr<bcf_hdr_t>>& headers);
203  void add_headers(const std::vector<VariantHeader>& headers);
/*
 * @brief Get merged VCF header shared_ptr.
 * @return const reference to the member m_merged_vcf_header_ptr; it refers to a
 * member of this object, and the pointer is nullptr after reset()
 */
208  const std::shared_ptr<bcf_hdr_t>& get_raw_merged_header() const { return m_merged_vcf_header_ptr; }
/*
 * @brief Get merged VCF header.
 * @return a VariantHeader constructed from m_merged_vcf_header_ptr on every call
 * and returned by value
 */
213  const VariantHeader get_merged_header() const { return VariantHeader{ m_merged_vcf_header_ptr }; }
214  /*LUT functions*/
/*
 * @brief Get sample idx for the merged VCF corresponding to sample idx inputSampleIdx
 * in the input VCF of index inputGVCFIdx.
 * Forwards to m_samples_LUT.get_merged_idx_for_input().
 * @param inputGVCFIdx index of the input VCF
 * @param inputSampleIdx sample idx within that input VCF
 * @return whatever the samples LUT returns for this pair
 */
221  inline int get_merged_sample_idx_for_input(unsigned inputGVCFIdx, int inputSampleIdx) const
222  { return m_samples_LUT.get_merged_idx_for_input(inputGVCFIdx, inputSampleIdx); }
/*
 * @brief Get header field (FLT/FMT/INFO) idx for the merged VCF corresponding to
 * field idx inputIdx in the input VCF of index inputGVCFIdx.
 * Forwards to m_header_fields_LUT.get_merged_idx_for_input().
 * @param inputGVCFIdx index of the input VCF
 * @param inputIdx header field idx within that input VCF
 * @return whatever the header fields LUT returns for this pair
 */
229  inline int get_merged_header_idx_for_input(unsigned inputGVCFIdx, int inputIdx) const
230  { return m_header_fields_LUT.get_merged_idx_for_input(inputGVCFIdx, inputIdx); }
/*
 * @brief Get sample idx for the input VCF inputGVCFIdx corresponding to sample
 * mergedSampleIdx in the merged VCF.
 * Forwards to m_samples_LUT.get_input_idx_for_merged().
 * @param inputGVCFIdx index of the input VCF
 * @param mergedSampleIdx sample idx within the merged VCF
 * @return whatever the samples LUT returns for this pair
 */
237  inline int get_input_sample_idx_for_merged(unsigned inputGVCFIdx, int mergedSampleIdx) const
238  { return m_samples_LUT.get_input_idx_for_merged(inputGVCFIdx, mergedSampleIdx); }
/*
 * @brief Get header field (FLT/FMT/INFO) idx for the input VCF inputGVCFIdx
 * corresponding to field mergedIdx in the merged VCF.
 * Forwards to m_header_fields_LUT.get_input_idx_for_merged().
 * @param inputGVCFIdx index of the input VCF
 * @param mergedIdx header field idx within the merged VCF
 * @return whatever the header fields LUT returns for this pair
 */
245  inline int get_input_header_idx_for_merged(unsigned inputGVCFIdx, int mergedIdx) const
246  { return m_header_fields_LUT.get_input_idx_for_merged(inputGVCFIdx, mergedIdx); }
247 
275  void store_merged_field_idx_for_enum(const std::string& field, unsigned field_enum_idx);
/*
 * @brief Get the merged VCF header field idx recorded for a user-defined enum value.
 * Queries the enum LUT in the enum-to-merged direction.
 * @param field_enum_idx user-defined enum value
 * @return whatever the enum LUT returns for this enum value
 */
276  inline int get_merged_field_idx_for_enum(unsigned field_enum_idx) const
277  {
278  return m_merged_field_idx_enum_lut.get_merged_idx_for_input(0u, field_enum_idx); // the enum LUT has a single row, hence 0u
279  }
/*
 * @brief Get the user-defined enum value recorded for a merged VCF header field idx.
 * Queries the enum LUT in the merged-to-enum direction.
 * @param merged_field_idx header field idx within the merged VCF
 * @return whatever the enum LUT returns for this merged field idx
 */
280  inline int get_enum_for_merged_field_idx(int merged_field_idx) const
281  {
282  return m_merged_field_idx_enum_lut.get_input_idx_for_merged(0u, merged_field_idx); // the enum LUT has a single row, hence 0u
283  }
284  private:
288  void resize_luts_if_needed();
289  //LUT for VCF header fields (FMT/FLT/INFO)
291  //LUT for samples
293  //Header fields mapping
294  void add_header_fields_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx);
295  //Samples mapping
296  void add_samples_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx);
297  //Global sample names to idx mapping
298  std::unordered_map<std::string,int> m_sample2idx_merged;
299  //Input VCF headers
300  std::vector<std::shared_ptr<bcf_hdr_t>> m_input_vcf_headers;
301  //Merged header
302  std::shared_ptr<bcf_hdr_t> m_merged_vcf_header_ptr;
303  //sizes of the LUTs - to determine when to reallocate
304  unsigned m_num_merged_fields_allocated;
305  unsigned m_num_merged_samples_allocated;
306  unsigned m_num_input_vcfs_allocated;
307  unsigned m_num_enums_allocated;
308  //LUT to store merged field idxs to user-defined enum mappings
309  //The two matrices in this LUT are single-row matrices - the matrix m_inputs_2_merged_lut will
310  //store the mapping from the user-defined enum to the merged VCF header field idx, while the
311  //matrix m_merged_2_inputs_lut will store the enum corresponding to the merged field
312  utils::MergedVCFLUTBase<true, true> m_merged_field_idx_enum_lut;
313  };
314  /*NOTE: Needs explicit instantiation in .cpp file to use this type alias*/
317 }
318 
319 #endif
void clear()
Resets all mappings and de-allocates LUT memory
Definition: variant_header_merger.h:177
const VariantHeader get_merged_header() const
Get merged VCF header.
Definition: variant_header_merger.h:213
Definition: merged_vcf_lut.h:20
int get_merged_sample_idx_for_input(unsigned inputGVCFIdx, int inputSampleIdx) const
Get sample idx for the merged VCF corresponding to sample idx inputSampleIdx in the input VCF of inde...
Definition: variant_header_merger.h:221
int get_input_idx_for_merged(unsigned inputGVCFIdx, int mergedIdx) const
Get field idx for input VCF inputGVCFIdx corresponding to field idx mergedIdx in the mergedVCF file...
Definition: merged_vcf_lut.h:100
const std::shared_ptr< bcf_hdr_t > & get_raw_merged_header() const
Get merged VCF header shared_ptr.
Definition: variant_header_merger.h:208
void store_merged_field_idx_for_enum(const std::string &field, unsigned field_enum_idx)
utility function for storing index of frequently used fields in the merged VCF Sometimes the user/dev...
Definition: variant_header_merger.cpp:123
VariantHeaderMerger(const std::shared_ptr< bcf_hdr_t > &input_vcf_header)
Constructor with a single input VCF header as input.
Definition: variant_header_merger.h:102
void clear()
deallocates memory
Definition: merged_vcf_lut.cpp:27
int get_merged_field_idx_for_enum(unsigned field_enum_idx) const
Definition: variant_header_merger.h:276
void reset()
Resets all mappings, but does not de-allocate LUT memory
Definition: variant_header_merger.h:161
void reset_luts()
Clears all mappings
Definition: merged_vcf_lut.h:69
int get_input_header_idx_for_merged(unsigned inputGVCFIdx, int mergedIdx) const
Get header field (FLT/FMT/INFO) idx for the input VCF inputGVCFIdx corresponding to field mergedIdx i...
Definition: variant_header_merger.h:245
VariantHeaderMerger(const std::vector< VariantHeader > &input_vcf_headers)
Constructor with a vector of input VCF headers to be merged.
Definition: variant_header_merger.h:134
VariantHeaderMerger(const std::vector< std::shared_ptr< bcf_hdr_t >> &input_vcf_headers)
Constructor with a vector of input VCF headers to be merged.
Definition: variant_header_merger.h:118
int get_merged_idx_for_input(unsigned inputGVCFIdx, int inputIdx) const
Get field idx for the merged VCF corresponding to field idx inputIdx in the input VCF of index inputG...
Definition: merged_vcf_lut.h:116
Definition: exceptions.h:9
int get_input_sample_idx_for_merged(unsigned inputGVCFIdx, int mergedSampleIdx) const
Get sample idx for the input VCF inputGVCFIdx corresponding to sample mergedSampleIdx in the merged V...
Definition: variant_header_merger.h:237
void add_headers(const std::vector< std::shared_ptr< bcf_hdr_t >> &headers)
add a vector of new VCF headers into the merged header and update LUTs
void add_header(const std::shared_ptr< bcf_hdr_t > &hdr)
add a new header into the merged header and update LUTs
int get_enum_for_merged_field_idx(int merged_field_idx) const
Definition: variant_header_merger.h:280
Utility class to hold a variant header.
Definition: variant_header.h:52
Definition: vcf.h:100
VariantHeaderMerger(const VariantHeader &input_vcf_header)
Constructor with a single input VCF header as input.
Definition: variant_header_merger.h:111
int get_merged_header_idx_for_input(unsigned inputGVCFIdx, int inputIdx) const
Get header field (FLT/FMT/INFO) idx for the merged VCF corresponding to field idx inputIdx in the inp...
Definition: variant_header_merger.h:229
VariantHeaderMerger()
empty constructor, initialize 'large' LUTs
Definition: variant_header_merger.h:87
VariantHeaderMerger & operator=(const VariantHeaderMerger &)=delete