Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
vcf.h
Go to the documentation of this file.
1 /* vcf.h -- VCF/BCF API functions.
2 
3  Copyright (C) 2012, 2013 Broad Institute.
4  Copyright (C) 2012-2014 Genome Research Ltd.
5 
6  Author: Heng Li <lh3@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE. */
25 
26 /*
27  todo:
28  - make the function names consistent
29  - provide calls to abstract away structs as much as possible
30  */
31 
32 #ifndef HTSLIB_VCF_H
33 #define HTSLIB_VCF_H
34 
35 #include <stdint.h>
36 #include <limits.h>
37 #include <assert.h>
38 #include "hts.h"
39 #include "kstring.h"
40 
41 
42 /*****************
43  * Header struct *
44  *****************/
45 
46 #define BCF_HL_FLT 0 // header line
47 #define BCF_HL_INFO 1
48 #define BCF_HL_FMT 2
49 #define BCF_HL_CTG 3
50 #define BCF_HL_STR 4 // structured header line TAG=<A=..,B=..>
51 #define BCF_HL_GEN 5 // generic header line
52 
53 #define BCF_HT_FLAG 0 // header type
54 #define BCF_HT_INT 1
55 #define BCF_HT_REAL 2
56 #define BCF_HT_STR 3
57 
58 #define BCF_VL_FIXED 0 // variable length
59 #define BCF_VL_VAR 1
60 #define BCF_VL_A 2
61 #define BCF_VL_G 3
62 #define BCF_VL_R 4
63 
64 /* === Dictionary ===
65 
66  The header keeps three dictonaries. The first keeps IDs in the
67  "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths
68  in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[]
69  is the actual hash table, which is opaque to the end users. In the hash
70  table, the key is the ID or sample name as a C string and the value is a
71  bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash
72  table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the
73  size of the hash table or, equivalently, the length of the id[] arrays.
74 */
75 
76 #define BCF_DT_ID 0 // dictionary type
77 #define BCF_DT_CTG 1
78 #define BCF_DT_SAMPLE 2
79 
80 // Complete textual representation of a header line
81 typedef struct {
82  int type; // One of the BCF_HL_* type
83  char *key; // The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc.
84  char *value; // Set only for generic lines, NULL for FILTER/INFO, etc.
85  int nkeys; // Number of structured fields
86  char **keys, **vals; // The key=value pairs
87 } bcf_hrec_t;
88 
89 typedef struct {
90  uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 for BCF_HL_FLT,INFO,FMT
91  bcf_hrec_t *hrec[3];
92  int id;
93 } bcf_idinfo_t;
94 
95 typedef struct {
96  const char *key;
97  const bcf_idinfo_t *val;
98 } bcf_idpair_t;
99 
100 typedef struct {
101  int32_t n[3];
102  bcf_idpair_t *id[3];
103  void *dict[3]; // ID dictionary, contig dict and sample dict
104  char **samples;
106  int nhrec, dirty;
107  int ntransl, *transl[2]; // for bcf_translate()
108  int nsamples_ori; // for bcf_hdr_set_samples()
109  uint8_t *keep_samples;
111 } bcf_hdr_t;
112 
113 extern uint8_t bcf_type_shift[];
114 
115 /**************
116  * VCF record *
117  **************/
118 
119 #define BCF_BT_NULL 0
120 #define BCF_BT_INT8 1
121 #define BCF_BT_INT16 2
122 #define BCF_BT_INT32 3
123 #define BCF_BT_FLOAT 5
124 #define BCF_BT_CHAR 7
125 
126 #define VCF_REF 0
127 #define VCF_SNP 1
128 #define VCF_MNP 2
129 #define VCF_INDEL 4
130 #define VCF_OTHER 8
131 
132 typedef struct {
133  int type, n; // variant type and the number of bases affected, negative for deletions
134 } variant_t;
135 
136 typedef struct {
137  int id; // id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key
138  int n, size, type; // n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types
139  uint8_t *p; // same as vptr and vptr_* in bcf_info_t below
140  uint32_t p_len;
141  uint32_t p_off:31, p_free:1;
142 } bcf_fmt_t;
143 
144 typedef struct {
145  int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key
146  int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars
147  union {
148  int32_t i; // integer value
149  float f; // float value
150  } v1; // only set if $len==1; for easier access
151  uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes
152  uint32_t vptr_len; // length of the vptr block or, when set, of the vptr_mod block, excluding offset
153  uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes
154  vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new
155  // data block is bigger than the original
156 } bcf_info_t;
157 
158 
159 #define BCF1_DIRTY_ID 1
160 #define BCF1_DIRTY_ALS 2
161 #define BCF1_DIRTY_FLT 4
162 #define BCF1_DIRTY_INF 8
163 
164 typedef struct {
165  int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change
166  int n_flt; // Number of FILTER fields
167  int *flt; // FILTER keys in the dictionary
168  char *id, *als; // ID and REF+ALT block (\0-seperated)
169  char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated
170  bcf_info_t *info; // INFO
171  bcf_fmt_t *fmt; // FORMAT and individual sample
172  variant_t *var; // $var and $var_type set only when set_variant_types called
173  int n_var, var_type;
174  int shared_dirty; // if set, shared.s must be recreated on BCF output
175  int indiv_dirty; // if set, indiv.s must be recreated on BCF output
176 } bcf_dec_t;
177 
178 
179 #define BCF_ERR_CTG_UNDEF 1
180 #define BCF_ERR_TAG_UNDEF 2
181 #define BCF_ERR_NCOLS 4
182 
183 /*
184  The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file
185  is slower because the string is first to be parsed, packed into BCF line
186  (done in vcf_parse), then unpacked into internal bcf1_t structure. If it
187  is known in advance that some of the fields will not be required (notably
188  the sample columns), parsing of these can be skipped by setting max_unpack
189  appropriately.
190  Similarly, it is fast to output a BCF line because the columns (kept in
191  shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF
192  line must be formatted in vcf_format.
193  */
194 typedef struct {
195  int32_t rid; // CHROM
196  int32_t pos; // POS
197  int32_t rlen; // length of REF
198  float qual; // QUAL
199  uint32_t n_info:16, n_allele:16;
200  uint32_t n_fmt:8, n_sample:24;
202  bcf_dec_t d; // lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack()
203  int max_unpack; // Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed
204  int unpacked; // remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work
205  int unpack_size[3]; // the original block size of ID, REF+ALT and FILTER
206  int errcode; // one of BCF_ERR_* codes
207 } bcf1_t;
208 
209 /*******
210  * API *
211  *******/
212 
213 #ifdef __cplusplus
214 extern "C" {
215 #endif
216 
217  /***********************************************************************
218  * BCF and VCF I/O
219  *
220  * A note about naming conventions: htslib internally represents VCF
221  * records as bcf1_t data structures, therefore most functions are
222  * prefixed with bcf_. There are a few exceptions where the functions must
223  * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In
224  * these cases, functions prefixed with bcf_ are more general and work
225  * with both BCF and VCF.
226  *
227  ***********************************************************************/
228 
230  #define bcf_init1() bcf_init()
231  #define bcf_read1(fp,h,v) bcf_read((fp),(h),(v))
232  #define vcf_read1(fp,h,v) vcf_read((fp),(h),(v))
233  #define bcf_write1(fp,h,v) bcf_write((fp),(h),(v))
234  #define vcf_write1(fp,h,v) vcf_write((fp),(h),(v))
235  #define bcf_destroy1(v) bcf_destroy(v)
236  #define vcf_parse1(s,h,v) vcf_parse((s),(h),(v))
237  #define bcf_clear1(v) bcf_clear(v)
238  #define vcf_format1(h,v,s) vcf_format((h),(v),(s))
239 
247  bcf_hdr_t *bcf_hdr_init(const char *mode);
248 
250  void bcf_hdr_destroy(bcf_hdr_t *h);
251 
253  bcf1_t *bcf_init(void);
254 
256  void bcf_destroy(bcf1_t *v);
257 
262  void bcf_empty(bcf1_t *v);
263 
269  void bcf_clear(bcf1_t *v);
270 
271 
273  typedef htsFile vcfFile;
274  #define bcf_open(fn, mode) hts_open((fn), (mode))
275  #define vcf_open(fn, mode) hts_open((fn), (mode))
276  #define bcf_close(fp) hts_close(fp)
277  #define vcf_close(fp) hts_close(fp)
278 
281 
304  int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file);
305  int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec);
306 
307 
309  int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h);
310 
312  int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v);
313 
315  int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s);
316 
325  int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
326 
334  #define BCF_UN_STR 1 // up to ALT inclusive
335  #define BCF_UN_FLT 2 // up to FILTER
336  #define BCF_UN_INFO 4 // up to INFO
337  #define BCF_UN_SHR (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) // all shared information
338  #define BCF_UN_FMT 8 // unpack format and each sample
339  #define BCF_UN_IND BCF_UN_FMT // a synonymo of BCF_UN_FMT
340  #define BCF_UN_ALL (BCF_UN_SHR|BCF_UN_FMT) // everything
341  int bcf_unpack(bcf1_t *b, int which);
342 
343  /*
344  * bcf_dup() - create a copy of BCF record.
345  *
346  * Note that bcf_unpack() must be called on the returned copy as if it was
347  * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src)
348  * internally to reflect any changes made by bcf_update_* functions.
349  */
350  bcf1_t *bcf_dup(bcf1_t *src);
351  bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src);
352 
356  int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
357 
364  int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h);
365  int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
366  int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v);
367 
369  int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end);
370 
371 
372 
373  /**************************************************************************
374  * Header querying and manipulation routines
375  **************************************************************************/
376 
378  bcf_hdr_t *bcf_hdr_dup(const bcf_hdr_t *hdr);
385  int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src);
386 
391  int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample);
392 
394  int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname);
395 
400  char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len);
401 
403  int bcf_hdr_append(bcf_hdr_t *h, const char *line);
404  int bcf_hdr_printf(bcf_hdr_t *h, const char *format, ...);
405 
406  const char *bcf_hdr_get_version(const bcf_hdr_t *hdr);
407  void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version);
408 
414  void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key);
415 
427  bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap);
428 
430  const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs);
431 
433  #define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE]
434 
435 
437  int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt);
438  int bcf_hdr_sync(bcf_hdr_t *h);
439  bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len);
440  void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str);
441  int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec);
450  bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class);
452  void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len);
453  void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted);
454  int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key);
455  void hrec_add_idx(bcf_hrec_t *hrec, int idx);
456  void bcf_hrec_destroy(bcf_hrec_t *hrec);
457 
458 
459 
460  /**************************************************************************
461  * Individual record querying and manipulation routines
462  **************************************************************************/
463 
465  int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap);
466 
474  int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line);
475 
479  int bcf_get_variant_types(bcf1_t *rec);
480  int bcf_get_variant_type(bcf1_t *rec, int ith_allele);
481  int bcf_is_snp(bcf1_t *v);
482 
488  int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n);
495  int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id);
501  int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass);
505  int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter);
517  int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals);
518  int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string);
519  int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id);
520 
521  /*
522  * bcf_update_info_*() - functions for updating INFO fields
523  * @hdr: the BCF header
524  * @line: VCF line to be edited
525  * @key: the INFO tag to be updated
526  * @values: pointer to the array of values. Pass NULL to remove the tag.
527  * @n: number of values in the array. When set to 0, the INFO tag is removed
528  *
529  * The @string in bcf_update_info_flag() is optional, @n indicates whether
530  * the flag is set or removed.
531  *
532  * Returns 0 on success or negative value on error.
533  */
534  #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT)
535  #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL)
536  #define bcf_update_info_flag(hdr,line,key,string,n) bcf_update_info((hdr),(line),(key),(string),(n),BCF_HT_FLAG)
537  #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR)
538  int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
539 
540  /*
541  * bcf_update_format_*() - functions for updating FORMAT fields
542  * @values: pointer to the array of values, the same number of elements
543  * is expected for each sample. Missing values must be padded
544  * with bcf_*_missing or bcf_*_vector_end values.
545  * @n: number of values in the array. If n==0, existing tag is removed.
546  *
547  * The function bcf_update_format_string() is a higher-level (slower) variant of
548  * bcf_update_format_char(). The former accepts array of \0-terminated strings
549  * whereas the latter requires that the strings are collapsed into a single array
550  * of fixed-length strings. In case of strings with variable length, shorter strings
551  * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char()
552  * are not \0-terminated.
553  *
554  * Returns 0 on success or negative value on error.
555  */
556  #define bcf_update_format_int32(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_INT)
557  #define bcf_update_format_float(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_REAL)
558  #define bcf_update_format_char(hdr,line,key,values,n) bcf_update_format((hdr),(line),(key),(values),(n),BCF_HT_STR)
559  #define bcf_update_genotypes(hdr,line,gts,n) bcf_update_format((hdr),(line),"GT",(gts),(n),BCF_HT_INT) // See bcf_gt_ macros below
560  int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n);
561  int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type);
562 
563  // Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds
564  // to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained
565  // from bcf_get_genotypes() below.
566  #define bcf_gt_phased(idx) ((idx+1)<<1|1)
567  #define bcf_gt_unphased(idx) ((idx+1)<<1)
568  #define bcf_gt_missing 0
569  #define bcf_gt_is_missing(val) ((val)>>1 ? 0 : 1)
570  #define bcf_gt_is_phased(idx) ((idx)&1)
571  #define bcf_gt_allele(val) (((val)>>1)-1)
572 
574  #define bcf_alleles2gt(a,b) ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a)))
575  static inline void bcf_gt2alleles(int igt, int *a, int *b)
576  {
577  int k = 0, dk = 1;
578  while ( k<igt ) { dk++; k += dk; }
579  *b = dk - 1; *a = igt - k + *b;
580  }
581 
591  bcf_fmt_t *bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
592  bcf_info_t *bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key);
593 
602  bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id);
603  bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id);
604 
623  #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
624  #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
625  #define bcf_get_info_string(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
626  #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG)
627  int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
628 
650  #define bcf_get_format_int32(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT)
651  #define bcf_get_format_float(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL)
652  #define bcf_get_format_char(hdr,line,tag,dst,ndst) bcf_get_format_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_STR)
653  #define bcf_get_genotypes(hdr,line,dst,ndst) bcf_get_format_values(hdr,line,"GT",(void**)(dst),ndst,BCF_HT_INT)
654  int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst);
655  int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type);
656 
657 
658 
659  /**************************************************************************
660  * Helper functions
661  **************************************************************************/
662 
672  int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id);
673  #define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key)
674 
679  static inline int bcf_hdr_name2id(const bcf_hdr_t *hdr, const char *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); }
680  static inline const char *bcf_hdr_id2name(const bcf_hdr_t *hdr, int rid) { return hdr->id[BCF_DT_CTG][rid].key; }
681  static inline const char *bcf_seqname(const bcf_hdr_t *hdr, bcf1_t *rec) { return hdr->id[BCF_DT_CTG][rec->rid].key; }
682 
697  #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf)
698  #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12)
699  #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf)
700  #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf)
701  #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1)
702  #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)])
703 
704  void bcf_fmt_array(kstring_t *s, int n, int type, void *data);
705  uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr);
706 
707  void bcf_enc_vchar(kstring_t *s, int l, const char *a);
708  void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize);
709  void bcf_enc_vfloat(kstring_t *s, int n, float *a);
710 
711 
712  /**************************************************************************
713  * BCF index
714  *
715  * Note that these functions work with BCFs only. See synced_bcf_reader.h
716  * which provides (amongst other things) an API to work transparently with
717  * both indexed BCFs and VCFs.
718  **************************************************************************/
719 
720  #define bcf_itr_destroy(iter) hts_itr_destroy(iter)
721  #define bcf_itr_queryi(idx, tid, beg, end) hts_itr_query((idx), (tid), (beg), (end), bcf_readrec)
722  #define bcf_itr_querys(idx, hdr, s) hts_itr_querys((idx), (s), (hts_name2id_f)(bcf_hdr_name2id), (hdr), hts_itr_query, bcf_readrec)
723  #define bcf_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0)
724  #define bcf_index_load(fn) hts_idx_load(fn, HTS_FMT_CSI)
725  #define bcf_index_seqnames(idx, hdr, nptr) hts_idx_seqnames((idx),(nptr),(hts_id2name_f)(bcf_hdr_id2name),(hdr))
726 
727  int bcf_index_build(const char *fn, int min_shift);
728 
729 #ifdef __cplusplus
730 }
731 #endif
732 
733 /*******************
734  * Typed value I/O *
735  *******************/
736 
737 /*
738  Note that in contrast with BCFv2.1 specification, HTSlib implementation
739  allows missing values in vectors. For integer types, the values 0x80,
740  0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001,
741  0x80000001 as end-of-vector indicators. Similarly for floats, the value of
742  0x7F800001 is interpreted as a missing value and 0x7F800002 as an
743  end-of-vector indicator.
744  Note that the end-of-vector byte is not part of the vector.
745 
746  This trial BCF version (v2.2) is compatible with the VCF specification and
747  enables to handle correctly vectors with different ploidy in presence of
748  missing values.
749  */
750 #define bcf_int8_vector_end (INT8_MIN+1)
751 #define bcf_int16_vector_end (INT16_MIN+1)
752 #define bcf_int32_vector_end (INT32_MIN+1)
753 #define bcf_str_vector_end 0
754 #define bcf_int8_missing INT8_MIN
755 #define bcf_int16_missing INT16_MIN
756 #define bcf_int32_missing INT32_MIN
757 #define bcf_str_missing 0x07
758 extern uint32_t bcf_float_vector_end;
759 extern uint32_t bcf_float_missing;
760 static inline void bcf_float_set(float *ptr, uint32_t value)
761 {
762  union { uint32_t i; float f; } u;
763  u.i = value;
764  *ptr = u.f;
765 }
766 #define bcf_float_set_vector_end(x) bcf_float_set(&(x),bcf_float_vector_end)
767 #define bcf_float_set_missing(x) bcf_float_set(&(x),bcf_float_missing)
768 static inline int bcf_float_is_missing(float f)
769 {
770  union { uint32_t i; float f; } u;
771  u.f = f;
772  return u.i==bcf_float_missing ? 1 : 0;
773 }
774 static inline int bcf_float_is_vector_end(float f)
775 {
776  union { uint32_t i; float f; } u;
777  u.f = f;
778  return u.i==bcf_float_vector_end ? 1 : 0;
779 }
780 
781 static inline void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
782 {
783  #define BRANCH(type_t, missing, vector_end) { \
784  type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \
785  int i; \
786  for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \
787  { \
788  if ( i ) kputc("/|"[ptr[i]&1], str); \
789  if ( !(ptr[i]>>1) ) kputc('.', str); \
790  else kputw((ptr[i]>>1) - 1, str); \
791  } \
792  if (i == 0) kputc('.', str); \
793  }
794  switch (fmt->type) {
798  default: fprintf(stderr,"FIXME: type %d in bcf_format_gt?\n", fmt->type); abort(); break;
799  }
800  #undef BRANCH
801 }
802 
803 static inline void bcf_enc_size(kstring_t *s, int size, int type)
804 {
805  if (size >= 15) {
806  kputc(15<<4|type, s);
807  if (size >= 128) {
808  if (size >= 32768) {
809  int32_t x = size;
810  kputc(1<<4|BCF_BT_INT32, s);
811  kputsn((char*)&x, 4, s);
812  } else {
813  int16_t x = size;
814  kputc(1<<4|BCF_BT_INT16, s);
815  kputsn((char*)&x, 2, s);
816  }
817  } else {
818  kputc(1<<4|BCF_BT_INT8, s);
819  kputc(size, s);
820  }
821  } else kputc(size<<4|type, s);
822 }
823 
824 static inline int bcf_enc_inttype(long x)
825 {
826  if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8;
827  if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16;
828  return BCF_BT_INT32;
829 }
830 
831 static inline void bcf_enc_int1(kstring_t *s, int32_t x)
832 {
833  if (x == bcf_int32_vector_end) {
834  bcf_enc_size(s, 1, BCF_BT_INT8);
835  kputc(bcf_int8_vector_end, s);
836  } else if (x == bcf_int32_missing) {
837  bcf_enc_size(s, 1, BCF_BT_INT8);
838  kputc(bcf_int8_missing, s);
839  } else if (x <= INT8_MAX && x > bcf_int8_missing) {
840  bcf_enc_size(s, 1, BCF_BT_INT8);
841  kputc(x, s);
842  } else if (x <= INT16_MAX && x > bcf_int16_missing) {
843  int16_t z = x;
844  bcf_enc_size(s, 1, BCF_BT_INT16);
845  kputsn((char*)&z, 2, s);
846  } else {
847  int32_t z = x;
848  bcf_enc_size(s, 1, BCF_BT_INT32);
849  kputsn((char*)&z, 4, s);
850  }
851 }
852 
853 static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q)
854 {
855  if (type == BCF_BT_INT8) {
856  *q = (uint8_t*)p + 1;
857  return *(int8_t*)p;
858  } else if (type == BCF_BT_INT16) {
859  *q = (uint8_t*)p + 2;
860  return *(int16_t*)p;
861  } else {
862  *q = (uint8_t*)p + 4;
863  return *(int32_t*)p;
864  }
865 }
866 
867 static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q)
868 {
869  return bcf_dec_int1(p + 1, *p&0xf, q);
870 }
871 
872 static inline int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type)
873 {
874  *type = *p & 0xf;
875  if (*p>>4 != 15) {
876  *q = (uint8_t*)p + 1;
877  return *p>>4;
878  } else return bcf_dec_typed_int1(p + 1, q);
879 }
880 
881 #endif
bcf_hdr_t * bcf_hdr_dup(const bcf_hdr_t *hdr)
Definition: vcf.c:2387
bcf_hdr_t * vcf_hdr_read(htsFile *fp)
Definition: vcf.c:1195
void bcf_enc_vfloat(kstring_t *s, int n, float *a)
Definition: vcf.c:1410
int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h)
Definition: vcf.c:1353
int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec)
Definition: vcf.c:464
bcf_fmt_t * bcf_get_fmt(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
Definition: vcf.c:2979
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
Definition: vcf.c:1270
void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const char *str, int len, int is_quoted)
Definition: vcf.c:222
void bcf_hdr_destroy(bcf_hdr_t *h)
Definition: vcf.c:714
Definition: hts.h:109
bcf_hdr_t * bcf_hdr_read(htsFile *fp)
Definition: vcf.c:736
int bcf_hdr_sync(bcf_hdr_t *h)
Definition: vcf.c:110
int32_t i
Definition: vcf.h:148
int is_file(char *fn)
Definition: files.c:57
int32_t rid
Definition: vcf.h:195
variant_t * var
Definition: vcf.h:172
uint8_t * p
Definition: vcf.h:139
bcf_hrec_t ** hrec
Definition: vcf.h:105
float qual
Definition: vcf.h:198
int max_unpack
Definition: vcf.h:203
int * flt
Definition: vcf.h:167
#define BCF_BT_INT32
Definition: vcf.h:122
uint8_t * bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
Definition: vcf.c:1461
int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt)
Definition: vcf.c:562
Definition: vcf.h:132
bcf_fmt_t * fmt
Definition: vcf.h:171
int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end)
Definition: vcf.c:927
#define bcf_int8_vector_end
Definition: vcf.h:750
#define bcf_int32_missing
Definition: vcf.h:756
int bcf_get_variant_type(bcf1_t *rec, int ith_allele)
Definition: vcf.c:2626
bcf_hrec_t * bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class)
Definition: vcf.c:508
int bcf_hdr_id2int(const bcf_hdr_t *hdr, int type, const char *id)
Definition: vcf.c:2145
int bcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
Definition: vcf.c:1149
Definition: vcf.h:81
int bcf_update_filter(const bcf_hdr_t *hdr, bcf1_t *line, int *flt_ids, int n)
Definition: vcf.c:2844
int id
Definition: vcf.h:137
int bcf_update_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const char **values, int n)
Definition: vcf.c:2715
int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
Definition: vcf.c:3016
int32_t pos
Definition: vcf.h:196
int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
Definition: vcf.c:1719
#define BRANCH(type_t, missing, vector_end)
int bcf_remove_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id, int pass)
Definition: vcf.c:2875
int bcf_unpack(bcf1_t *b, int which)
Definition: vcf.c:1945
enum @17 mode
void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
Definition: vcf.c:1371
#define BCF_BT_INT8
Definition: vcf.h:120
int vcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
Definition: vcf.c:1903
int bcf_get_format_string(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, char ***dst, int *ndst)
Definition: vcf.c:3093
uint8_t * keep_samples
Definition: vcf.h:109
Definition: vcf.h:164
char * bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len)
Definition: vcf.c:1313
void bcf_hrec_destroy(bcf_hrec_t *hrec)
Definition: vcf.c:146
void bcf_enc_vchar(kstring_t *s, int l, const char *a)
Definition: vcf.c:1416
uint32_t bcf_float_vector_end
float f
Definition: vcf.h:149
#define bcf_int16_vector_end
Definition: vcf.h:751
void bcf_empty(bcf1_t *v)
const bcf_idinfo_t * val
Definition: vcf.h:97
int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
Definition: vcf.c:2741
int id
Definition: vcf.h:92
bcf_hdr_t * bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const *samples, int *imap)
Definition: vcf.c:2396
const char * bcf_hdr_get_version(const bcf_hdr_t *hdr)
Definition: vcf.c:668
Definition: vcf.h:136
void bcf_clear(bcf1_t *v)
Definition: vcf.c:799
int key
Definition: vcf.h:145
uint32_t vptr_len
Definition: vcf.h:152
void bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str)
Definition: vcf.c:1309
int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec)
Definition: vcf.c:878
int nsamples_ori
Definition: vcf.h:108
int nhrec
Definition: vcf.h:106
kstring_t mem
Definition: vcf.h:110
bcf_info_t * bcf_get_info_id(bcf1_t *line, const int id)
Definition: vcf.c:3004
int bcf_get_variant_types(bcf1_t *rec)
Definition: vcf.c:2621
int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src)
Definition: vcf.c:2213
Definition: vcf.h:144
Definition: bgzf.h:69
int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type)
Definition: vcf.c:2632
bcf_info_t * bcf_get_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key)
Definition: vcf.c:2986
#define bcf_int32_vector_end
Definition: vcf.h:752
int bcf_hdr_printf(bcf_hdr_t *h, const char *format,...)
Definition: vcf.c:645
bcf_hrec_t * bcf_hrec_dup(bcf_hrec_t *hrec)
Definition: vcf.c:162
int m_info
Definition: vcf.h:165
Definition: vcf.h:194
uint32_t p_len
Definition: vcf.h:140
void hrec_add_idx(bcf_hrec_t *hrec, int idx)
Definition: vcf.c:242
int bcf_is_snp(bcf1_t *v)
Definition: vcf.c:2527
int nkeys
Definition: vcf.h:85
int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
Definition: vcf.c:2129
uint8_t * vptr
Definition: vcf.h:151
int bcf_hdr_append(bcf_hdr_t *h, const char *line)
Definition: vcf.c:589
bcf_hdr_t * bcf_hdr_init(const char *mode)
Definition: vcf.c:698
bcf_info_t * info
Definition: vcf.h:170
bcf1_t * bcf_dup(bcf1_t *src)
Definition: vcf.c:1143
int shared_dirty
Definition: vcf.h:174
#define str(x)
Definition: sam.c:66
char * value
Definition: vcf.h:84
int errcode
Definition: vcf.h:206
int bcf_has_filter(const bcf_hdr_t *hdr, bcf1_t *line, char *filter)
Definition: vcf.c:2889
int bcf_update_alleles_str(const bcf_hdr_t *hdr, bcf1_t *line, const char *alleles_string)
Definition: vcf.c:2949
int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line)
Definition: vcf.c:2271
const char * key
Definition: vcf.h:96
bcf_hrec_t * bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len)
Definition: vcf.c:268
int bcf_index_build(const char *fn, int min_shift)
Definition: vcf.c:2195
Definition: vcf.h:89
bcf1_t * bcf_init(void)
int type
Definition: vcf.h:146
int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals)
Definition: vcf.c:2922
char * key
Definition: vcf.h:83
int type
Definition: vcf.h:133
char ** allele
Definition: vcf.h:169
int n_flt
Definition: vcf.h:166
int type
Definition: vcf.h:138
int bcf_hdr_add_sample(bcf_hdr_t *hdr, const char *sample)
Definition: vcf.c:58
#define BCF_DT_CTG
Definition: vcf.h:77
char * id
Definition: vcf.h:168
int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h)
Definition: vcf.c:769
#define BCF_BT_INT16
Definition: vcf.h:121
int32_t rlen
Definition: vcf.h:197
#define bcf_int16_missing
Definition: vcf.h:755
void bcf_fmt_array(kstring_t *s, int n, int type, void *data)
Definition: vcf.c:1422
int var_type
Definition: vcf.h:173
uint8_t bcf_type_shift[]
int bcf_add_filter(const bcf_hdr_t *hdr, bcf1_t *line, int flt_id)
Definition: vcf.c:2857
int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key)
Definition: vcf.c:253
bcf1_t * bcf_copy(bcf1_t *dst, bcf1_t *src)
Definition: vcf.c:1121
const char ** bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs)
Definition: vcf.c:1333
kstring_t shared
Definition: vcf.h:201
int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type)
Definition: vcf.c:3131
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id)
Definition: vcf.c:2966
int indiv_dirty
Definition: vcf.h:175
bcf_dec_t d
Definition: vcf.h:202
int unpacked
Definition: vcf.h:204
char ** vals
Definition: vcf.h:86
Definition: vcf.h:100
#define bcf_int8_missing
Definition: vcf.h:754
int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s)
Definition: vcf.c:2014
bcf_idpair_t * id[3]
Definition: vcf.h:102
htsFile vcfFile
Definition: vcf.h:273
int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v)
Definition: vcf.c:919
void bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version)
Definition: vcf.c:679
int bcf_subset(const bcf_hdr_t *h, bcf1_t *v, int n, int *imap)
Definition: vcf.c:2499
uint32_t bcf_float_missing
bcf_fmt_t * bcf_get_fmt_id(bcf1_t *line, const int id)
Definition: vcf.c:2993
void bcf_hdr_remove(bcf_hdr_t *h, int type, const char *key)
Definition: vcf.c:598
Definition: bgzf.h:49
void bcf_hrec_add_key(bcf_hrec_t *hrec, const char *str, int len)
Definition: vcf.c:210
void bcf_destroy(bcf1_t *v)
int type
Definition: vcf.h:82
int bcf_hdr_set_samples(bcf_hdr_t *hdr, const char *samples, int is_file)
Definition: vcf.c:2433
char ** samples
Definition: vcf.h:104
Definition: vcf.h:95