Gamgee
You miserable little maggot. I'll stove your head in!
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
synced_bcf_reader.h
Go to the documentation of this file.
1 /* synced_bcf_reader.h -- stream through multiple VCF files.
2 
3  Copyright (C) 2012-2014 Genome Research Ltd.
4 
5  Author: Petr Danecek <pd3@sanger.ac.uk>
6 
7 Permission is hereby granted, free of charge, to any person obtaining a copy
8 of this software and associated documentation files (the "Software"), to deal
9 in the Software without restriction, including without limitation the rights
10 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 copies of the Software, and to permit persons to whom the Software is
12 furnished to do so, subject to the following conditions:
13 
14 The above copyright notice and this permission notice shall be included in
15 all copies or substantial portions of the Software.
16 
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 DEALINGS IN THE SOFTWARE. */
24 
25 /*
26  The synced_bcf_reader allows to keep multiple VCFs open and stream them
27  using the next_line iterator in a seamless matter without worrying about
28  chromosomes and synchronizing the sites. This is used by vcfcheck to
29  compare multiple VCFs simultaneously and is used also for merging,
30  creating intersections, etc.
31 
32  The synced_bcf_reader also provides API for reading indexed BCF/VCF,
33  hiding differences in BCF/VCF opening, indexing and reading.
34 
35 
36  Example of usage:
37 
38  bcf_srs_t *sr = bcf_sr_init();
39  for (i=0; i<nfiles; i++)
40  bcf_sr_add_reader(sr,files[i]);
41  while ( bcf_sr_next_line(sr) )
42  {
43  for (i=0; i<nfiles; i++)
44  {
45  bcf1_t *line = bcf_sr_get_line(sr,i);
46  ...
47  }
48  }
49  bcf_sr_destroy(sr);
50 */
51 
52 #ifndef HTSLIB_SYNCED_BCF_READER_H
53 #define HTSLIB_SYNCED_BCF_READER_H
54 
55 #include "hts.h"
56 #include "vcf.h"
57 #include "tbx.h"
58 
59 // How should be treated sites with the same position but different alleles
60 #define COLLAPSE_NONE 0 // require the exact same set of alleles in all files
61 #define COLLAPSE_SNPS 1 // allow different alleles, as long as they all are SNPs
62 #define COLLAPSE_INDELS 2 // the same as above, but with indels
63 #define COLLAPSE_ANY 4 // any combination of alleles can be returned by bcf_sr_next_line()
64 #define COLLAPSE_SOME 8 // at least some of the ALTs must match
65 #define COLLAPSE_BOTH (COLLAPSE_SNPS|COLLAPSE_INDELS)
66 
67 typedef struct _bcf_sr_regions_t
68 {
69  // for reading from tabix-indexed file (big data)
70  tbx_t *tbx; // tabix index
71  hts_itr_t *itr; // tabix iterator
72  kstring_t line; // holder of the current line, set only when reading from tabix-indexed files
74  char *fname;
75  int is_bin; // is open in binary mode (tabix access)
76  char **als; // parsed alleles if targets_als set and _regions_match_alleles called
77  kstring_t als_str; // block of parsed alleles
78  int nals, mals; // number of set alleles and the size of allocated array
79  int als_type; // alleles type, currently VCF_SNP or VCF_INDEL
80 
81  // user handler to deal with skipped regions without a counterpart in VCFs
82  void (*missed_reg_handler)(struct _bcf_sr_regions_t *, void *);
84 
85  // for in-memory regions (small data)
86  struct _region_t *regs; // the regions
87 
88  // shared by both tabix-index and in-memory regions
89  void *seq_hash; // keys: sequence names, values: index to seqs
90  char **seq_names; // sequence names
91  int nseqs; // number of sequences (chromosomes) in the file
92  int iseq; // current position: chr name, index to snames
93  int start, end; // current position: start, end of the region (0-based)
95 }
97 
98 typedef struct
99 {
105  const char *fname;
106  bcf1_t **buffer; // cached VCF records. First is the current record synced across the reader
107  int nbuffer, mbuffer; // number of cached records (including the current record); number of allocated records
108  int nfilter_ids, *filter_ids; // -1 for ".", otherwise filter id as returned by bcf_id2int
109  int *samples, n_smpl; // list of columns in the order consistent with bcf_srs_t.samples
110 }
111 bcf_sr_t;
112 
113 typedef enum
114 {
117 }
119 
120 typedef struct
121 {
122  // Parameters controlling the logic
123  int collapse; // How should the duplicate sites be treated. One of the COLLAPSE_* types above.
124  char *apply_filters; // If set, sites where none of the FILTER strings is listed
125  // will be skipped. Active only at the time of
126  // initialization, that is during the add_reader()
127  // calls. Therefore, each reader can be initialized with different
128  // filters.
129  int require_index; // Some tools do not need random access
130  int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1
131  int *has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query.
133 
134  // Auxiliary data
136  int nreaders;
137  int streaming; // reading mode: index-jumping or streaming
138  int explicit_regs; // was the list of regions se by bcf_sr_set_regions or guessed from tabix index?
139  char **samples; // List of samples
140  bcf_sr_regions_t *regions, *targets; // see bcf_sr_set_[targets|regions] for description
141  int targets_als; // subset to targets not only by position but also by alleles?
144  int n_smpl;
145 }
146 bcf_srs_t;
147 
148 #ifdef __cplusplus
149 extern "C" {
150 #endif
151 
153 bcf_srs_t *bcf_sr_init(void);
154 
156 void bcf_sr_destroy(bcf_srs_t *readers);
157 
158 char *bcf_sr_strerror(int errnum);
159 
160 
171 int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname);
172 void bcf_sr_remove_reader(bcf_srs_t *files, int i);
173 
182 int bcf_sr_next_line(bcf_srs_t *readers);
183 #define bcf_sr_has_line(readers, i) (readers)->has_line[i]
184 #define bcf_sr_get_line(_readers, i) ((_readers)->has_line[i] ? ((_readers)->readers[i].buffer[0]) : NULL)
185 #define bcf_sr_region_done(_readers,i) (!(_readers)->has_line[i] && !(_readers)->readers[i].nbuffer ? 1 : 0)
186 #define bcf_sr_get_header(_readers, i) (_readers)->readers[i].header
187 #define bcf_sr_get_reader(_readers, i) &((_readers)->readers[i])
188 
194 int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos);
195 
207 int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file);
208 
235 int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles);
236 int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file);
237 
238 
239 
240 /*
241  * bcf_sr_regions_init()
242  * @regions: regions can be either a comma-separated list of regions
243  * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or
244  * tab-delimited file (the default). Uncompressed files
245  * are stored in memory while bgzip-compressed and tabix-indexed
246  * region files are streamed.
247  * @is_file: 0: regions is a comma-separated list of regions
248  * (chr|chr:pos|chr:from-to|chr:from-)
249  * 1: VCF, BED or tab-delimited file
250  * @chr, from, to:
251  * Column indexes of chromosome, start position and end position
252  * in the tab-delimited file. The positions are 1-based and
253  * inclusive.
254  * These parameters are ignored when reading from VCF, BED or
255  * tabix-indexed files. When end position column is not present,
256  * supply 'from' in place of 'to'. When 'to' is negative, first
257  * abs(to) will be attempted and if that fails, 'from' will be used
258  * instead.
259  */
260 bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to);
262 
263 /*
264  * bcf_sr_regions_seek() - seek to the chromosome block
265  *
266  * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and
267  * reg->start,reg->end to -1.
268  */
269 int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr);
270 
271 /*
272  * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1
273  * when all regions have been read. The fields reg->seq, reg->start and
274  * reg->end are filled with the genomic coordinates on succes or with
275  * NULL,-1,-1 when no region is available. The coordinates are 0-based,
276  * inclusive.
277  */
279 
280 /*
281  * bcf_sr_regions_overlap() - checks if the interval <start,end> overlaps any of
282  * the regions, the coordinates are 0-based, inclusive. The coordinate queries
283  * must come in ascending order.
284  *
285  * Returns 0 if the position is in regions; -1 if the position is not in the
286  * regions and more regions exist; -2 if not in the regions and there are no more
287  * regions left.
288  */
289 int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end);
290 
291 /*
292  * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until
293  * all remaining records are processed.
294  */
296 
297 #ifdef __cplusplus
298 }
299 #endif
300 
301 #endif
char * fname
Definition: synced_bcf_reader.h:74
htsFile * file
Definition: synced_bcf_reader.h:73
Definition: synced_bcf_reader.c:44
Definition: hts.h:109
int targets_exclude
Definition: synced_bcf_reader.h:142
int is_file(char *fn)
Definition: files.c:57
int start
Definition: synced_bcf_reader.h:93
bcf1_t ** buffer
Definition: synced_bcf_reader.h:106
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
Definition: synced_bcf_reader.c:1089
void * seq_hash
Definition: synced_bcf_reader.h:89
int require_index
Definition: synced_bcf_reader.h:129
int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file)
Definition: synced_bcf_reader.c:732
int streaming
Definition: synced_bcf_reader.h:137
char ** seq_names
Definition: synced_bcf_reader.h:90
int nals
Definition: synced_bcf_reader.h:78
int end
Definition: synced_bcf_reader.h:93
int nreaders
Definition: synced_bcf_reader.h:136
struct _region_t * regs
Definition: synced_bcf_reader.h:86
htsFile * file
Definition: synced_bcf_reader.h:100
Definition: synced_bcf_reader.h:115
bcf_sr_regions_t * bcf_sr_regions_init(const char *regions, int is_file, int chr, int from, int to)
Definition: synced_bcf_reader.c:977
Definition: synced_bcf_reader.h:67
bcf_sr_t * readers
Definition: synced_bcf_reader.h:135
hts_itr_t * itr
Definition: synced_bcf_reader.h:71
int collapse
Definition: synced_bcf_reader.h:123
Definition: synced_bcf_reader.h:115
Definition: synced_bcf_reader.h:115
int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end)
Definition: synced_bcf_reader.c:1223
int nfilter_ids
Definition: synced_bcf_reader.h:108
region1_t * regs
Definition: synced_bcf_reader.c:46
int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos)
Definition: synced_bcf_reader.c:714
void(* missed_reg_handler)(struct _bcf_sr_regions_t *, void *)
Definition: synced_bcf_reader.h:82
int * has_line
Definition: synced_bcf_reader.h:131
void bcf_sr_regions_destroy(bcf_sr_regions_t *regions)
Definition: synced_bcf_reader.c:1044
int n_smpl
Definition: synced_bcf_reader.h:144
int nbuffer
Definition: synced_bcf_reader.h:107
int * samples
Definition: synced_bcf_reader.h:109
hts_itr_t * itr
Definition: synced_bcf_reader.h:104
Definition: synced_bcf_reader.h:98
int is_bin
Definition: synced_bcf_reader.h:75
int als_type
Definition: synced_bcf_reader.h:79
tbx_t * tbx
Definition: synced_bcf_reader.h:70
char ** samples
Definition: synced_bcf_reader.h:139
Definition: bgzf.h:69
int bcf_sr_next_line(bcf_srs_t *readers)
Definition: synced_bcf_reader.c:677
int explicit_regs
Definition: synced_bcf_reader.h:138
bcf_sr_regions_t * targets
Definition: synced_bcf_reader.h:140
Definition: vcf.h:194
Definition: hts.c:640
Definition: hts.h:321
int prev_seq
Definition: synced_bcf_reader.h:94
struct _bcf_sr_regions_t bcf_sr_regions_t
Definition: synced_bcf_reader.h:116
void bcf_sr_destroy(bcf_srs_t *readers)
Definition: synced_bcf_reader.c:265
const char * fname
Definition: synced_bcf_reader.h:105
int bcf_sr_add_reader(bcf_srs_t *readers, const char *fname)
Definition: synced_bcf_reader.c:132
int iseq
Definition: synced_bcf_reader.h:92
bcf_srs_t * bcf_sr_init(void)
Definition: synced_bcf_reader.c:245
int targets_als
Definition: synced_bcf_reader.h:141
bcf_sr_error
Definition: synced_bcf_reader.h:113
kstring_t tmps
Definition: synced_bcf_reader.h:143
int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file)
Definition: synced_bcf_reader.c:104
bcf_hdr_t * header
Definition: synced_bcf_reader.h:103
Definition: tbx.h:44
char ** als
Definition: synced_bcf_reader.h:76
void bcf_sr_remove_reader(bcf_srs_t *files, int i)
Definition: synced_bcf_reader.c:280
void * missed_reg_data
Definition: synced_bcf_reader.h:83
Definition: synced_bcf_reader.h:120
bcf_sr_error errnum
Definition: synced_bcf_reader.h:132
int mals
Definition: synced_bcf_reader.h:78
int max_unpack
Definition: synced_bcf_reader.h:130
kstring_t line
Definition: synced_bcf_reader.h:72
Definition: vcf.h:100
tbx_t * tbx_idx
Definition: synced_bcf_reader.h:101
int prev_start
Definition: synced_bcf_reader.h:94
char * bcf_sr_strerror(int errnum)
Definition: synced_bcf_reader.c:55
Definition: synced_bcf_reader.h:115
int nseqs
Definition: synced_bcf_reader.h:91
char * apply_filters
Definition: synced_bcf_reader.h:124
Definition: synced_bcf_reader.h:115
int bcf_sr_regions_seek(bcf_sr_regions_t *regions, const char *chr)
Definition: synced_bcf_reader.c:1069
kstring_t als_str
Definition: synced_bcf_reader.h:77
int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles)
Definition: synced_bcf_reader.c:118
hts_idx_t * bcf_idx
Definition: synced_bcf_reader.h:102
void bcf_sr_regions_flush(bcf_sr_regions_t *regs)
Definition: synced_bcf_reader.c:1251