skydive/parse.rs
1use std::{collections::HashSet, io::BufRead, path::PathBuf};
2
3use anyhow::Result;
4use linked_hash_set::LinkedHashSet;
5use regex::Regex;
6use url::Url;
7
8use path_absolutize::Absolutize;
9
10/// Parse a list of loci strings into a `HashSet` of formatted loci.
11/// If a locus string is a file on disk, read its contents and parse each line as a locus.
12///
13/// # Arguments
14///
15/// * `loci_list` - A list of strings representing loci.
16/// * `padding` - A u64 representing the amount to extend the start and stop positions by.
17///
18/// # Returns
19///
20/// A `HashSet` of formatted loci.
21///
22/// # Errors
23///
24/// This function returns an error if it cannot parse a given locus.
25///
26/// # Panics
27///
28/// This function will panic if it cannot parse a given locus.
29/// It will also panic if it cannot read a given file path.
30///
31#[must_use]
32pub fn parse_loci(
33 loci_list: &Vec<String>,
34 padding: u64,
35) -> LinkedHashSet<(String, u64, u64, String)> {
36 // Initialize a HashSet to store unique loci after parsing
37 let mut loci = LinkedHashSet::new();
38
39 // Iterate over each locus in the provided list
40 for locus in loci_list {
41 // Check if the locus represents a file on disk
42 let path = PathBuf::from(locus);
43 if path.is_file() {
44 // If it's a file, read its contents and parse each line as a locus
45 let file = std::fs::File::open(&path).expect("Failed to open file");
46 let reader = std::io::BufReader::new(file);
47 for line in reader.lines() {
48 let line = line.expect("Failed to read line");
49
50 // Skip lines starting with '#'
51 if line.trim().starts_with('#') {
52 continue;
53 }
54
55 match parse_locus(&line.clone(), padding) {
56 Ok(l_fmt) => {
57 loci.insert(l_fmt);
58 }
59 Err(_) => {
60 panic!("Could not parse locus '{line}' from file '{locus}'.");
61 }
62 }
63 }
64 // Skip the rest of the loop iteration
65 continue;
66 } else {
67 // Attempt to parse the locus
68 match parse_locus(&locus.to_owned(), padding) {
69 Ok(l_fmt) => {
70 // If parsing is successful, insert the formatted locus into the HashSet
71 loci.insert(l_fmt);
72 }
73 Err(_) => {
74 // If parsing fails, panic and terminate the program, providing an error message
75 panic!("Could not parse locus '{locus}'.");
76 }
77 }
78 }
79 }
80
81 loci
82}
83
84/// Parse a locus string into a tuple of contig name, start position, stop position, and optional name.
85/// The locus string can be in the following formats:
86/// - chr:start-stop
87/// - chr:start-stop|name
88/// - chr start stop
89/// - chr start stop name
90///
91/// The start and stop positions are 1-based and inclusive.
92/// The optional padding parameter can be used to extend the start and stop positions by a specified amount.
93///
94/// # Arguments
95///
96/// * `locus` - A string representing a locus.
97/// * `padding` - A u64 representing the amount to extend the start and stop positions by.
98///
99/// # Returns
100///
101/// A tuple containing the contig name, start position, stop position, and optional name.
102///
103/// # Errors
104///
105/// This function returns an error if the locus format is incorrect.
106///
107/// # Panics
108///
109/// This function will panic if the locus format is incorrect.
110pub fn parse_locus(locus: &str, padding: u64) -> Result<(String, u64, u64, String)> {
111 // Regex to capture the contig name, start position, stop position, and optional name.
112 // Accepts:
113 // - chr:start-stop
114 // - chr:start-stop|name
115 // - chr start stop
116 // - chr start stop name
117 let re = Regex::new(r"(.*)[:\s]+(\d+)[-\s]+(\d+)(?:[|\s+](.*))?")?;
118
119 // Remove commas from the locus string
120 let locus = locus.replace(",", "");
121
122 if let Some(captures) = re.captures(&locus) {
123 let chr = captures.get(1).unwrap().as_str().to_string();
124 let start = captures.get(2).unwrap().as_str().parse::<u64>()? - padding;
125 let stop = captures.get(3).unwrap().as_str().parse::<u64>()? + padding;
126 let name = captures.get(4).map_or_else(
127 || format!("{chr}:{start}-{stop}"),
128 |m| m.as_str().to_string(),
129 );
130
131 if start > stop {
132 anyhow::bail!("Locus format for '{}' is incorrect. Start position ({}) is greater than stop position ({}).", locus, start, stop);
133 }
134
135 Ok((chr, start, stop, name))
136 } else {
137 anyhow::bail!(
138 "Locus format for '{}' is incorrect. It should be 'chr:start-stop', 'chr:start-stop|name', 'chr start stop', or 'chr start stop name'.",
139 locus
140 );
141 }
142}
143
144/// Parse a list of BAM file paths into a `HashSet` of URLs.
145/// If any of the files are a local file ending in .txt, assume it's a file of filenames.
146///
147/// # Arguments
148///
149/// * `bam_paths` - A list of BAM file paths.
150///
151/// # Returns
152///
153/// A `HashSet` of URLs.
154///
155/// # Errors
156///
157/// This function returns an error if it cannot parse a given file path.
158///
159/// # Panics
160///
161/// This function will panic if it cannot parse a given file path.
162pub fn parse_file_names(bam_paths: &[PathBuf]) -> HashSet<Url> {
163 // Convert the list of BAM file paths into a HashSet of URLs
164 let mut reads_urls: HashSet<Url> = bam_paths
165 .iter()
166 // Use filter_map to attempt to parse each path as a URL, and collect the successful ones
167 .filter_map(|path| {
168 let path_str = path.to_string_lossy();
169 if path_str.starts_with("gs://") {
170 Url::parse(&path_str).ok()
171 } else {
172 Url::from_file_path(path.absolutize().unwrap()).ok()
173 }
174 })
175 .collect();
176
177 // If any of the files are a local file ending in .txt, assume it's a file of filenames.
178 let mut local_file_contents = HashSet::new();
179 let mut to_remove = HashSet::new();
180 for url in &reads_urls {
181 if url.scheme() == "file" {
182 let path = url.to_file_path().unwrap();
183 if path.extension().and_then(std::ffi::OsStr::to_str) == Some("txt") {
184 if let Ok(file) = std::fs::File::open(&path) {
185 let reader = std::io::BufReader::new(file);
186 for line in reader.lines().map_while(Result::ok) {
187 let abs_path = PathBuf::from(line);
188 local_file_contents.insert(abs_path);
189 }
190 }
191
192 to_remove.insert(url.clone());
193 }
194 }
195 }
196
197 // Remove FOFN files from the set of BAM/CRAM files.
198 for url in to_remove.iter() {
199 let _ = reads_urls.remove(url);
200 }
201
202 // Add the files from the file of filenames to the full list of files.
203 reads_urls.extend(local_file_contents.into_iter().filter_map(|path| {
204 let path_str = path.to_string_lossy();
205 if path_str.starts_with("gs://") {
206 Url::parse(&path_str).ok()
207 } else {
208 Url::from_file_path(path.absolutize().unwrap()).ok()
209 }
210 }));
211
212 reads_urls
213}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the value `parse_locus(..).ok()` is expected to yield for a valid locus.
    fn parsed(chr: &str, start: u64, stop: u64, name: &str) -> Option<(String, u64, u64, String)> {
        Some((chr.to_string(), start, stop, name.to_string()))
    }

    #[test]
    fn test_parse_locus() {
        // (input, padding, expected result) for every locus that must parse.
        let accepted = vec![
            // Valid locus without padding
            ("chr1:1000-2000", 0, parsed("chr1", 1000, 2000, "chr1:1000-2000")),
            // Valid locus with padding
            ("chr2:5000-6000", 100, parsed("chr2", 4900, 6100, "chr2:4900-6100")),
            // Valid locus with name
            ("chr3:10000-20000|gene1", 0, parsed("chr3", 10000, 20000, "gene1")),
            // Valid locus with commas
            ("chr3:10,000-20,000|gene1", 0, parsed("chr3", 10000, 20000, "gene1")),
            // Combination of space and dash separators
            ("chr4 30000-40000", 0, parsed("chr4", 30000, 40000, "chr4:30000-40000")),
            // Valid locus with tab-separated fields
            ("chr7\t70000\t80000", 0, parsed("chr7", 70000, 80000, "chr7:70000-80000")),
            // Valid locus with tab-separated fields and name
            ("chr8\t90000\t100000\tgene2", 0, parsed("chr8", 90000, 100000, "gene2")),
            // Valid locus with mixed tab and colon separators
            ("chr9:110000\t120000", 0, parsed("chr9", 110000, 120000, "chr9:110000-120000")),
            // Contig name with dash in it
            (
                "chr10-A:130000-140000|chr10-A",
                0,
                parsed("chr10-A", 130000, 140000, "chr10-A"),
            ),
            // Locus with multiple colons and dashes
            (
                "chr22:42121531-42135680:1-14150",
                0,
                parsed(
                    "chr22:42121531-42135680",
                    1,
                    14150,
                    "chr22:42121531-42135680:1-14150",
                ),
            ),
        ];

        for (input, padding, want) in accepted {
            assert_eq!(parse_locus(input, padding).ok(), want);
        }

        // Loci that must be rejected.
        let rejected = vec![
            // Invalid format (non-numeric start position)
            "chr5:start-50000",
            // Invalid format (start position greater than end position)
            "chr6:60000-50000",
        ];

        for input in rejected {
            assert!(parse_locus(input, 0).is_err());
        }
    }
}