skydive/
parse.rs

1use std::{collections::HashSet, io::BufRead, path::PathBuf};
2
3use anyhow::Result;
4use linked_hash_set::LinkedHashSet;
5use regex::Regex;
6use url::Url;
7
8use path_absolutize::Absolutize;
9
10/// Parse a list of loci strings into a `HashSet` of formatted loci.
11/// If a locus string is a file on disk, read its contents and parse each line as a locus.
12///
13/// # Arguments
14///
15/// * `loci_list` - A list of strings representing loci.
16/// * `padding` - A u64 representing the amount to extend the start and stop positions by.
17///
18/// # Returns
19///
20/// A `HashSet` of formatted loci.
21///
22/// # Errors
23///
24/// This function returns an error if it cannot parse a given locus.
25///
26/// # Panics
27///
28/// This function will panic if it cannot parse a given locus.
29/// It will also panic if it cannot read a given file path.
30///
31#[must_use]
32pub fn parse_loci(
33    loci_list: &Vec<String>,
34    padding: u64,
35) -> LinkedHashSet<(String, u64, u64, String)> {
36    // Initialize a HashSet to store unique loci after parsing
37    let mut loci = LinkedHashSet::new();
38
39    // Iterate over each locus in the provided list
40    for locus in loci_list {
41        // Check if the locus represents a file on disk
42        let path = PathBuf::from(locus);
43        if path.is_file() {
44            // If it's a file, read its contents and parse each line as a locus
45            let file = std::fs::File::open(&path).expect("Failed to open file");
46            let reader = std::io::BufReader::new(file);
47            for line in reader.lines() {
48                let line = line.expect("Failed to read line");
49
50                // Skip lines starting with '#'
51                if line.trim().starts_with('#') {
52                    continue;
53                }
54
55                match parse_locus(&line.clone(), padding) {
56                    Ok(l_fmt) => {
57                        loci.insert(l_fmt);
58                    }
59                    Err(_) => {
60                        panic!("Could not parse locus '{line}' from file '{locus}'.");
61                    }
62                }
63            }
64            // Skip the rest of the loop iteration
65            continue;
66        } else {
67            // Attempt to parse the locus
68            match parse_locus(&locus.to_owned(), padding) {
69                Ok(l_fmt) => {
70                    // If parsing is successful, insert the formatted locus into the HashSet
71                    loci.insert(l_fmt);
72                }
73                Err(_) => {
74                    // If parsing fails, panic and terminate the program, providing an error message
75                    panic!("Could not parse locus '{locus}'.");
76                }
77            }
78        }
79    }
80
81    loci
82}
83
84/// Parse a locus string into a tuple of contig name, start position, stop position, and optional name.
85/// The locus string can be in the following formats:
86/// - chr:start-stop
87/// - chr:start-stop|name
88/// - chr start stop
89/// - chr start stop name
90///
91/// The start and stop positions are 1-based and inclusive.
92/// The optional padding parameter can be used to extend the start and stop positions by a specified amount.
93///
94/// # Arguments
95///
96/// * `locus` - A string representing a locus.
97/// * `padding` - A u64 representing the amount to extend the start and stop positions by.
98///
99/// # Returns
100///
101/// A tuple containing the contig name, start position, stop position, and optional name.
102///
103/// # Errors
104///
105/// This function returns an error if the locus format is incorrect.
106///
107/// # Panics
108///
109/// This function will panic if the locus format is incorrect.
110pub fn parse_locus(locus: &str, padding: u64) -> Result<(String, u64, u64, String)> {
111    // Regex to capture the contig name, start position, stop position, and optional name.
112    // Accepts:
113    // - chr:start-stop
114    // - chr:start-stop|name
115    // - chr start stop
116    // - chr start stop name
117    let re = Regex::new(r"(.*)[:\s]+(\d+)[-\s]+(\d+)(?:[|\s+](.*))?")?;
118
119    // Remove commas from the locus string
120    let locus = locus.replace(",", "");
121
122    if let Some(captures) = re.captures(&locus) {
123        let chr = captures.get(1).unwrap().as_str().to_string();
124        let start = captures.get(2).unwrap().as_str().parse::<u64>()? - padding;
125        let stop = captures.get(3).unwrap().as_str().parse::<u64>()? + padding;
126        let name = captures.get(4).map_or_else(
127            || format!("{chr}:{start}-{stop}"),
128            |m| m.as_str().to_string(),
129        );
130
131        if start > stop {
132            anyhow::bail!("Locus format for '{}' is incorrect. Start position ({}) is greater than stop position ({}).", locus, start, stop);
133        }
134
135        Ok((chr, start, stop, name))
136    } else {
137        anyhow::bail!(
138            "Locus format for '{}' is incorrect. It should be 'chr:start-stop', 'chr:start-stop|name', 'chr start stop', or 'chr start stop name'.",
139            locus
140        );
141    }
142}
143
144/// Parse a list of BAM file paths into a `HashSet` of URLs.
145/// If any of the files are a local file ending in .txt, assume it's a file of filenames.
146///
147/// # Arguments
148///
149/// * `bam_paths` - A list of BAM file paths.
150///
151/// # Returns
152///
153/// A `HashSet` of URLs.
154///
155/// # Errors
156///
157/// This function returns an error if it cannot parse a given file path.
158///
159/// # Panics
160///
161/// This function will panic if it cannot parse a given file path.
162pub fn parse_file_names(bam_paths: &[PathBuf]) -> HashSet<Url> {
163    // Convert the list of BAM file paths into a HashSet of URLs
164    let mut reads_urls: HashSet<Url> = bam_paths
165        .iter()
166        // Use filter_map to attempt to parse each path as a URL, and collect the successful ones
167        .filter_map(|path| {
168            let path_str = path.to_string_lossy();
169            if path_str.starts_with("gs://") {
170                Url::parse(&path_str).ok()
171            } else {
172                Url::from_file_path(path.absolutize().unwrap()).ok()
173            }
174        })
175        .collect();
176
177    // If any of the files are a local file ending in .txt, assume it's a file of filenames.
178    let mut local_file_contents = HashSet::new();
179    let mut to_remove = HashSet::new();
180    for url in &reads_urls {
181        if url.scheme() == "file" {
182            let path = url.to_file_path().unwrap();
183            if path.extension().and_then(std::ffi::OsStr::to_str) == Some("txt") {
184                if let Ok(file) = std::fs::File::open(&path) {
185                    let reader = std::io::BufReader::new(file);
186                    for line in reader.lines().map_while(Result::ok) {
187                        let abs_path = PathBuf::from(line);
188                        local_file_contents.insert(abs_path);
189                    }
190                }
191
192                to_remove.insert(url.clone());
193            }
194        }
195    }
196
197    // Remove FOFN files from the set of BAM/CRAM files.
198    for url in to_remove.iter() {
199        let _ = reads_urls.remove(url);
200    }
201
202    // Add the files from the file of filenames to the full list of files.
203    reads_urls.extend(local_file_contents.into_iter().filter_map(|path| {
204        let path_str = path.to_string_lossy();
205        if path_str.starts_with("gs://") {
206            Url::parse(&path_str).ok()
207        } else {
208            Url::from_file_path(path.absolutize().unwrap()).ok()
209        }
210    }));
211
212    reads_urls
213}
214
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the value `parse_locus(..).ok()` yields on success.
    fn parsed(chr: &str, start: u64, stop: u64, name: &str) -> Option<(String, u64, u64, String)> {
        Some((chr.to_string(), start, stop, name.to_string()))
    }

    #[test]
    fn test_parse_locus() {
        // Each row is (input, padding, expected result).
        let accepted = vec![
            // Valid locus without padding
            (
                "chr1:1000-2000",
                0,
                parsed("chr1", 1000, 2000, "chr1:1000-2000"),
            ),
            // Valid locus with padding
            (
                "chr2:5000-6000",
                100,
                parsed("chr2", 4900, 6100, "chr2:4900-6100"),
            ),
            // Valid locus with name
            (
                "chr3:10000-20000|gene1",
                0,
                parsed("chr3", 10000, 20000, "gene1"),
            ),
            // Valid locus with commas
            (
                "chr3:10,000-20,000|gene1",
                0,
                parsed("chr3", 10000, 20000, "gene1"),
            ),
            // Combination of space and colon separators
            (
                "chr4 30000-40000",
                0,
                parsed("chr4", 30000, 40000, "chr4:30000-40000"),
            ),
            // Valid locus with tab-separated fields
            (
                "chr7\t70000\t80000",
                0,
                parsed("chr7", 70000, 80000, "chr7:70000-80000"),
            ),
            // Valid locus with tab-separated fields and name
            (
                "chr8\t90000\t100000\tgene2",
                0,
                parsed("chr8", 90000, 100000, "gene2"),
            ),
            // Valid locus with mixed tab and colon separators
            (
                "chr9:110000\t120000",
                0,
                parsed("chr9", 110000, 120000, "chr9:110000-120000"),
            ),
            // Contig name with dash in it
            (
                "chr10-A:130000-140000|chr10-A",
                0,
                parsed("chr10-A", 130000, 140000, "chr10-A"),
            ),
            // Locus with multiple colons and dashes
            (
                "chr22:42121531-42135680:1-14150",
                0,
                parsed(
                    "chr22:42121531-42135680",
                    1,
                    14150,
                    "chr22:42121531-42135680:1-14150",
                ),
            ),
        ];

        for (input, padding, expected) in accepted {
            assert_eq!(parse_locus(input, padding).ok(), expected);
        }

        let rejected = vec![
            // Invalid format (non-numeric start position)
            "chr5:start-50000",
            // Invalid format (start position greater than end position)
            "chr6:60000-50000",
        ];

        for input in rejected {
            assert!(parse_locus(input, 0).is_err());
        }
    }
}