skydive/
agg.rs

1use core::panic;
2use flate2::read::GzDecoder;
3use ndarray::Array2;
4use serde_json::Value;
5use std::collections::HashMap;
6use std::collections::HashSet;
7use std::fs::File;
8use std::io::{self, BufRead, BufReader};
9
10#[derive(Debug)]
11pub struct GraphicalGenome {
12    pub anchor: HashMap<String, Value>,
13    pub edges: HashMap<String, Value>,
14    pub outgoing: HashMap<String, Vec<String>>,
15    pub incoming: HashMap<String, Vec<String>>,
16}
17
18pub fn add_unique(vec: &mut Vec<String>, item: String) {
19    if !vec.contains(&item) {
20        vec.push(item);
21    }
22}
23
24/// Reverse complement of a k-mer
25///
26/// # Arguments
27///
28/// * `kmer` - A string representing the k-mer.
29///
30/// # Returns
31///
32/// A string containing the reverse complement of the k-mer.
33///
34/// # Panics
35///
36/// If the k-mer contains an unexpected character.
37#[must_use]
38pub fn reverse_complement(kmer: &str) -> String {
39    kmer.chars()
40        .rev()
41        .map(|c| match c {
42            'A' => 'T',
43            'T' => 'A',
44            'C' => 'G',
45            'G' => 'C',
46            'N' => 'N',
47            _ => panic!("Unexpected character: {}", c),
48        })
49        .collect()
50}
51
52impl GraphicalGenome {
53    /// Load a graphical genome from a file
54    ///
55    /// # Arguments
56    ///
57    /// * `filename` - A string slice that holds the name of the file to be loaded
58    ///
59    /// # Returns
60    ///
61    /// A Result containing the graphical genome if the file was successfully loaded
62    ///
63    /// # Errors
64    ///
65    /// * The file cannot be opened (e.g., permission issues, file not found).
66    /// * The file format is invalid (e.g., not a valid GZIP file or has unexpected content).
67    ///
68    /// # Panics
69    ///
70    /// - The file cannot be opened.
71    /// - The file is not a valid GZIP-compressed file.
72    /// - The annotation string is not valid JSON.
73    /// - There is a memory allocation error or other unexpected issue with the `HashMap` operations.
74    pub fn load_graph(filename: &str) -> io::Result<GraphicalGenome> {
75        let file = File::open(filename)?;
76        let reader: Box<dyn BufRead> = if filename.ends_with(".gz") {
77            Box::new(BufReader::new(GzDecoder::new(file)))
78        } else {
79            Box::new(BufReader::new(file))
80        };
81        let mut anchor_dict = HashMap::new();
82        let mut edge_dict = HashMap::new();
83        let mut outgoing: HashMap<String, Vec<String>> = HashMap::new();
84        let mut incoming: HashMap<String, Vec<String>> = HashMap::new();
85        for line in reader.lines() {
86            let line = line?.trim_end().to_string();
87            if line.starts_with('S') {
88                let itemlist: Vec<&str> = line.split('\t').collect();
89                let name = itemlist[1];
90                let seq = itemlist[2];
91                let annotation = &itemlist[3][5..];
92
93                let json_value: Value = serde_json::from_str(annotation).unwrap();
94
95                let mut value = json_value;
96
97                value["seq"] = Value::String(seq.to_string());
98
99                if name.starts_with('A') {
100                    anchor_dict.insert(name.to_string(), value);
101                } else if name.starts_with('E') {
102                    edge_dict.insert(name.to_string(), value);
103                }
104            } else if line.starts_with('L') {
105                let itemlist: Vec<&str> = line.split('\t').collect();
106                let src = itemlist[1];
107                let dst = itemlist[3];
108                outgoing
109                    .entry(src.to_string())
110                    .or_default()
111                    .push(dst.to_string());
112                incoming
113                    .entry(dst.to_string())
114                    .or_default()
115                    .push(src.to_string());
116            }
117        }
118        Ok(GraphicalGenome {
119            anchor: anchor_dict,
120            edges: edge_dict,
121            outgoing: outgoing,
122            incoming: incoming,
123        })
124    }
125
126    /// Method to extract a single sample graph
127    ///
128    /// # Arguments
129    ///
130    /// * `df_single_sample` - A 2D array containing the imputed data matrix
131    /// * `anchorlist` - A vector of strings containing the anchor names
132    /// * `readset` - A vector of strings containing the read names
133    /// * `sample` - A string slice containing the sample name
134    ///
135    /// # Returns
136    ///
137    /// A Result containing the graphical genome of the single sample
138    ///
139    /// # Errors
140    ///
141    /// * The `df_single_sample` array has an invalid shape.
142    /// * The anchor or read indices are invalid.
143    /// * There is a memory allocation error.
144    /// * The JSON data is invalid.
145    /// * There is an unexpected data structure or other internal error.
146    ///
147    /// # Panics
148    ///
149    /// If the edge do not have outgoing anchor
150    pub fn extract_single_sample_graph(
151        &self,
152        df_single_sample: &Array2<f64>,
153        anchorlist: &Vec<String>,
154        readset: &Vec<String>,
155        sample: &str,
156    ) -> io::Result<GraphicalGenome> {
157        let mut new_edges: HashMap<String, serde_json::Value> = HashMap::new();
158        let mut new_incoming = HashMap::new();
159        let mut new_outgoing = HashMap::new();
160
161        // be careful about the anchor and read index,
162        // make sure it matches with the imputed data matrix
163        for (anchorindex, anchor) in anchorlist.iter().enumerate() {
164            let mut d: HashMap<usize, String> = HashMap::new();
165            if let Some(outgoinglist) = self.outgoing.get(anchor) {
166                // Update the existing `d` variable instead of declaring a new one
167                d = outgoinglist
168                    .iter()
169                    .enumerate()
170                    .map(|(index, value)| (index, value.to_string()))
171                    .collect();
172            }
173
174            for (read_index, read) in readset.iter().enumerate() {
175                let edgeindex = df_single_sample[[read_index, anchorindex]].round() - 1.0;
176                #[allow(clippy::cast_possible_truncation)]
177                #[allow(clippy::cast_sign_loss)]
178                let usize_index = edgeindex as usize;
179                let edgename = d.get(&usize_index);
180                // println!("usize_index: {}, d: {:?}, edgename: {:?}", usize_index, &d, edgename); // Assuming new_edges is a HashMap<String, SomeType>
181                new_edges
182                    .entry(edgename.unwrap().to_string())
183                    .or_insert_with(|| serde_json::json!({}));
184                if let Some(edge_value) = self.edges.get(&(*edgename.as_ref().unwrap()).to_string())
185                {
186                    if let Some(seq_value) = edge_value.get("seq") {
187                        let new_edge_value = new_edges
188                            .entry((*edgename.as_ref().unwrap()).to_string())
189                            .or_insert_with(|| serde_json::json!({}));
190                        new_edge_value["seq"] = seq_value.clone();
191                    }
192                }
193
194                let edgename_str = (*edgename.as_ref().unwrap()).to_string();
195                let new_edge_value = new_edges
196                    .entry(edgename_str.clone())
197                    .or_insert_with(|| serde_json::json!({}));
198                if !new_edge_value.get("reads").is_some() {
199                    new_edge_value["reads"] = serde_json::Value::Array(vec![]);
200                }
201                let reads_vec = new_edge_value
202                    .get_mut("reads")
203                    .unwrap()
204                    .as_array_mut()
205                    .unwrap();
206                reads_vec.push(serde_json::Value::String(read.to_string()));
207
208                // let mut new_edge_value = new_edges.entry(edgename_str.clone()).or_insert_with(|| serde_json::json!({}));
209                if !new_edge_value.get("strain").is_some() {
210                    new_edge_value["strain"] = serde_json::Value::Array(vec![]);
211                }
212                let strain_vec = new_edge_value
213                    .get_mut("strain")
214                    .unwrap()
215                    .as_array_mut()
216                    .unwrap();
217                if !strain_vec
218                    .iter()
219                    .any(|x| x == &serde_json::Value::String(sample.to_string()))
220                {
221                    strain_vec.push(serde_json::Value::String(sample.to_string()));
222                }
223
224                if let Some(outgoing_list) = self
225                    .outgoing
226                    .get(&(*edgename.as_ref().unwrap()).to_string())
227                {
228                    if let Some(dst) = outgoing_list.get(0) {
229                        let incoming_list = new_incoming
230                            .entry((*edgename.as_ref().unwrap()).to_string())
231                            .or_default();
232                        add_unique(incoming_list, anchor.to_string());
233
234                        let incoming_dst_list = new_incoming.entry(dst.to_string()).or_default();
235                        add_unique(incoming_dst_list, (*edgename.as_ref().unwrap()).to_string());
236                        let outgoing_list = new_outgoing.entry(anchor.to_string()).or_default();
237                        add_unique(outgoing_list, (*edgename.as_ref().unwrap()).to_string());
238                        let outgoing_edgename_list = new_outgoing
239                            .entry((*edgename.as_ref().unwrap()).to_string())
240                            .or_default();
241                        add_unique(outgoing_edgename_list, dst.to_string());
242                    } else {
243                        panic!("edge do not have outgoing anchor")
244                    }
245                }
246            }
247        }
248        let new_graph = GraphicalGenome {
249            anchor: self.anchor.clone(),
250            edges: new_edges,
251            outgoing: new_outgoing,
252            incoming: new_incoming,
253        };
254        Ok(new_graph)
255    }
256}
257
258pub struct FindAllPathBetweenAnchors {
259    pub subpath: Vec<(Vec<String>, HashSet<String>)>,
260}
261
262impl FindAllPathBetweenAnchors {
263    #[must_use]
264    pub fn new(
265        graph: &GraphicalGenome,
266        start: &str,
267        end: &str,
268        read_sets: HashSet<String>,
269    ) -> Self {
270        let mut finder = FindAllPathBetweenAnchors {
271            subpath: Vec::new(),
272        };
273        finder.find_path(graph, start, end, &Vec::new(), 0, read_sets);
274        finder
275    }
276
277    pub fn find_path(
278        &mut self,
279        g: &GraphicalGenome,
280        start: &str,
281        end: &str,
282        sofar: &Vec<String>,
283        depth: usize,
284        readset: HashSet<String>,
285    ) {
286        if start == end {
287            let mut sofar1 = sofar.clone();
288            sofar1.push(end.to_string());
289            if !readset.is_empty() {
290                self.subpath.push((sofar1, readset));
291            }
292            return;
293        }
294
295        if readset.is_empty() || start == "SINK" {
296            return;
297        }
298
299        if !g.outgoing.contains_key(start) {
300            return;
301        }
302
303        let depth1 = depth + 1;
304
305        if let Some(outgoing) = g.outgoing.get(start) {
306            for dst in outgoing {
307                let mut readset1 = readset.clone();
308                if dst.starts_with("E") {
309                    // let edge_reads: HashSet<String> = g.edges.get(dst)
310                    //                                         .and_then(|edge| edge.get("reads").and_then(|r| r.as_array()))
311                    //                                         .map(|reads| reads.iter().filter_map(|read| read.as_str().map(String::from)).collect())
312                    //                                         .unwrap_or_default();
313
314                    // let readset1: HashSet<String> = readset.intersection(&edge_reads).cloned().collect();
315                    if let Some(edge_reads) = g
316                        .edges
317                        .get(dst)
318                        .and_then(|e| e.get("reads").and_then(|r| r.as_array()))
319                    {
320                        readset1.retain(|read| edge_reads.iter().any(|r| r.as_str() == Some(read)));
321                    }
322                }
323                let mut sofar1 = sofar.clone();
324                sofar1.push(start.to_string());
325                self.find_path(g, dst, end, &sofar1, depth1, readset1);
326            }
327        }
328    }
329}
330
331// Series parallele graph
332#[must_use]
333pub fn reconstruct_path_seq(graph: &GraphicalGenome, path: &[String]) -> String {
334    let mut seq = String::new();
335    for item in path {
336        if item.starts_with('A') {
337            if let Some(anchor) = graph.anchor.get(item) {
338                seq += &anchor["seq"].as_str().unwrap_or_default(); // Assuming `anchor` is a HashMap and "seq" is a key
339                                                                    // println!("{:?}", anchor["seq"].as_str().unwrap_or_default());
340            }
341        } else if item.starts_with("E") {
342            if let Some(edge) = graph.edges.get(item) {
343                seq += &edge["seq"].as_str().unwrap_or_default(); // Assuming `edges` is a HashMap and "seq" is a key
344            }
345        }
346    }
347    seq
348}