1use core::panic;
2use flate2::read::GzDecoder;
3use ndarray::Array2;
4use serde_json::Value;
5use std::collections::HashMap;
6use std::collections::HashSet;
7use std::fs::File;
8use std::io::{self, BufRead, BufReader};
9
10#[derive(Debug)]
11pub struct GraphicalGenome {
12 pub anchor: HashMap<String, Value>,
13 pub edges: HashMap<String, Value>,
14 pub outgoing: HashMap<String, Vec<String>>,
15 pub incoming: HashMap<String, Vec<String>>,
16}
17
18pub fn add_unique(vec: &mut Vec<String>, item: String) {
19 if !vec.contains(&item) {
20 vec.push(item);
21 }
22}
23
24#[must_use]
38pub fn reverse_complement(kmer: &str) -> String {
39 kmer.chars()
40 .rev()
41 .map(|c| match c {
42 'A' => 'T',
43 'T' => 'A',
44 'C' => 'G',
45 'G' => 'C',
46 'N' => 'N',
47 _ => panic!("Unexpected character: {}", c),
48 })
49 .collect()
50}
51
52impl GraphicalGenome {
53 pub fn load_graph(filename: &str) -> io::Result<GraphicalGenome> {
75 let file = File::open(filename)?;
76 let reader: Box<dyn BufRead> = if filename.ends_with(".gz") {
77 Box::new(BufReader::new(GzDecoder::new(file)))
78 } else {
79 Box::new(BufReader::new(file))
80 };
81 let mut anchor_dict = HashMap::new();
82 let mut edge_dict = HashMap::new();
83 let mut outgoing: HashMap<String, Vec<String>> = HashMap::new();
84 let mut incoming: HashMap<String, Vec<String>> = HashMap::new();
85 for line in reader.lines() {
86 let line = line?.trim_end().to_string();
87 if line.starts_with('S') {
88 let itemlist: Vec<&str> = line.split('\t').collect();
89 let name = itemlist[1];
90 let seq = itemlist[2];
91 let annotation = &itemlist[3][5..];
92
93 let json_value: Value = serde_json::from_str(annotation).unwrap();
94
95 let mut value = json_value;
96
97 value["seq"] = Value::String(seq.to_string());
98
99 if name.starts_with('A') {
100 anchor_dict.insert(name.to_string(), value);
101 } else if name.starts_with('E') {
102 edge_dict.insert(name.to_string(), value);
103 }
104 } else if line.starts_with('L') {
105 let itemlist: Vec<&str> = line.split('\t').collect();
106 let src = itemlist[1];
107 let dst = itemlist[3];
108 outgoing
109 .entry(src.to_string())
110 .or_default()
111 .push(dst.to_string());
112 incoming
113 .entry(dst.to_string())
114 .or_default()
115 .push(src.to_string());
116 }
117 }
118 Ok(GraphicalGenome {
119 anchor: anchor_dict,
120 edges: edge_dict,
121 outgoing: outgoing,
122 incoming: incoming,
123 })
124 }
125
126 pub fn extract_single_sample_graph(
151 &self,
152 df_single_sample: &Array2<f64>,
153 anchorlist: &Vec<String>,
154 readset: &Vec<String>,
155 sample: &str,
156 ) -> io::Result<GraphicalGenome> {
157 let mut new_edges: HashMap<String, serde_json::Value> = HashMap::new();
158 let mut new_incoming = HashMap::new();
159 let mut new_outgoing = HashMap::new();
160
161 for (anchorindex, anchor) in anchorlist.iter().enumerate() {
164 let mut d: HashMap<usize, String> = HashMap::new();
165 if let Some(outgoinglist) = self.outgoing.get(anchor) {
166 d = outgoinglist
168 .iter()
169 .enumerate()
170 .map(|(index, value)| (index, value.to_string()))
171 .collect();
172 }
173
174 for (read_index, read) in readset.iter().enumerate() {
175 let edgeindex = df_single_sample[[read_index, anchorindex]].round() - 1.0;
176 #[allow(clippy::cast_possible_truncation)]
177 #[allow(clippy::cast_sign_loss)]
178 let usize_index = edgeindex as usize;
179 let edgename = d.get(&usize_index);
180 new_edges
182 .entry(edgename.unwrap().to_string())
183 .or_insert_with(|| serde_json::json!({}));
184 if let Some(edge_value) = self.edges.get(&(*edgename.as_ref().unwrap()).to_string())
185 {
186 if let Some(seq_value) = edge_value.get("seq") {
187 let new_edge_value = new_edges
188 .entry((*edgename.as_ref().unwrap()).to_string())
189 .or_insert_with(|| serde_json::json!({}));
190 new_edge_value["seq"] = seq_value.clone();
191 }
192 }
193
194 let edgename_str = (*edgename.as_ref().unwrap()).to_string();
195 let new_edge_value = new_edges
196 .entry(edgename_str.clone())
197 .or_insert_with(|| serde_json::json!({}));
198 if !new_edge_value.get("reads").is_some() {
199 new_edge_value["reads"] = serde_json::Value::Array(vec![]);
200 }
201 let reads_vec = new_edge_value
202 .get_mut("reads")
203 .unwrap()
204 .as_array_mut()
205 .unwrap();
206 reads_vec.push(serde_json::Value::String(read.to_string()));
207
208 if !new_edge_value.get("strain").is_some() {
210 new_edge_value["strain"] = serde_json::Value::Array(vec![]);
211 }
212 let strain_vec = new_edge_value
213 .get_mut("strain")
214 .unwrap()
215 .as_array_mut()
216 .unwrap();
217 if !strain_vec
218 .iter()
219 .any(|x| x == &serde_json::Value::String(sample.to_string()))
220 {
221 strain_vec.push(serde_json::Value::String(sample.to_string()));
222 }
223
224 if let Some(outgoing_list) = self
225 .outgoing
226 .get(&(*edgename.as_ref().unwrap()).to_string())
227 {
228 if let Some(dst) = outgoing_list.get(0) {
229 let incoming_list = new_incoming
230 .entry((*edgename.as_ref().unwrap()).to_string())
231 .or_default();
232 add_unique(incoming_list, anchor.to_string());
233
234 let incoming_dst_list = new_incoming.entry(dst.to_string()).or_default();
235 add_unique(incoming_dst_list, (*edgename.as_ref().unwrap()).to_string());
236 let outgoing_list = new_outgoing.entry(anchor.to_string()).or_default();
237 add_unique(outgoing_list, (*edgename.as_ref().unwrap()).to_string());
238 let outgoing_edgename_list = new_outgoing
239 .entry((*edgename.as_ref().unwrap()).to_string())
240 .or_default();
241 add_unique(outgoing_edgename_list, dst.to_string());
242 } else {
243 panic!("edge do not have outgoing anchor")
244 }
245 }
246 }
247 }
248 let new_graph = GraphicalGenome {
249 anchor: self.anchor.clone(),
250 edges: new_edges,
251 outgoing: new_outgoing,
252 incoming: new_incoming,
253 };
254 Ok(new_graph)
255 }
256}
257
258pub struct FindAllPathBetweenAnchors {
259 pub subpath: Vec<(Vec<String>, HashSet<String>)>,
260}
261
262impl FindAllPathBetweenAnchors {
263 #[must_use]
264 pub fn new(
265 graph: &GraphicalGenome,
266 start: &str,
267 end: &str,
268 read_sets: HashSet<String>,
269 ) -> Self {
270 let mut finder = FindAllPathBetweenAnchors {
271 subpath: Vec::new(),
272 };
273 finder.find_path(graph, start, end, &Vec::new(), 0, read_sets);
274 finder
275 }
276
277 pub fn find_path(
278 &mut self,
279 g: &GraphicalGenome,
280 start: &str,
281 end: &str,
282 sofar: &Vec<String>,
283 depth: usize,
284 readset: HashSet<String>,
285 ) {
286 if start == end {
287 let mut sofar1 = sofar.clone();
288 sofar1.push(end.to_string());
289 if !readset.is_empty() {
290 self.subpath.push((sofar1, readset));
291 }
292 return;
293 }
294
295 if readset.is_empty() || start == "SINK" {
296 return;
297 }
298
299 if !g.outgoing.contains_key(start) {
300 return;
301 }
302
303 let depth1 = depth + 1;
304
305 if let Some(outgoing) = g.outgoing.get(start) {
306 for dst in outgoing {
307 let mut readset1 = readset.clone();
308 if dst.starts_with("E") {
309 if let Some(edge_reads) = g
316 .edges
317 .get(dst)
318 .and_then(|e| e.get("reads").and_then(|r| r.as_array()))
319 {
320 readset1.retain(|read| edge_reads.iter().any(|r| r.as_str() == Some(read)));
321 }
322 }
323 let mut sofar1 = sofar.clone();
324 sofar1.push(start.to_string());
325 self.find_path(g, dst, end, &sofar1, depth1, readset1);
326 }
327 }
328 }
329}
330
331#[must_use]
333pub fn reconstruct_path_seq(graph: &GraphicalGenome, path: &[String]) -> String {
334 let mut seq = String::new();
335 for item in path {
336 if item.starts_with('A') {
337 if let Some(anchor) = graph.anchor.get(item) {
338 seq += &anchor["seq"].as_str().unwrap_or_default(); }
341 } else if item.starts_with("E") {
342 if let Some(edge) = graph.edges.get(item) {
343 seq += &edge["seq"].as_str().unwrap_or_default(); }
345 }
346 }
347 seq
348}