skydive/
ldbg.rs

1use anyhow::Result;
2use itertools::Itertools;
3use linked_hash_map::LinkedHashMap;
4
5use std::collections::{BTreeMap, HashMap, HashSet};
6use std::fs::File;
7use std::io::Write;
8
9use parquet::data_type::AsBytes;
10
11use needletail::sequence::complement;
12use needletail::Sequence;
13use std::path::PathBuf;
14
15use petgraph::graph::{DiGraph, NodeIndex};
16use petgraph::visit::EdgeRef;
17
18use indicatif::{ParallelProgressIterator, ProgressIterator};
19use rayon::iter::IntoParallelRefIterator;
20use rayon::iter::ParallelIterator;
21
22use gbdt::decision_tree::Data;
23use gbdt::gradient_boost::GBDT;
24
25use bio::alignment::distance::levenshtein;
26
27use crate::edges::Edges;
28use crate::link::Link;
29use crate::record::Record;
30use crate::wmec::WMECData;
31
32type KmerGraph = HashMap<Vec<u8>, Record>;
33type KmerScores = HashMap<Vec<u8>, f32>;
34type Links = HashMap<Vec<u8>, HashMap<Link, u16>>;
35type Sources = HashMap<Vec<u8>, Vec<usize>>;
36type KmerNoise = HashSet<Vec<u8>>;
37
38/// Represents a linked de Bruijn graph with a k-mer size specified at construction time.
39#[derive(Debug, Clone)]
40pub struct LdBG {
41    pub name: String,
42    pub kmer_size: usize,
43    pub kmers: KmerGraph,
44    pub scores: KmerScores,
45    pub links: Links,
46    pub sources: Sources,
47    pub noise: KmerNoise,
48    pub verbose: bool,
49}
50
51impl LdBG {
52    #[must_use]
53    pub fn new(name: String, kmer_size: usize) -> Self {
54        LdBG {
55            name,
56            kmer_size,
57            kmers: KmerGraph::new(),
58            scores: KmerScores::new(),
59            links: Links::new(),
60            sources: Sources::new(),
61            noise: KmerNoise::new(),
62            verbose: false,
63        }
64    }
65
66    /// Create a de Bruijn graph (and optional links) from a file path.
67    ///
68    /// # Arguments
69    ///
70    /// * `name` - A string representing the name of the graph.
71    /// * `kmer_size` - The k-mer size.
72    /// * `seq_path` - A path to the sequence file.
73    ///
74    /// # Returns
75    ///
76    /// A new instance of `LdBG`.
77    ///
78    /// # Panics
79    ///
80    /// This function will panic if it cannot open a file.
81    #[must_use]
82    pub fn from_file(name: String, kmer_size: usize, seq_path: &PathBuf) -> Self {
83        let reader = bio::io::fasta::Reader::from_file(seq_path).unwrap();
84        let all_reads: Vec<bio::io::fasta::Record> = reader.records().map(|r| r.unwrap()).collect();
85
86        let fwd_seqs: Vec<Vec<u8>> = all_reads
87            .iter()
88            .map(|r| r.seq().as_bytes().to_vec())
89            .collect();
90
91        let kmers = Self::build_graph(kmer_size, &fwd_seqs);
92        let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
93        let links: Links = Links::new();
94        let sources: Sources = Sources::new();
95        let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
96
97        LdBG {
98            name,
99            kmer_size,
100            kmers,
101            scores,
102            links,
103            sources,
104            noise,
105            verbose: false,
106        }
107    }
108
109    /// Create a de Bruijn graph (and optional links) from many file paths.
110    ///
111    /// # Arguments
112    ///
113    /// * `name` - A string representing the name of the graph.
114    /// * `kmer_size` - The k-mer size.
115    /// * `seq_paths` - Paths to sequence files.
116    ///
117    /// # Returns
118    ///
119    /// A new instance of `LdBG`.
120    ///
121    /// # Panics
122    ///
123    /// This function will panic if it cannot open a file.
124    #[must_use]
125    pub fn from_files(name: String, kmer_size: usize, seq_paths: &Vec<PathBuf>) -> Self {
126        let fwd_seqs = seq_paths
127            .iter()
128            .map(|p| {
129                let reader = bio::io::fasta::Reader::from_file(p).expect("Failed to open file");
130                reader
131                    .records()
132                    .filter_map(|r| r.ok())
133                    .map(|r| r.seq().to_vec())
134                    .collect::<Vec<Vec<u8>>>()
135            })
136            .flatten()
137            .collect();
138
139        let kmers = Self::build_graph(kmer_size, &fwd_seqs);
140        let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
141        let links: Links = Links::new();
142        let sources: Sources = Sources::new();
143        let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
144
145        LdBG {
146            name,
147            kmer_size,
148            kmers,
149            scores,
150            links,
151            sources,
152            noise,
153            verbose: false,
154        }
155    }
156
157    /// Create a de Bruijn graph (and optional links) from a sequence.
158    ///
159    /// # Arguments
160    ///
161    /// * `name` - A string representing the name of the graph.
162    /// * `kmer_size` - The k-mer size.
163    /// * `fwd_seq` - A forward sequence.
164    ///
165    /// # Returns
166    ///
167    /// A new instance of `LdBG`.
168    #[must_use]
169    pub fn from_sequence(name: String, kmer_size: usize, fwd_seq: &Vec<u8>) -> Self {
170        let fwd_seqs = vec![fwd_seq.clone()];
171
172        let kmers = Self::build_graph(kmer_size, &fwd_seqs);
173        let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
174        let links: Links = Links::new();
175        let sources: Sources = Sources::new();
176        let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
177
178        LdBG {
179            name,
180            kmer_size,
181            kmers,
182            scores,
183            links,
184            sources,
185            noise,
186            verbose: false,
187        }
188    }
189
190    /// Create a de Bruijn graph (and optional links) from a list of sequences.
191    ///
192    /// # Arguments
193    ///
194    /// * `name` - A string representing the name of the graph.
195    /// * `kmer_size` - The k-mer size.
196    /// * `fwd_seqs` - A vector of forward sequences.
197    ///
198    /// # Returns
199    ///
200    /// A new instance of `LdBG`.
201    #[must_use]
202    pub fn from_sequences(name: String, kmer_size: usize, fwd_seqs: &Vec<Vec<u8>>) -> Self {
203        let kmers = Self::build_graph(kmer_size, fwd_seqs);
204        let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
205        let links: Links = Links::new();
206        let sources: Sources = Sources::new();
207        let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
208
209        LdBG {
210            name,
211            kmer_size,
212            kmers,
213            scores,
214            links,
215            sources,
216            noise,
217            verbose: false,
218        }
219    }
220
221    /// Get the name of the graph.
222    ///
223    /// # Returns
224    ///
225    /// A reference to the name of the graph.
226    #[must_use]
227    pub fn name(&self) -> &String {
228        &self.name
229    }
230
231    #[must_use]
232    pub fn verbose(mut self, verbose: bool) -> Self {
233        self.verbose = verbose;
234        self
235    }
236
237    fn find_noise(fwd_seqs: &Vec<Vec<u8>>, kmer_size: usize) -> HashSet<Vec<u8>> {
238        let mut noise: KmerNoise = KmerNoise::new();
239
240        for seq in fwd_seqs {
241            let mut kmer_positions = HashMap::new();
242            for (i, fw_kmer) in seq.windows(kmer_size).enumerate() {
243                let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer);
244                kmer_positions
245                    .entry(cn_kmer)
246                    .or_insert_with(Vec::new)
247                    .push(i);
248            }
249
250            for (cn_kmer, positions) in kmer_positions {
251                if positions.len() > 1 {
252                    noise.insert(cn_kmer);
253                }
254            }
255
256            for range in sdust::symmetric_dust(seq) {
257                if range.len() >= 1 {
258                    for fw_kmer in seq[range.start..range.end].windows(kmer_size) {
259                        let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer);
260
261                        noise.insert(cn_kmer);
262                    }
263                }
264            }
265        }
266
267        noise
268    }
269
270    /// Build a de Bruijn graph from a vector of sequences.
271    ///
272    /// # Arguments
273    ///
274    /// * `k` - The k-mer size.
275    /// * `fwd_seqs` - A vector of forward sequences.
276    ///
277    /// # Returns
278    ///
279    /// A k-mer graph.
280    fn build_graph(k: usize, fwd_seqs: &Vec<Vec<u8>>) -> KmerGraph {
281        let mut graph: KmerGraph = KmerGraph::new();
282
283        // Iterate over sequences
284        for fwd_seq in fwd_seqs {
285            if fwd_seq.len() < k + 1
286                || fwd_seq
287                    .iter()
288                    .any(|&b| !matches!(b, b'A' | b'C' | b'G' | b'T'))
289            {
290                continue;
291            }
292
293            // Iterate over k-mers
294            for i in 0..fwd_seq.len() - k + 1 {
295                let fw_kmer = &fwd_seq[i..i + k];
296
297                let prev = fwd_seq.get(i.wrapping_sub(1));
298                let fw_prev_base = *prev.unwrap_or(&b'.');
299
300                let next = fwd_seq.get(i + k);
301                let fw_next_base = *next.unwrap_or(&b'.');
302
303                Self::add_record_to_graph(&mut graph, fw_kmer, fw_prev_base, fw_next_base);
304            }
305        }
306
307        graph
308    }
309
310    /// Build the links for a de Bruijn graph from a vector of sequences.
311    ///
312    /// # Arguments
313    ///
314    /// * `k` - The k-mer size.
315    /// * `fwd_seqs` - A vector of forward sequences.
316    ///
317    /// # Returns
318    ///
319    /// A map of links.
320    #[must_use]
321    pub fn build_links(mut self, fwd_seqs: &Vec<Vec<u8>>, correct: bool) -> Self {
322        let progress_bar = if self.verbose {
323            crate::utils::default_bounded_progress_bar("Building links", fwd_seqs.len() as u64)
324        } else {
325            crate::utils::default_hidden_progress_bar()
326        };
327
328        let g = self.traverse_all_kmers();
329
330        let links: Links = fwd_seqs
331            .par_iter()
332            .progress_with(progress_bar)
333            .map(|fwd_seq| {
334                if correct {
335                    self.correct_seq(&g, fwd_seq)
336                } else {
337                    fwd_seq.clone()
338                }
339            })
340            .map(|fwd_seq| {
341                let mut local_links = Links::new();
342
343                let fw_seq = fwd_seq.clone();
344                let rc_seq = fw_seq.reverse_complement();
345
346                LdBG::add_record_to_links(&mut local_links, &fw_seq, self.kmer_size, &self.kmers);
347                LdBG::add_record_to_links(&mut local_links, &rc_seq, self.kmer_size, &self.kmers);
348
349                local_links
350            })
351            .reduce(Links::new, |mut acc, local_links| {
352                for (k, v) in local_links {
353                    acc.entry(k).or_default().extend(v);
354                }
355                acc
356            });
357
358        self.links = links;
359
360        self
361    }
362
363    /// Add a k-mer, the preceding base, and following base to the graph.
364    ///
365    /// # Arguments
366    ///
367    /// * `graph` - A mutable reference to the k-mer graph.
368    /// * `fw_kmer` - A slice representing the forward k-mer.
369    /// * `fw_prev_base` - The preceding base of the forward k-mer.
370    /// * `fw_next_base` - The following base of the forward k-mer.
371    fn add_record_to_graph(
372        graph: &mut KmerGraph,
373        fw_kmer: &[u8],
374        fw_prev_base: u8,
375        fw_next_base: u8,
376    ) {
377        let rc_kmer_vec = fw_kmer.reverse_complement();
378        let rc_kmer = rc_kmer_vec.as_bytes();
379
380        let (cn_kmer, can_prev_base, can_next_base, is_forward) = if fw_kmer < rc_kmer {
381            (fw_kmer, fw_prev_base, fw_next_base, true)
382        } else {
383            (
384                rc_kmer,
385                complement(fw_next_base),
386                complement(fw_prev_base),
387                false,
388            )
389        };
390
391        // If it's not already there, insert k-mer and empty record into k-mer map.
392        if !graph.contains_key(cn_kmer) {
393            graph.insert(cn_kmer.to_owned(), Record::new(0, None));
394        }
395
396        // Increment the canonical k-mer's coverage.
397        graph.get_mut(cn_kmer).unwrap().increment_coverage();
398
399        // Increment per-strand coverage.
400        if is_forward {
401            graph.get_mut(cn_kmer).unwrap().increment_fw_coverage();
402        } else {
403            graph.get_mut(cn_kmer).unwrap().increment_rc_coverage();
404        }
405
406        // Set incoming edge for canonical k-mer.
407        graph
408            .get_mut(cn_kmer)
409            .unwrap()
410            .set_incoming_edge(can_prev_base);
411
412        // Set outgoing edge for canonical k-mer.
413        graph
414            .get_mut(cn_kmer)
415            .unwrap()
416            .set_outgoing_edge(can_next_base);
417    }
418
419    /// Add all junction choices from a given sequence.
420    ///
421    /// # Arguments
422    ///
423    /// * `links` - A mutable reference to the links map.
424    /// * `seq` - A reference to the sequence.
425    /// * `k` - The k-mer size.
426    /// * `graph` - A reference to the k-mer graph.
427    /// * `reverse` - A boolean indicating whether to search in reverse.
428    /// * `fw` - A boolean indicating whether the sequence is forward.
429    fn add_record_to_links(links: &mut Links, seq: &[u8], k: usize, graph: &KmerGraph) {
430        if seq.len() < k + 1 {
431            return;
432        }
433
434        let range = (0..seq.len() - k + 1).collect::<Vec<_>>();
435
436        // Iterate over k-mers to find junctions.
437        for i in range {
438            let fw_kmer = &seq[i..i + k];
439
440            if LdBG::has_junction(graph, fw_kmer, true) {
441                if let Some((anchor_kmer_vec, index)) =
442                    Self::find_anchor_kmer(i, seq, k, graph, true)
443                {
444                    let anchor_kmer = anchor_kmer_vec.as_bytes();
445
446                    let cn_anchor_kmer_vec = crate::utils::canonicalize_kmer(anchor_kmer);
447                    let cn_anchor_kmer = cn_anchor_kmer_vec.as_bytes();
448
449                    // Populate link.
450                    let mut link = Link::new(anchor_kmer == cn_anchor_kmer);
451
452                    let sub_range = (index..seq.len() - k).collect::<Vec<_>>();
453
454                    for j in sub_range {
455                        let next_kmer = &seq[j..j + k];
456
457                        let has_junction = LdBG::has_junction(graph, next_kmer, false);
458                        if has_junction {
459                            let choice = seq[j + k];
460                            link.push_back(choice);
461                        }
462                    }
463
464                    if !link.junctions.is_empty() {
465                        // Add link to links map.
466                        if !links.contains_key(cn_anchor_kmer) {
467                            links.insert(cn_anchor_kmer.to_owned(), HashMap::new());
468                        }
469
470                        if !links.get(cn_anchor_kmer).unwrap().contains_key(&link) {
471                            links.get_mut(cn_anchor_kmer).unwrap().insert(link, 1);
472                        } else {
473                            let linkcov =
474                                *links.get_mut(cn_anchor_kmer).unwrap().get(&link).unwrap();
475
476                            links
477                                .get_mut(cn_anchor_kmer)
478                                .unwrap()
479                                .insert(link, linkcov.saturating_add(1));
480                        }
481                    }
482                }
483            }
484        }
485    }
486
487    /// Find an anchor k-mer in a sequence.
488    ///
489    /// # Arguments
490    ///
491    /// * `index` - The starting index in the sequence.
492    /// * `seq` - A reference to the sequence.
493    /// * `k` - The k-mer size.
494    /// * `graph` - A reference to the k-mer graph.
495    /// * `reverse` - A boolean indicating whether to search in reverse.
496    ///
497    /// # Returns
498    ///
499    /// An optional tuple containing the anchor k-mer and its index.
500    fn find_anchor_kmer(
501        index: usize,
502        seq: &[u8],
503        k: usize,
504        graph: &KmerGraph,
505        reverse: bool,
506    ) -> Option<(Vec<u8>, usize)> {
507        if index > 0 {
508            let mut index = index - 1;
509            loop {
510                if index == 0 {
511                    break;
512                }
513
514                let anchor_kmer = &seq[index..index + k];
515
516                if !LdBG::has_junction(graph, anchor_kmer, !reverse)
517                    || LdBG::has_junction(graph, anchor_kmer, reverse)
518                {
519                    return Some((anchor_kmer.to_vec(), index));
520                }
521
522                index -= 1;
523            }
524        }
525
526        None
527    }
528
529    /// Check if the given k-mer represents a junction (in the orientation of the given k-mer).
530    ///
531    /// # Arguments
532    ///
533    /// * `graph` - A reference to the k-mer graph.
534    /// * `kmer` - A slice representing the k-mer.
535    /// * `reverse` - A boolean indicating whether to check in reverse.
536    ///
537    /// # Returns
538    ///
539    /// A boolean indicating whether the k-mer is a junction.
540    fn has_junction(graph: &KmerGraph, kmer: &[u8], reverse: bool) -> bool {
541        let cn_kmer = crate::utils::canonicalize_kmer(kmer);
542
543        if let Some(r) = graph.get(&cn_kmer) {
544            let is_canonical = String::from_utf8_lossy(kmer) == String::from_utf8_lossy(&cn_kmer);
545            let (in_degree, out_degree) = (r.in_degree(), r.out_degree());
546
547            return if is_canonical {
548                if !reverse {
549                    out_degree > 1
550                } else {
551                    in_degree > 1
552                }
553            } else if !reverse {
554                in_degree > 1
555            } else {
556                out_degree > 1
557            };
558        }
559
560        false
561    }
562
563    fn next_kmers(&self, kmer: &[u8]) -> Vec<Vec<u8>> {
564        let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
565        let cn_kmer = cn_kmer_vec.as_bytes();
566
567        let ru = self.kmers.get(cn_kmer);
568        if ru.is_none() {
569            return vec![];
570        }
571
572        let r = ru.unwrap();
573
574        let next_kmers = if cn_kmer == kmer {
575            r.outgoing_edges()
576                .iter()
577                .map(|&e| {
578                    let mut next_kmer = kmer[1..].to_owned();
579                    next_kmer.push(e);
580                    next_kmer
581                })
582                .collect::<Vec<Vec<u8>>>()
583        } else {
584            r.incoming_edges()
585                .iter()
586                .map(|&e| {
587                    let mut next_kmer = kmer[1..].to_owned();
588                    next_kmer.push(complement(e));
589                    next_kmer
590                })
591                .collect::<Vec<Vec<u8>>>()
592        };
593
594        next_kmers
595    }
596
597    fn prev_kmers(&self, kmer: &[u8]) -> Vec<Vec<u8>> {
598        let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
599        let cn_kmer = cn_kmer_vec.as_bytes();
600
601        let ru = self.kmers.get(cn_kmer);
602        if ru.is_none() {
603            return vec![];
604        }
605
606        let r = ru.unwrap();
607
608        let prev_kmers = if cn_kmer == kmer {
609            r.incoming_edges()
610                .iter()
611                .map(|&e| {
612                    let mut prev_kmer = kmer[0..kmer.len() - 1].to_owned();
613                    prev_kmer.insert(0, e);
614                    prev_kmer
615                })
616                .collect::<Vec<Vec<u8>>>()
617        } else {
618            r.outgoing_edges()
619                .iter()
620                .map(|&e| {
621                    let mut prev_kmer = kmer[0..kmer.len() - 1].to_owned();
622                    prev_kmer.insert(0, complement(e));
623                    prev_kmer
624                })
625                .collect::<Vec<Vec<u8>>>()
626        };
627
628        prev_kmers
629    }
630
631    /// Starting at a given k-mer, get the next k-mer (or return None if there isn't a single outgoing edge).
632    ///
633    /// # Arguments
634    ///
635    /// * `kmer` - A slice representing the current k-mer.
636    /// * `links_in_scope` - A mutable reference to the links in scope.
637    ///
638    /// # Returns
639    ///
640    /// An optional vector containing the next k-mer.
641    fn next_kmer(&self, kmer: &[u8], links_in_scope: &mut Vec<Link>) -> Option<Vec<u8>> {
642        let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
643        let cn_kmer = cn_kmer_vec.as_bytes();
644
645        let ru = self.kmers.get(cn_kmer);
646        let r = ru?;
647
648        let next_base = if cn_kmer == kmer {
649            match r.out_degree() {
650                0 => return None,
651                1 => r.outgoing_edges()[0],
652                _ => {
653                    links_in_scope.retain(|link| {
654                        if let Some(&next_char) = link.front() {
655                            r.outgoing_edges().contains(&next_char)
656                        } else {
657                            false
658                        }
659                    });
660
661                    let consensus_junction_choice = *links_in_scope.first()?.front().unwrap();
662
663                    if r.outgoing_edges().contains(&consensus_junction_choice) {
664                        for link in links_in_scope.iter_mut() {
665                            link.pop_front();
666                        }
667                        links_in_scope.retain(|link| !link.is_empty());
668                        consensus_junction_choice
669                    } else {
670                        return None;
671                    }
672                }
673            }
674        } else {
675            match r.in_degree() {
676                0 => return None,
677                1 => complement(r.incoming_edges()[0]),
678                _ => {
679                    links_in_scope.retain(|link| {
680                        if let Some(&next_char) = link.front() {
681                            r.incoming_edges().contains(&complement(next_char))
682                        } else {
683                            false
684                        }
685                    });
686
687                    let consensus_junction_choice = *links_in_scope.first()?.front().unwrap();
688
689                    if r.incoming_edges()
690                        .contains(&complement(consensus_junction_choice))
691                    {
692                        for link in links_in_scope.iter_mut() {
693                            link.pop_front();
694                        }
695                        links_in_scope.retain(|link| !link.is_empty());
696                        consensus_junction_choice
697                    } else {
698                        return None;
699                    }
700                }
701            }
702        };
703
704        let next_kmer = [&kmer[1..kmer.len()], &[next_base]].concat();
705
706        Some(next_kmer)
707    }
708
709    /// Starting at a given k-mer, get the previous k-mer (or return None if there isn't a single incoming edge).
710    ///
711    /// # Arguments
712    ///
713    /// * `kmer` - A slice representing the current k-mer.
714    /// * `links_in_scope` - A mutable reference to the links in scope.
715    ///
716    /// # Returns
717    ///
718    /// An optional vector containing the previous k-mer.
719    fn prev_kmer(&self, kmer: &[u8], links_in_scope: &mut Vec<Link>) -> Option<Vec<u8>> {
720        let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
721        let cn_kmer = cn_kmer_vec.as_bytes();
722
723        let ru = self.kmers.get(cn_kmer);
724        let r = ru?;
725
726        let prev_base = if cn_kmer == kmer {
727            match r.in_degree() {
728                0 => return None,
729                1 => r.incoming_edges()[0],
730                _ => {
731                    links_in_scope.retain(|link| {
732                        if let Some(&next_char) = link.front() {
733                            r.incoming_edges().contains(&next_char)
734                        } else {
735                            false
736                        }
737                    });
738
739                    let consensus_junction_choice = match links_in_scope.first()?.front() {
740                        Some(choice) => *choice,
741                        None => return None,
742                    };
743
744                    if r.incoming_edges().contains(&consensus_junction_choice) {
745                        for link in links_in_scope.iter_mut() {
746                            link.pop_front();
747                        }
748                        links_in_scope.retain(|link| !link.is_empty());
749                        consensus_junction_choice
750                    } else {
751                        return None;
752                    }
753                }
754            }
755        } else {
756            match r.out_degree() {
757                0 => return None,
758                1 => complement(r.outgoing_edges()[0]),
759                _ => {
760                    links_in_scope.retain(|link| {
761                        if let Some(&next_char) = link.front() {
762                            r.outgoing_edges().contains(&complement(next_char))
763                        } else {
764                            false
765                        }
766                    });
767
768                    let consensus_junction_choice = match links_in_scope.first()?.front() {
769                        Some(choice) => *choice,
770                        None => return None,
771                    };
772
773                    if r.outgoing_edges()
774                        .contains(&complement(consensus_junction_choice))
775                    {
776                        for link in links_in_scope.iter_mut() {
777                            link.pop_front();
778                        }
779                        links_in_scope.retain(|link| !link.is_empty());
780                        consensus_junction_choice
781                    } else {
782                        return None;
783                    }
784                }
785            }
786        };
787
788        let prev_kmer = [&[prev_base], &kmer[0..kmer.len() - 1]].concat();
789
790        Some(prev_kmer)
791    }
792
793    pub fn remove(&mut self, kmer: &[u8]) -> Option<Record> {
794        let cn_kmer = crate::utils::canonicalize_kmer(kmer);
795        self.kmers.remove(&cn_kmer)
796    }
797
798    /// Score k-mers using a Gradient Boosting Decision Tree model.
799    ///
800    /// # Arguments
801    ///
802    /// * `model_path` - A path to the model file.
803    ///
804    /// # Returns
805    ///
806    /// A new instance of `LdBG` with updated k-mer scores.
807    ///
808    /// # Panics
809    ///
810    /// This function will panic if it cannot load the model from the specified path.
811    #[must_use]
812    pub fn score_kmers(mut self, model_path: &PathBuf) -> Self {
813        let gbdt = GBDT::load_model(model_path.to_str().unwrap()).unwrap();
814
815        self.scores = self
816            .kmers
817            .keys()
818            .map(|cn_kmer| {
819                let lcov = self.kmers.get(cn_kmer).unwrap().coverage();
820                let scov: u16 = 0;
821                let compressed_len = crate::utils::homopolymer_compressed(cn_kmer).len();
822
823                let data = Data::new_test_data(
824                    vec![
825                        lcov as f32,
826                        scov as f32,
827                        (cn_kmer.len() - compressed_len) as f32,
828                    ],
829                    Some(0.0),
830                );
831                let prediction = *gbdt.predict(&vec![data]).first().unwrap_or(&0.0);
832
833                (cn_kmer.clone(), prediction)
834            })
835            .collect();
836
837        self
838    }
839
840    pub fn infer_edges(&mut self) {
841        let mut kmers = KmerGraph::new();
842
843        self.kmers.iter().for_each(|(cn_kmer, record)| {
844            let mut new_record = Record::new(record.coverage(), Some(Edges::empty()));
845
846            for edge_base in vec![b'A', b'C', b'G', b'T'] {
847                let next_kmer = [&cn_kmer[1..], &[edge_base]].concat();
848                let next_cn_kmer = crate::utils::canonicalize_kmer(&next_kmer);
849
850                if self.kmers.contains_key(&next_cn_kmer) {
851                    new_record.set_outgoing_edge(edge_base);
852                }
853
854                let prev_kmer = [&[edge_base], &cn_kmer[0..cn_kmer.len() - 1]].concat();
855                let prev_cn_kmer = crate::utils::canonicalize_kmer(&prev_kmer);
856
857                if self.kmers.contains_key(&prev_cn_kmer) {
858                    new_record.set_incoming_edge(edge_base);
859                }
860            }
861
862            kmers.insert(cn_kmer.clone(), new_record);
863        });
864
865        self.kmers = kmers;
866    }
867
868    #[must_use]
869    pub fn correct_seqs(&self, seqs: &Vec<Vec<u8>>) -> Vec<Vec<u8>> {
870        let progress_bar = if self.verbose {
871            crate::utils::default_bounded_progress_bar("Correcting reads", seqs.len() as u64)
872        } else {
873            crate::utils::default_hidden_progress_bar()
874        };
875
876        let g = self.traverse_all_kmers();
877
878        seqs.par_iter()
879            .progress_with(progress_bar)
880            .map(|seq| self.correct_seq(&g, seq))
881            .collect::<Vec<Vec<u8>>>()
882    }
883
884    #[must_use]
885    pub fn correct_seq(&self, g: &DiGraph<String, f32>, seq: &[u8]) -> Vec<u8> {
886        let mut g_seq = g.clone();
887
888        let a_kmer_set = seq
889            .windows(self.kmer_size)
890            .map(|kmer| kmer.to_vec())
891            .collect::<HashSet<_>>();
892
893        let count = seq
894            .windows(self.kmer_size)
895            .filter_map(|fw_kmer| {
896                g.node_indices()
897                    .find(|&n| *g[n] == String::from_utf8_lossy(fw_kmer).to_string())
898            })
899            .count();
900
901        if count == 0 {
902            return seq.to_vec();
903        }
904
905        // Split sequence into k-mers and add nodes/edges to graph
906        for i in 0..seq.len() - self.kmer_size {
907            let current_kmer = &seq[i..i + self.kmer_size];
908            let next_kmer = &seq[i + 1..i + 1 + self.kmer_size];
909
910            let current_node = g_seq
911                .node_indices()
912                .find(|&n| *g_seq[n] == String::from_utf8_lossy(current_kmer).to_string())
913                .unwrap_or_else(|| {
914                    g_seq.add_node(String::from_utf8_lossy(current_kmer).to_string())
915                });
916            let next_node = g_seq
917                .node_indices()
918                .find(|&n| *g_seq[n] == String::from_utf8_lossy(next_kmer).to_string())
919                .unwrap_or_else(|| g_seq.add_node(String::from_utf8_lossy(next_kmer).to_string()));
920
921            if !g_seq.contains_edge(current_node, next_node) {
922                g_seq.add_edge(current_node, next_node, 1.0);
923            }
924        }
925
926        let sb = find_all_superbubbles(&g_seq);
927
928        let mut replacements = Vec::new();
929
930        for ((in_node, out_node), interior) in sb {
931            let kmer_in = g_seq[in_node].as_bytes();
932            let kmer_out = g_seq[out_node].as_bytes();
933
934            // let start_position = seq.windows(self.kmer_size).position(|w| w == kmer_in);
935            // let end_position = seq.windows(self.kmer_size).position(|w| w == kmer_out);
936
937            // crate::elog!("Superbubble: {} -> {} ({} interior nodes)", String::from_utf8_lossy(kmer_in), String::from_utf8_lossy(kmer_out), interior.len());
938            // crate::elog!(" - start: {:?}", start_position);
939            // crate::elog!(" -   end: {:?}", end_position);
940
941            if a_kmer_set.contains(&kmer_in.to_vec()) && a_kmer_set.contains(&kmer_out.to_vec()) {
942                // crate::elog!(" - contains k-mers");
943
944                let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
945                    &g_seq,
946                    in_node,
947                    out_node,
948                    0,
949                    Some(interior.len()),
950                );
951                let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
952                    &g_seq,
953                    out_node,
954                    in_node,
955                    0,
956                    Some(interior.len()),
957                );
958
959                let mut paths = paths_fwd
960                    .chain(paths_rev)
961                    .map(|path| {
962                        let score = path
963                            .iter()
964                            .map(|node| {
965                                let kmer = g_seq.node_weight(*node).unwrap().as_bytes();
966                                let cn_kmer = crate::utils::canonicalize_kmer(kmer);
967                                self.scores.get(&cn_kmer).unwrap_or(&0.0)
968                            })
969                            .fold(1.0, |acc, x| acc * x)
970                            .powf(1.0 / path.len() as f32);
971
972                        let seen_kmer_count = path
973                            .iter()
974                            .filter(|node| {
975                                let kmer = g_seq.node_weight(**node).unwrap().as_bytes();
976                                let cn_kmer = crate::utils::canonicalize_kmer(kmer);
977                                a_kmer_set.contains(&cn_kmer.to_vec())
978                            })
979                            .count();
980
981                        (path, score, seen_kmer_count)
982                    })
983                    .collect::<Vec<(Vec<NodeIndex>, f32, usize)>>();
984
985                paths.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
986
987                for (path, score, seen_kmer_count) in &paths {
988                    let mut contig = String::new();
989
990                    for node in path {
991                        if contig.is_empty() {
992                            contig.push_str(&String::from_utf8_lossy(g_seq[*node].as_bytes()));
993                        } else {
994                            contig.push_str(&String::from_utf8_lossy(
995                                &g_seq[*node].as_bytes()[self.kmer_size - 1..],
996                            ));
997                        }
998                    }
999
1000                    // crate::elog!("Contig: {} (score: {}, seen k-mer count: {})", contig, score, seen_kmer_count);
1001                }
1002
1003                let read_path = paths.first().cloned().unwrap();
1004                let read_kmers = read_path
1005                    .0
1006                    .iter()
1007                    .map(|node| g_seq[*node].as_bytes().to_vec())
1008                    .collect::<Vec<_>>();
1009
1010                /*
1011                for (path, _, _) in paths.iter().skip(1).take(1) {
1012                    let mut contig = String::new();
1013
1014                    for node in path {
1015                        if contig.is_empty() {
1016                            contig.push_str(&String::from_utf8_lossy(g_seq[*node].as_bytes()));
1017                        } else {
1018                            contig.push_str(&String::from_utf8_lossy(&g_seq[*node].as_bytes()[self.kmer_size - 1..]));
1019                        }
1020                    }
1021                }
1022                */
1023
1024                if paths.len() > 1 && read_path.1 < 0.1 {
1025                    let replacement_path = paths.get(1).cloned().unwrap();
1026                    let replacement_kmers = replacement_path
1027                        .0
1028                        .iter()
1029                        .map(|node| g_seq[*node].as_bytes().to_vec())
1030                        .collect::<Vec<_>>();
1031
1032                    replacements.push((read_kmers, replacement_kmers));
1033
1034                    // crate::elog!(" - replaced");
1035                }
1036            }
1037        }
1038
1039        let mut b = seq
1040            .windows(self.kmer_size)
1041            .map(|kmer| kmer.to_vec())
1042            .collect::<Vec<_>>();
1043
1044        for (read_path, replacement_path) in replacements {
1045            let start_pos = b.iter().position(|kmer| kmer == &read_path[0]);
1046            let end_pos = b
1047                .iter()
1048                .position(|kmer| kmer == &read_path[read_path.len() - 1]);
1049
1050            if let (Some(start_pos), Some(end_pos)) = (start_pos, end_pos) {
1051                let mut q1 = String::new();
1052                for kmer in read_path.iter() {
1053                    if q1.is_empty() {
1054                        q1.push_str(&String::from_utf8_lossy(kmer));
1055                    } else {
1056                        q1.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1057                    }
1058                }
1059
1060                let mut r1 = String::new();
1061                for kmer in replacement_path.iter() {
1062                    if r1.is_empty() {
1063                        r1.push_str(&String::from_utf8_lossy(kmer));
1064                    } else {
1065                        r1.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1066                    }
1067                }
1068
1069                // crate::elog!("Replacing {}-{} in read of length {}\n{}\n{}", start_pos, end_pos, b.len(), q1, r1);
1070
1071                if start_pos <= end_pos {
1072                    b.splice(start_pos..=end_pos, replacement_path);
1073                }
1074            }
1075        }
1076
1077        let mut c = String::new();
1078
1079        for kmer in b {
1080            if c.is_empty() {
1081                c.push_str(&String::from_utf8_lossy(&kmer));
1082            } else {
1083                c.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1084            }
1085        }
1086
1087        /*
1088        let gfa_output = PathBuf::from(format!("read.{}.gfa", 0));
1089        let _ = crate::utils::write_gfa(&mut File::create(gfa_output.clone()).unwrap(), &g_seq);
1090
1091        let csv_output = gfa_output.with_extension("csv");
1092        let mut csv_file = File::create(&csv_output).unwrap();
1093
1094        writeln!(csv_file, "node,label,kmer").unwrap();
1095
1096        for (node_index, node_label) in g_seq.node_indices().zip(g_seq.node_weights()) {
1097            let kmer = node_label.as_bytes();
1098            let cn_kmer = crate::utils::canonicalize_kmer(kmer);
1099            let score = (100.0 * *self.scores.get(&cn_kmer).unwrap_or(&0.0)) as u32;
1100            let sources = self.sources.get(&cn_kmer).unwrap_or(&vec![]).clone();
1101
1102            let source = if sources.len() == 1 { sources[0] } else { 2 };
1103
1104            writeln!(
1105                csv_file,
1106                "{},{},{}",
1107                node_index.index(),
1108                format!("{} ({})", source, score),
1109                node_label,
1110            )
1111            .unwrap();
1112        }
1113        */
1114
1115        c.as_bytes().to_vec()
1116    }
1117
1118    #[must_use]
1119    pub fn correct_seq_old(&self, seq: &[u8]) -> Vec<Vec<u8>> {
1120        let (mut graph, node_indices) = self.initialize_read_graph(seq);
1121
1122        for window in node_indices.windows(2) {
1123            let (from, to) = (window[0], window[1]);
1124
1125            let from_kmer = graph.node_weight(from).unwrap().as_bytes();
1126            let to_kmer = graph.node_weight(to).unwrap().as_bytes();
1127
1128            let from_cn_kmer = crate::utils::canonicalize_kmer(from_kmer);
1129            let to_cn_kmer = crate::utils::canonicalize_kmer(to_kmer);
1130
1131            let from_next_kmers = self.next_kmers(&from_kmer);
1132            let to_prev_kmers = self.prev_kmers(&to_kmer);
1133
1134            if self.kmers.contains_key(&from_cn_kmer)
1135                && self.kmers.contains_key(&to_cn_kmer)
1136                && from_next_kmers.contains(&to_kmer.to_vec())
1137                && to_prev_kmers.contains(&from_kmer.to_vec())
1138            {
1139                graph.add_edge(from, to, 1.0);
1140            } else {
1141                if let Ok((contig, forward)) = self.try_simple_paths(from_kmer, to_kmer) {
1142                    self.add_to_read_graph(contig, &mut graph, forward);
1143                } else if let Ok((contig, forward)) = self.try_alternate_paths(&graph, from, to, 5)
1144                {
1145                    self.add_to_read_graph(contig, &mut graph, forward);
1146                } else if let Ok((contig, forward)) =
1147                    self.try_overlapping_paths(&graph, from, to, 5)
1148                {
1149                    self.add_to_read_graph(contig, &mut graph, forward);
1150                } else if let Ok((contig, forward)) =
1151                    self.try_exploratory_paths(&graph, from, to, seq, 5, 100)
1152                {
1153                    self.add_to_read_graph(contig, &mut graph, forward);
1154                }
1155            }
1156        }
1157
1158        let initial_segments = traverse_read_graph(&graph, 0);
1159
1160        if let Ok(connecting_segments) = self.try_connecting_all_segments(&initial_segments) {
1161            for (contig, forward) in connecting_segments.into_iter() {
1162                self.add_to_read_graph(contig, &mut graph, forward);
1163            }
1164        }
1165
1166        // if graph.node_count() > 0 {
1167        //     self.remove_short_branches(&mut graph, 2 * self.kmer_size);
1168
1169        //     let dot = Dot::with_config(&graph, &[Config::EdgeNoLabel]);
1170        //     let mut file = File::create("read_graph.dot").expect("Unable to create file");
1171        //     write!(file, "{:?}", dot).expect("Unable to write data");
1172        // }
1173
1174        let corrected_segments = traverse_read_graph(&graph, self.kmer_size);
1175
1176        if corrected_segments.len() > 1 {
1177            vec![self.combine_read_segments(seq, &corrected_segments)]
1178        } else {
1179            corrected_segments
1180        }
1181    }
1182
1183    fn combine_read_segments(&self, seq: &[u8], corrected_segments: &Vec<Vec<u8>>) -> Vec<u8> {
1184        use spoa::AlignmentType;
1185        let mut la = spoa::AlignmentEngine::new(AlignmentType::kSW, 5, -10, -16, -12, -20, -8);
1186
1187        let mut sg = spoa::Graph::new();
1188        for lr_seq in vec![seq.to_vec()].iter().chain(corrected_segments.iter()) {
1189            let seq_cstr = std::ffi::CString::new(lr_seq.clone()).unwrap();
1190            let seq_qual = std::ffi::CString::new(vec![b'I'; lr_seq.len()]).unwrap();
1191            let a = la.align(seq_cstr.as_ref(), &sg);
1192
1193            sg.add_alignment(&a, seq_cstr.as_ref(), seq_qual.as_ref());
1194        }
1195
1196        let msa_cstrs = sg.multiple_sequence_alignment(false);
1197        let msa_strings = msa_cstrs
1198            .iter()
1199            .map(|cstr| cstr.to_str().unwrap().to_string())
1200            .map(|msa_string| {
1201                let leading_dashes = msa_string.chars().take_while(|&c| c == '-').count();
1202                let trailing_dashes = msa_string.chars().rev().take_while(|&c| c == '-').count();
1203                let middle = &msa_string[leading_dashes..msa_string.len() - trailing_dashes];
1204                format!(
1205                    "{}{}{}",
1206                    " ".repeat(leading_dashes),
1207                    middle,
1208                    " ".repeat(trailing_dashes)
1209                )
1210            })
1211            .collect::<Vec<String>>();
1212
1213        let mut combined_seq = vec![];
1214        for column in 0..msa_strings[0].len() {
1215            let mut pileup = vec![];
1216            for msa_string in msa_strings.iter().skip(1).chain(msa_strings.iter().take(1)) {
1217                let char_at_column = msa_string.chars().nth(column).unwrap();
1218
1219                if char_at_column != ' ' {
1220                    pileup.push(char_at_column);
1221                }
1222            }
1223
1224            combined_seq.push(pileup[0] as u8);
1225        }
1226
1227        combined_seq.retain(|&x| x != b'-');
1228
1229        combined_seq
1230    }
1231
1232    fn remove_short_branches(&self, graph: &mut petgraph::Graph<String, f64>, min_size: usize) {
1233        // Find nodes with more than one outgoing edge and prune short branches
1234        let mut nodes_to_remove = HashSet::new();
1235        for node in graph.node_indices() {
1236            let out_degree = graph.edges_directed(node, petgraph::Outgoing).count();
1237            if out_degree > 1 {
1238                for edge in graph.edges_directed(node, petgraph::Outgoing) {
1239                    let mut branch = vec![edge.target()];
1240                    let mut current = edge.target();
1241                    let mut branch_length = 0;
1242
1243                    while branch_length < 2 * self.kmer_size {
1244                        let out_edges: Vec<_> =
1245                            graph.edges_directed(current, petgraph::Outgoing).collect();
1246                        if out_edges.len() != 1 {
1247                            break;
1248                        }
1249                        current = out_edges[0].target();
1250                        branch.push(current);
1251                        branch_length += 1;
1252                    }
1253
1254                    if branch_length < min_size {
1255                        nodes_to_remove.extend(branch);
1256                    }
1257                }
1258            }
1259        }
1260
1261        // Remove the identified nodes from the graph
1262        for node in nodes_to_remove {
1263            graph.remove_node(node);
1264        }
1265    }
1266
1267    fn try_simple_paths(&self, from_kmer: &[u8], to_kmer: &[u8]) -> Result<(Vec<u8>, bool)> {
1268        let mut forward_contig = from_kmer.to_vec();
1269        let mut backward_contig = to_kmer.to_vec();
1270
1271        if let Ok(_) = self.assemble_forward_until(
1272            &mut forward_contig,
1273            &from_kmer,
1274            &HashSet::from([to_kmer.to_vec()]),
1275            100,
1276        ) {
1277            return Ok((forward_contig, true));
1278        } else if let Ok(_) = self.assemble_backward_until(
1279            &mut backward_contig,
1280            &to_kmer,
1281            &HashSet::from([from_kmer.to_vec()]),
1282            100,
1283        ) {
1284            return Ok((backward_contig, false));
1285        }
1286
1287        Err(anyhow::anyhow!("No simple path found"))
1288    }
1289
1290    fn try_alternate_paths(
1291        &self,
1292        graph: &petgraph::Graph<String, f64>,
1293        from: NodeIndex,
1294        to: NodeIndex,
1295        window: u8,
1296    ) -> Result<(Vec<u8>, bool)> {
1297        for scan_left in 0..=window {
1298            for scan_right in 0..=window {
1299                if let (Some(new_from), Some(new_to)) = (
1300                    navigate_backward(from, scan_left, graph),
1301                    navigate_forward(to, scan_right, graph),
1302                ) {
1303                    let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1304                    let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1305
1306                    let mut forward_contig = new_from_kmer.to_vec();
1307                    let mut backward_contig = new_to_kmer.to_vec();
1308
1309                    if let Ok(_) = self.assemble_forward_until(
1310                        &mut forward_contig,
1311                        &new_from_kmer,
1312                        &HashSet::from([new_to_kmer.to_vec()]),
1313                        100,
1314                    ) {
1315                        return Ok((forward_contig, true));
1316                    } else if let Ok(_) = self.assemble_backward_until(
1317                        &mut backward_contig,
1318                        &new_to_kmer,
1319                        &HashSet::from([new_from_kmer.to_vec()]),
1320                        100,
1321                    ) {
1322                        return Ok((backward_contig, false));
1323                    }
1324                }
1325            }
1326        }
1327
1328        return Err(anyhow::anyhow!("No alternate path found"));
1329    }
1330
1331    fn try_overlapping_paths(
1332        &self,
1333        graph: &petgraph::Graph<String, f64>,
1334        from: NodeIndex,
1335        to: NodeIndex,
1336        window: u8,
1337    ) -> Result<(Vec<u8>, bool)> {
1338        for scan_left in 0..=window {
1339            for scan_right in 0..=window {
1340                if let (Some(new_from), Some(new_to)) = (
1341                    navigate_backward(from, scan_left, graph),
1342                    navigate_forward(to, scan_right, graph),
1343                ) {
1344                    let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1345                    let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1346
1347                    let mut forward_contig = new_from_kmer.to_vec();
1348                    let mut backward_contig = new_to_kmer.to_vec();
1349
1350                    let seqs = vec![forward_contig.clone(), backward_contig.clone()];
1351                    let l = LdBG::from_sequences("contig".to_string(), self.kmer_size, &seqs)
1352                        .build_links(&seqs, false);
1353
1354                    if let Ok(_) = l.assemble_forward_until(
1355                        &mut forward_contig,
1356                        &new_from_kmer,
1357                        &HashSet::from([new_to_kmer.to_vec()]),
1358                        100,
1359                    ) {
1360                        return Ok((forward_contig, true));
1361                    } else if let Ok(_) = l.assemble_backward_until(
1362                        &mut backward_contig,
1363                        &new_to_kmer,
1364                        &HashSet::from([new_from_kmer.to_vec()]),
1365                        100,
1366                    ) {
1367                        return Ok((backward_contig, false));
1368                    }
1369                }
1370            }
1371        }
1372
1373        return Err(anyhow::anyhow!("No overlapping path found"));
1374    }
1375
1376    fn try_exploratory_paths(
1377        &self,
1378        graph: &petgraph::Graph<String, f64>,
1379        from: NodeIndex,
1380        to: NodeIndex,
1381        seq: &[u8],
1382        window: u8,
1383        max_intermediate_nodes: usize,
1384    ) -> Result<(Vec<u8>, bool)> {
1385        for scan_left in 0..=window {
1386            for scan_right in 0..=window {
1387                if let (Some(new_from), Some(new_to)) = (
1388                    navigate_backward(from, scan_left, graph),
1389                    navigate_forward(to, scan_right, graph),
1390                ) {
1391                    let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1392                    let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1393
1394                    if let Some(substring) =
1395                        self.find_read_substring(seq, new_from_kmer, new_to_kmer)
1396                    {
1397                        let mut subgraph = DiGraph::new();
1398                        let mut visited = HashMap::<String, NodeIndex>::new();
1399
1400                        let start_label = String::from_utf8_lossy(&new_from_kmer).to_string();
1401                        let start_node = subgraph.add_node(start_label.clone());
1402                        visited.insert(start_label.clone(), start_node);
1403
1404                        if let Ok(_) = self.traverse_forward_until(
1405                            &mut subgraph,
1406                            &mut visited,
1407                            start_node,
1408                            new_to_kmer,
1409                        ) {
1410                            let start_label = String::from_utf8_lossy(&new_from_kmer).to_string();
1411                            let end_label = String::from_utf8_lossy(&new_to_kmer).to_string();
1412
1413                            // Find the node indices for the start and end kmers
1414                            let start_node = subgraph
1415                                .node_indices()
1416                                .find(|&n| subgraph[n] == start_label);
1417                            let end_node =
1418                                subgraph.node_indices().find(|&n| subgraph[n] == end_label);
1419
1420                            if let (Some(start), Some(end)) = (start_node, end_node) {
1421                                let mut best_contig = String::new();
1422                                let mut min_ldist = u32::MAX;
1423
1424                                // List all paths from start_node to end_node
1425                                for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
1426                                    &subgraph,
1427                                    start,
1428                                    end,
1429                                    0,
1430                                    Some(max_intermediate_nodes),
1431                                ) {
1432                                    // Convert the path to labels
1433                                    let path_labels: Vec<&String> =
1434                                        path.iter().map(|&n| &subgraph[n]).collect();
1435
1436                                    // Create the full contig from the path_labels
1437                                    let mut new_contig = String::new();
1438                                    for kmer in path_labels.iter() {
1439                                        if new_contig.is_empty() {
1440                                            new_contig.push_str(kmer);
1441                                        } else {
1442                                            new_contig.push_str(
1443                                                &kmer.chars().last().unwrap().to_string(),
1444                                            );
1445                                        }
1446                                    }
1447
1448                                    let ldist =
1449                                        levenshtein(&new_contig.as_bytes().to_vec(), &substring);
1450                                    if ldist < min_ldist {
1451                                        min_ldist = ldist;
1452                                        best_contig = new_contig;
1453                                    }
1454                                }
1455
1456                                if min_ldist < (substring.len() / 10) as u32 {
1457                                    return Ok((best_contig.as_bytes().to_vec(), true));
1458                                }
1459                            }
1460                        }
1461                    }
1462                }
1463            }
1464        }
1465
1466        return Err(anyhow::anyhow!("No alternate path found"));
1467    }
1468
1469    fn try_connecting_all_segments(&self, segments: &[Vec<u8>]) -> Result<Vec<(Vec<u8>, bool)>> {
1470        let mut connecting_segments = Vec::new();
1471
1472        for segment1 in segments.iter() {
1473            for segment2 in segments.iter() {
1474                if segment1 == segment2 {
1475                    continue;
1476                }
1477
1478                if let Ok(result) = self.try_connecting_segments(segment1, segment2) {
1479                    connecting_segments.push(result);
1480                }
1481            }
1482        }
1483
1484        if connecting_segments.len() == 0 {
1485            return Err(anyhow::anyhow!("No connecting path found"));
1486        }
1487
1488        Ok(connecting_segments)
1489    }
1490
1491    fn try_connecting_segments(&self, segment1: &[u8], segment2: &[u8]) -> Result<(Vec<u8>, bool)> {
1492        let start_kmers = segment1
1493            .windows(self.kmer_size)
1494            .skip(segment1.len().saturating_sub(2 * self.kmer_size))
1495            .map(|kmer| kmer.to_vec())
1496            .collect::<HashSet<Vec<u8>>>();
1497        let end_kmers = segment2
1498            .windows(self.kmer_size)
1499            .take(10)
1500            .map(|kmer| kmer.to_vec())
1501            .collect::<HashSet<Vec<u8>>>();
1502
1503        for start_kmer in start_kmers.iter() {
1504            let mut forward_contig = start_kmer.clone();
1505            if let Ok(_) =
1506                self.assemble_forward_until(&mut forward_contig, &start_kmer, &end_kmers, 100)
1507            {
1508                return Ok((forward_contig, true));
1509            }
1510        }
1511
1512        for end_kmer in end_kmers.iter() {
1513            let mut backward_contig = end_kmer.clone();
1514            if let Ok(_) =
1515                self.assemble_backward_until(&mut backward_contig, &end_kmer, &start_kmers, 100)
1516            {
1517                return Ok((backward_contig, false));
1518            }
1519        }
1520
1521        return Err(anyhow::anyhow!("No connecting path found"));
1522    }
1523
1524    fn find_read_substring(&self, seq: &[u8], from_kmer: &[u8], to_kmer: &[u8]) -> Option<Vec<u8>> {
1525        seq.windows(from_kmer.len())
1526            .position(|window| window == from_kmer)
1527            .and_then(|start_pos| {
1528                seq[start_pos + from_kmer.len()..]
1529                    .windows(to_kmer.len())
1530                    .position(|window| window == to_kmer)
1531                    .map(|end_pos| {
1532                        seq[start_pos..start_pos + from_kmer.len() + end_pos + to_kmer.len()]
1533                            .to_vec()
1534                    })
1535            })
1536    }
1537
1538    fn add_to_read_graph(
1539        &self,
1540        contig: Vec<u8>,
1541        graph: &mut petgraph::Graph<String, f64>,
1542        forward: bool,
1543    ) {
1544        if forward {
1545            self.add_to_read_graph_forward(&contig, graph);
1546        } else {
1547            self.add_to_read_graph_backward(&contig, graph);
1548        }
1549    }
1550
1551    fn add_to_read_graph_backward(
1552        &self,
1553        backward_contig: &[u8],
1554        graph: &mut petgraph::Graph<String, f64>,
1555    ) {
1556        // If backward assembly succeeded, add path to graph
1557        for i in (0..backward_contig.len() - self.kmer_size).rev() {
1558            let current_kmer = &backward_contig[i..i + self.kmer_size];
1559            let prev_kmer = &backward_contig[i + 1..i + 1 + self.kmer_size];
1560
1561            let prev_node = graph
1562                .node_indices()
1563                .find(|&n| *graph[n] == String::from_utf8_lossy(current_kmer).to_string())
1564                .unwrap_or_else(|| {
1565                    graph.add_node(String::from_utf8_lossy(current_kmer).to_string())
1566                });
1567            let current_node = graph
1568                .node_indices()
1569                .find(|&n| *graph[n] == String::from_utf8_lossy(prev_kmer).to_string())
1570                .unwrap_or_else(|| graph.add_node(String::from_utf8_lossy(prev_kmer).to_string()));
1571
1572            if !graph.contains_edge(prev_node, current_node) {
1573                graph.add_edge(prev_node, current_node, 1.0);
1574            }
1575        }
1576    }
1577
1578    fn add_to_read_graph_forward(
1579        &self,
1580        forward_contig: &[u8],
1581        graph: &mut petgraph::Graph<String, f64>,
1582    ) {
1583        // If forward assembly succeeded, add path to graph
1584        for i in 0..forward_contig.len() - self.kmer_size {
1585            let current_kmer = &forward_contig[i..i + self.kmer_size];
1586            let next_kmer = &forward_contig[i + 1..i + 1 + self.kmer_size];
1587
1588            let current_node = graph
1589                .node_indices()
1590                .find(|&n| *graph[n] == String::from_utf8_lossy(current_kmer).to_string())
1591                .unwrap_or_else(|| {
1592                    graph.add_node(String::from_utf8_lossy(current_kmer).to_string())
1593                });
1594            let next_node = graph
1595                .node_indices()
1596                .find(|&n| *graph[n] == String::from_utf8_lossy(next_kmer).to_string())
1597                .unwrap_or_else(|| graph.add_node(String::from_utf8_lossy(next_kmer).to_string()));
1598
1599            if !graph.contains_edge(current_node, next_node) {
1600                graph.add_edge(current_node, next_node, 1.0);
1601            }
1602        }
1603    }
1604
1605    fn initialize_read_graph(&self, seq: &[u8]) -> (petgraph::Graph<String, f64>, Vec<NodeIndex>) {
1606        let mut graph = DiGraph::<String, f64>::new();
1607
1608        let mut node_indices = Vec::new();
1609        for kmer in seq.windows(self.kmer_size) {
1610            if self
1611                .kmers
1612                .contains_key(&crate::utils::canonicalize_kmer(kmer))
1613            {
1614                let node_index = graph.add_node(String::from_utf8_lossy(kmer).to_string());
1615                node_indices.push(node_index);
1616            }
1617        }
1618        (graph, node_indices)
1619    }
1620
1621    /// Assemble a contig in the forward direction.
1622    ///
1623    /// # Arguments
1624    ///
1625    /// * `contig` - A mutable reference to the contig being assembled.
1626    /// * `start_kmer` - A vector representing the starting k-mer.
1627    fn assemble_forward(&self, contig: &mut Vec<u8>, start_kmer: &[u8]) {
1628        let mut links_in_scope: Vec<Link> = Vec::new();
1629        let mut used_links = HashSet::new();
1630        let mut last_kmer = start_kmer.to_owned();
1631        let mut visited = HashSet::new();
1632
1633        loop {
1634            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1635
1636            match self.next_kmer(&last_kmer, &mut links_in_scope) {
1637                Some(this_kmer) => {
1638                    if links_in_scope.len() == 0 {
1639                        if visited.contains(&this_kmer) {
1640                            break;
1641                        }
1642
1643                        visited.insert(this_kmer.clone());
1644                    }
1645
1646                    contig.push(this_kmer[this_kmer.len() - 1]);
1647                    last_kmer = this_kmer;
1648                }
1649                None => {
1650                    break;
1651                }
1652            }
1653        }
1654    }
1655
1656    /// Assemble a contig in the forward direction until a stopping k-mer is reached or the limit is hit.
1657    ///
1658    /// # Arguments
1659    ///
1660    /// * `contig` - A mutable reference to the contig being assembled.
1661    /// * `start_kmer` - A vector representing the starting k-mer.
1662    /// * `stop_kmers` - A `HashSet` of k-mers to stop at.
1663    /// * `limit` - The maximum number of nodes to traverse before giving up.
1664    fn assemble_forward_until(
1665        &self,
1666        contig: &mut Vec<u8>,
1667        start_kmer: &[u8],
1668        stop_kmers: &HashSet<Vec<u8>>,
1669        limit: usize,
1670    ) -> Result<Vec<u8>> {
1671        let mut links_in_scope: Vec<Link> = Vec::new();
1672        let mut used_links = HashSet::new();
1673        let mut last_kmer = start_kmer.to_owned();
1674        let mut nodes_traversed = 0;
1675
1676        loop {
1677            if stop_kmers.contains(&last_kmer) {
1678                return Ok(last_kmer);
1679            }
1680
1681            if nodes_traversed >= limit {
1682                return Err(anyhow::anyhow!(
1683                    "Reached limit without finding a stopping k-mer"
1684                ));
1685            }
1686
1687            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1688
1689            match self.next_kmer(&last_kmer, &mut links_in_scope) {
1690                Some(this_kmer) => {
1691                    contig.push(this_kmer[this_kmer.len() - 1]);
1692                    last_kmer = this_kmer;
1693                    nodes_traversed += 1;
1694                }
1695                None => {
1696                    break;
1697                }
1698            }
1699        }
1700
1701        Err(anyhow::anyhow!("No stopping k-mer found"))
1702    }
1703
1704    fn assemble_forward_until_condition<F>(
1705        &self,
1706        contig: &mut Vec<u8>,
1707        start_kmer: &[u8],
1708        limit: usize,
1709        stopping_condition: F,
1710    ) -> Result<Vec<u8>>
1711    where
1712        F: Fn(&[u8], usize, &Self) -> bool,
1713    {
1714        let initial_contig_length = contig.len();
1715
1716        let mut links_in_scope: Vec<Link> = Vec::new();
1717        let mut used_links = HashSet::new();
1718        let mut last_kmer = start_kmer.to_owned();
1719
1720        loop {
1721            if stopping_condition(&last_kmer, contig.len() - initial_contig_length, self) {
1722                return Ok(last_kmer);
1723            }
1724
1725            if contig.len() - initial_contig_length >= limit {
1726                return Err(anyhow::anyhow!(
1727                    "Reached limit without finding a stopping k-mer"
1728                ));
1729            }
1730
1731            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1732
1733            match self.next_kmer(&last_kmer, &mut links_in_scope) {
1734                Some(this_kmer) => {
1735                    contig.push(this_kmer[this_kmer.len() - 1]);
1736                    last_kmer = this_kmer;
1737                }
1738                None => {
1739                    break;
1740                }
1741            }
1742        }
1743
1744        Err(anyhow::anyhow!("Stopping condition not satisfied"))
1745    }
1746
1747    fn assemble_forward_limit(
1748        &self,
1749        contig: &mut Vec<u8>,
1750        start_kmer: &[u8],
1751        limit: usize,
1752    ) -> Result<Vec<u8>> {
1753        let mut links_in_scope: Vec<Link> = Vec::new();
1754        let mut used_links = HashSet::new();
1755        let mut last_kmer = start_kmer.to_owned();
1756        loop {
1757            if contig.len() >= limit {
1758                return Ok(last_kmer);
1759            }
1760
1761            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1762
1763            match self.next_kmer(&last_kmer, &mut links_in_scope) {
1764                Some(this_kmer) => {
1765                    contig.push(this_kmer[this_kmer.len() - 1]);
1766                    last_kmer = this_kmer;
1767                }
1768                None => {
1769                    break;
1770                }
1771            }
1772        }
1773
1774        Err(anyhow::anyhow!("No stopping k-mer found"))
1775    }
1776
1777    /// Assemble a contig in the backward direction.
1778    ///
1779    /// # Arguments
1780    ///
1781    /// * `contig` - A mutable reference to the contig being assembled.
1782    /// * `start_kmer` - A vector representing the starting k-mer.
1783    fn assemble_backward(&self, contig: &mut Vec<u8>, start_kmer: &[u8]) {
1784        let mut links_in_scope: Vec<Link> = Vec::new();
1785        let mut used_links = HashSet::new();
1786        let mut last_kmer = start_kmer.to_owned();
1787        let mut visited = HashSet::new();
1788
1789        loop {
1790            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1791
1792            match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1793                Some(this_kmer) => {
1794                    if links_in_scope.len() == 0 {
1795                        if visited.contains(&this_kmer) {
1796                            break;
1797                        }
1798
1799                        visited.insert(this_kmer.clone());
1800                    }
1801
1802                    contig.insert(0, this_kmer[0]);
1803                    last_kmer = this_kmer;
1804                }
1805                None => {
1806                    break;
1807                }
1808            }
1809        }
1810    }
1811
1812    /// Assemble a contig in the backward direction until a stopping k-mer is reached or the limit is hit.
1813    ///
1814    /// # Arguments
1815    ///
1816    /// * `contig` - A mutable reference to the contig being assembled.
1817    /// * `start_kmer` - A vector representing the starting k-mer.
1818    /// * `stop_kmers` - A `HashSet` of vectors representing the stopping k-mers.
1819    /// * `limit` - The maximum number of nodes to traverse before giving up.
1820    fn assemble_backward_until(
1821        &self,
1822        contig: &mut Vec<u8>,
1823        start_kmer: &[u8],
1824        stop_kmers: &HashSet<Vec<u8>>,
1825        limit: usize,
1826    ) -> Result<Vec<u8>> {
1827        let mut links_in_scope: Vec<Link> = Vec::new();
1828        let mut used_links = HashSet::new();
1829        let mut last_kmer = start_kmer.to_owned();
1830        let mut nodes_traversed = 0;
1831
1832        loop {
1833            if stop_kmers.contains(&last_kmer) {
1834                return Ok(last_kmer);
1835            }
1836
1837            if nodes_traversed >= limit {
1838                return Err(anyhow::anyhow!(
1839                    "Limit reached before finding a stopping k-mer"
1840                ));
1841            }
1842
1843            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1844
1845            match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1846                Some(this_kmer) => {
1847                    contig.insert(0, this_kmer[0]);
1848                    last_kmer = this_kmer;
1849                    nodes_traversed += 1;
1850                }
1851                None => {
1852                    break;
1853                }
1854            }
1855        }
1856
1857        Err(anyhow::anyhow!("No stopping k-mer found"))
1858    }
1859
1860    fn assemble_backward_until_condition<F>(
1861        &self,
1862        contig: &mut Vec<u8>,
1863        start_kmer: &[u8],
1864        limit: usize,
1865        stopping_condition: F,
1866    ) -> Result<Vec<u8>>
1867    where
1868        F: Fn(&[u8], usize, &Self) -> bool,
1869    {
1870        let initial_contig_length = contig.len();
1871
1872        let mut links_in_scope: Vec<Link> = Vec::new();
1873        let mut used_links = HashSet::new();
1874        let mut last_kmer = start_kmer.to_owned();
1875
1876        loop {
1877            if stopping_condition(&last_kmer, contig.len() - initial_contig_length, self) {
1878                return Ok(last_kmer);
1879            }
1880
1881            if contig.len() - initial_contig_length >= limit {
1882                return Err(anyhow::anyhow!(
1883                    "Reached limit without finding a stopping k-mer"
1884                ));
1885            }
1886
1887            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1888
1889            match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1890                Some(this_kmer) => {
1891                    contig.insert(0, this_kmer[0]);
1892                    last_kmer = this_kmer;
1893                }
1894                None => {
1895                    break;
1896                }
1897            }
1898        }
1899
1900        Err(anyhow::anyhow!("Stopping condition not satisfied"))
1901    }
1902
1903    fn assemble_backward_limit(
1904        &self,
1905        contig: &mut Vec<u8>,
1906        start_kmer: &[u8],
1907        limit: usize,
1908    ) -> Result<Vec<u8>> {
1909        let mut links_in_scope: Vec<Link> = Vec::new();
1910        let mut used_links = HashSet::new();
1911        let mut last_kmer = start_kmer.to_owned();
1912        loop {
1913            if contig.len() >= limit {
1914                return Ok(last_kmer);
1915            }
1916
1917            self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1918
1919            match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1920                Some(this_kmer) => {
1921                    contig.insert(0, this_kmer[0]);
1922                    last_kmer = this_kmer;
1923                }
1924                None => {
1925                    break;
1926                }
1927            }
1928        }
1929
1930        Err(anyhow::anyhow!("No stopping k-mer found"))
1931    }
1932
1933    /// Starting at a given k-mer, assemble a contig.
1934    ///
1935    /// # Arguments
1936    ///
1937    /// * `kmer` - A slice representing the starting k-mer.
1938    ///
1939    /// # Returns
1940    ///
1941    /// A vector containing the assembled contig.
1942    ///
1943    /// # Panics
1944    ///
1945    /// - This function will panic if the k-mer length does not match the expected length.
1946    #[must_use]
1947    pub fn assemble(&self, kmer: &[u8]) -> Vec<u8> {
1948        let mut contig: Vec<u8> = kmer.to_vec();
1949
1950        assert!(
1951            kmer.len() == self.kmer_size,
1952            "kmer length {} does not match expected length {}",
1953            kmer.len(),
1954            self.kmer_size
1955        );
1956
1957        self.assemble_forward(&mut contig, kmer);
1958        self.assemble_backward(&mut contig, kmer);
1959
1960        contig
1961    }
1962
1963    /// Assemble all contigs from the linked de Bruijn graph.
1964    ///
1965    /// # Returns
1966    ///
1967    /// A vector of contigs.
1968    ///
1969    /// # Panics
1970    ///
1971    /// - This function will panic if the node weight for a unique node cannot be retrieved.
1972    /// - If `self.kmers.get(cn_kmer).unwrap()` returns None.
1973    #[must_use]
1974    pub fn assemble_all(&self) -> Vec<Vec<u8>> {
1975        let mut contigs = Vec::new();
1976
1977        let mut used_kmers = HashSet::new();
1978        let k = self.kmer_size;
1979
1980        let progress_bar = if self.verbose {
1981            crate::utils::default_bounded_progress_bar(
1982                "Assembling contigs",
1983                self.kmers.len() as u64,
1984            )
1985        } else {
1986            crate::utils::default_hidden_progress_bar()
1987        };
1988
1989        for cn_kmer in self.kmers.keys().progress_with(progress_bar) {
1990            if !used_kmers.contains(cn_kmer) {
1991                let r = self.kmers.get(cn_kmer).unwrap();
1992
1993                if r.in_degree() == 1 && r.out_degree() == 1 {
1994                    let contig = self.assemble(cn_kmer);
1995                    for kmer_in_contig in contig.windows(k) {
1996                        used_kmers.insert(crate::utils::canonicalize_kmer(kmer_in_contig));
1997                    }
1998                    contigs.push(contig);
1999                } else if r.in_degree() == 0 && r.out_degree() == 0 {
2000                    contigs.push(cn_kmer.clone());
2001                }
2002
2003                used_kmers.insert(cn_kmer.clone());
2004            }
2005        }
2006
2007        contigs
2008    }
2009
2010    /// Assemble contigs at superbubbles in the graph.
2011    ///
2012    /// This function traverses all k-mers in the graph, identifies superbubbles,
2013    /// and assembles contigs from unique nodes within these superbubbles.
2014    ///
2015    /// # Returns
2016    ///
2017    /// A vector of contigs, where each contig is represented as a vector of bytes.
2018    ///
2019    /// # Panics
2020    ///
2021    /// - This function will panic if the node weight for a unique node cannot be retrieved.
2022    /// - if `g.node_weight(*unique_node).unwrap().as_bytes()` returns None.
2023    #[must_use]
2024    pub fn assemble_at_bubbles(&self) -> Vec<Vec<u8>> {
2025        let g = self.traverse_all_kmers();
2026        let bubbles = find_all_superbubbles(&g);
2027
2028        let mut all_contigs = Vec::new();
2029        let mut visited = HashSet::new();
2030
2031        for ((in_node, out_node), interior) in bubbles {
2032            let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2033                &g,
2034                in_node,
2035                out_node,
2036                0,
2037                Some(interior.len()),
2038            );
2039            let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2040                &g,
2041                out_node,
2042                in_node,
2043                0,
2044                Some(interior.len()),
2045            );
2046
2047            let paths: Vec<Vec<NodeIndex>> = paths_fwd.chain(paths_rev).collect();
2048
2049            let mut unique_nodes = Vec::new();
2050
2051            for (i, path) in paths.iter().enumerate() {
2052                let other_paths: HashSet<_> = paths
2053                    .iter()
2054                    .enumerate()
2055                    .filter(|&(j, _)| j != i)
2056                    .flat_map(|(_, p)| p)
2057                    .collect();
2058
2059                if let Some(unique_node) = path.iter().find(|&node| !other_paths.contains(node)) {
2060                    let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
2061                    let cn_kmer = crate::utils::canonicalize_kmer(&kmer);
2062
2063                    if !visited.contains(&cn_kmer) {
2064                        unique_nodes.push(*unique_node);
2065                    }
2066                }
2067            }
2068
2069            let contigs = unique_nodes
2070                .iter()
2071                .map(|unique_node| {
2072                    let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
2073                    self.assemble(&kmer)
2074                })
2075                .collect::<Vec<_>>();
2076
2077            all_contigs.extend(contigs.clone());
2078
2079            for contig in contigs {
2080                for kmer in contig.windows(self.kmer_size) {
2081                    visited.insert(crate::utils::canonicalize_kmer(kmer));
2082                }
2083            }
2084        }
2085
2086        all_contigs
2087    }
2088
2089    /*
2090    pub fn cluster_reads(&self, lr_seqs: &[Vec<u8>], sr_seqs: &[Vec<u8>]) {
2091        let g = self.traverse_all_kmers();
2092        let bubbles = find_all_superbubbles(&g);
2093
2094        let mut reads = vec![vec![None; bubbles.len()]; lr_seqs.len()];
2095        let mut confidences = vec![vec![None; bubbles.len()]; lr_seqs.len()];
2096
2097        let cn_kmers: HashMap<Vec<u8>, Vec<usize>> = lr_seqs
2098            .iter()
2099            .enumerate()
2100            .flat_map(|(read_index, seq)| {
2101                seq.windows(self.kmer_size)
2102                    .map(move |kmer| (crate::utils::canonicalize_kmer(kmer), read_index))
2103            })
2104            .fold(HashMap::new(), |mut acc, (kmer, read_idx)| {
2105                acc.entry(kmer).or_default().push(read_idx);
2106                acc
2107            });
2108
2109        for (bubble_index, ((in_node, out_node), interior)) in bubbles.iter().enumerate() {
2110            let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(&g, *in_node, *out_node, 0, Some(interior.len()));
2111            let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(&g, *out_node, *in_node, 0, Some(interior.len()));
2112
2113            let mut path_info = paths_fwd.chain(paths_rev).map(|path| {
2114                let path_kmers = path
2115                    .iter()
2116                    .map(|node| g.node_weight(*node).unwrap().as_bytes().to_vec())
2117                    .map(|kmer| crate::utils::canonicalize_kmer(&kmer))
2118                    .collect::<Vec<Vec<u8>>>();
2119
2120                let mut read_index_counts = HashMap::new();
2121
2122                let mut path_score = 1.0;
2123                for kmer in &path_kmers {
2124                    if let Some(read_indices) = cn_kmers.get(kmer) {
2125                        for &read_index in read_indices {
2126                            *read_index_counts.entry(read_index).or_insert(0) += 1;
2127                        }
2128                    }
2129
2130                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2131                    path_score *= self.scores.get(&cn_kmer).unwrap_or(&1.0);
2132                }
2133
2134                path_score = path_score.powf(1.0 / path_kmers.len() as f32);
2135                if path_score.is_nan() {
2136                    path_score = 0.0;
2137                }
2138
2139                let path_length = path_kmers.len();
2140                let read_index_counts_filtered: BTreeMap<usize, i32> = read_index_counts.iter()
2141                    .filter(|(_, &count)| count as f32 > 0.8 * path_length as f32)
2142                    .map(|(&index, &count)| (index, count))
2143                    .collect();
2144
2145                let read_indices = read_index_counts_filtered.keys().cloned().collect::<Vec<_>>();
2146
2147                (path_kmers, read_indices, path_score)
2148            })
2149            .collect::<Vec<_>>();
2150
2151            path_info.sort_by(|(_, _, score_a), (_, _, score_b)| score_b.partial_cmp(score_a).unwrap());
2152
2153            for (path_index, (path_kmers, read_indices, path_score)) in path_info.iter().take(2).enumerate() {
2154                let mut allele = String::new();
2155                for path_kmer in path_kmers {
2156                    let kmer = std::str::from_utf8(path_kmer).unwrap();
2157                    if allele.is_empty() {
2158                        allele.push_str(kmer);
2159                    } else {
2160                        allele.push_str(&kmer.chars().last().unwrap().to_string());
2161                    }
2162                }
2163
2164                let path_qual = (-10.0 * (1.0 - path_score).log10()) as u32;
2165
2166                // crate::elog!("[{} {}]: {:.3} {} {} {:?}", bubble_index, path_index, path_score, path_qual, allele, read_indices);
2167
2168                for read_index in read_indices {
2169                    reads[*read_index][bubble_index] = Some(path_index as u8);
2170                    confidences[*read_index][bubble_index] = Some(path_qual);
2171                }
2172            }
2173        }
2174
2175        let mat = WMECData::new(reads, confidences);
2176
2177        let (h1, h2) = crate::wmec::phase(&mat);
2178
2179        // mat.write_reads_matrix("reads.tsv").unwrap();
2180
2181        crate::elog!("Hap1: {:?}", h1);
2182        crate::elog!("Hap2: {:?}", h2);
2183    }
2184    */
2185
2186    /// Update links available to inform navigation during graph traversal.
2187    ///
2188    /// # Arguments
2189    ///
2190    /// * `links_in_scope` - A mutable reference to the links in scope.
2191    /// * `last_kmer` - A reference to the last k-mer.
2192    /// * `used_links` - A mutable reference to the set of used links.
2193    /// * `forward` - A boolean indicating whether to update forward links.
2194    fn update_links(
2195        &self,
2196        links_in_scope: &mut Vec<Link>,
2197        last_kmer: &Vec<u8>,
2198        used_links: &mut HashSet<Link>,
2199        forward: bool,
2200    ) {
2201        let cn_kmer_vec = crate::utils::canonicalize_kmer(last_kmer.as_bytes()).clone();
2202        let cn_kmer = cn_kmer_vec.as_bytes();
2203
2204        if self.links.contains_key(cn_kmer.as_bytes()) {
2205            let available_links = self.links.get(cn_kmer.as_bytes()).unwrap().clone();
2206
2207            let record_orientation_matches_kmer = last_kmer == cn_kmer;
2208
2209            for jv in available_links {
2210                let link_goes_forward = record_orientation_matches_kmer == jv.0.is_forward();
2211                let new_link = if link_goes_forward {
2212                    jv.0.clone()
2213                } else {
2214                    jv.0.complement()
2215                };
2216
2217                if forward == link_goes_forward && !used_links.contains(&new_link) {
2218                    used_links.insert(new_link.clone());
2219                    links_in_scope.push(new_link);
2220                }
2221            }
2222        }
2223    }
2224
2225    // fn print_kmer(kmer: &[u8], prev_base: u8, next_base: u8, record: Option<&Record>) {
2226    //     println!("{} {} {} {}",
2227    //         std::char::from_u32(prev_base as u32).unwrap_or('.'),
2228    //         std::str::from_utf8(kmer).unwrap(),
2229    //         std::char::from_u32(next_base as u32).unwrap_or('.'),
2230    //         record.map_or(String::from(""), |r| format!("{}", r))
2231    //     );
2232    // }
2233
2234    fn first_kmer(&self, contig: &[u8]) -> Option<Vec<u8>> {
2235        if contig.len() < self.kmer_size as usize {
2236            return None;
2237        }
2238
2239        Some(contig[0..self.kmer_size].to_vec())
2240    }
2241
2242    fn last_kmer(&self, contig: &[u8]) -> Option<Vec<u8>> {
2243        if contig.len() < self.kmer_size as usize {
2244            return None;
2245        }
2246
2247        Some(contig[contig.len() - self.kmer_size as usize..].to_vec())
2248    }
2249
2250    #[must_use]
2251    pub fn clean(self, threshold: f32) -> Self {
2252        let kmer_size = self.kmer_size;
2253
2254        self.clean_hairballs()
2255            .clean_tangles(0, 500, threshold)
2256            .clean_tangles(1, 500, threshold)
2257            .clean_tips(3 * kmer_size, threshold)
2258            .clean_branches(threshold)
2259            .clean_superbubbles(1, threshold)
2260            .clean_branches(threshold)
2261            .clean_contigs(100)
2262    }
2263
2264    /// Clean color-specific paths from the graph based on a minimum score threshold.
2265    ///
2266    /// This function removes paths that are specific to a given color and have a score below the specified threshold.
2267    ///
2268    /// # Arguments
2269    ///
2270    /// * `color` - The color index to filter paths by.
2271    /// * `min_score` - The minimum score threshold for paths to be retained.
2272    ///
2273    /// # Returns
2274    ///
2275    /// A new instance of the graph with the specified paths removed.
2276    ///
2277    /// # Panics
2278    ///
2279    /// This function will panic if the `assemble_forward` or `assemble_backward` methods fail to assemble a contig.
2280    #[must_use]
2281    pub fn clean_color_specific_paths(mut self, color: usize, min_score: f32) -> Self {
2282        let bad_cn_kmers = self
2283            .kmers
2284            .keys()
2285            .cloned()
2286            .filter(|cn_kmer| self.sources.get(cn_kmer).unwrap_or(&vec![]) == &vec![color])
2287            .filter(|cn_kmer| self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score)
2288            .filter(|cn_kmer| {
2289                self.kmers.get(cn_kmer).unwrap().in_degree() == 1
2290                    && self.kmers.get(cn_kmer).unwrap().out_degree() == 1
2291            })
2292            .collect::<Vec<Vec<u8>>>();
2293
2294        let mut to_remove = HashSet::new();
2295        let mut bad_paths: usize = 0;
2296
2297        for bad_cn_kmer in bad_cn_kmers {
2298            if !to_remove.contains(&bad_cn_kmer) {
2299                let mut seen_kmers = HashSet::new();
2300                let mut score_sum = 0.0;
2301
2302                let mut fw_contig = bad_cn_kmer.clone();
2303                self.assemble_forward(&mut fw_contig, &bad_cn_kmer);
2304
2305                for kmer in fw_contig.windows(self.kmer_size) {
2306                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2307
2308                    if let Some(r) = self.kmers.get(&cn_kmer) {
2309                        if r.in_degree() == 1 && r.out_degree() == 1 {
2310                            seen_kmers.insert(cn_kmer.clone());
2311                            score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2312                        } else {
2313                            break;
2314                        }
2315                    }
2316                }
2317
2318                let mut rc_contig = bad_cn_kmer.clone();
2319                self.assemble_backward(&mut rc_contig, &bad_cn_kmer);
2320
2321                for kmer in rc_contig.windows(self.kmer_size).rev().skip(1) {
2322                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2323
2324                    if let Some(r) = self.kmers.get(&cn_kmer) {
2325                        if r.in_degree() == 1 && r.out_degree() == 1 {
2326                            seen_kmers.insert(cn_kmer.clone());
2327                            score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2328                        } else {
2329                            break;
2330                        }
2331                    }
2332                }
2333
2334                let weight = score_sum / seen_kmers.len() as f32;
2335
2336                if weight < min_score {
2337                    to_remove.extend(seen_kmers);
2338                    bad_paths += 1;
2339                }
2340            }
2341        }
2342
2343        for cn_kmer in &to_remove {
2344            self.kmers.remove(cn_kmer);
2345            self.scores.remove(cn_kmer);
2346        }
2347
2348        // crate::elog!(" -- Removed {} bad paths specific to color {} ({} kmers)", bad_paths, color, to_remove.len());
2349
2350        self.infer_edges();
2351
2352        self
2353    }
2354
2355    /// This function removes tips from the graph. A tip is defined as a k-mer with an in-degree or out-degree of 0.
2356    ///
2357    /// # Arguments
2358    ///
2359    /// * `max_tip_length` - The maximum length of a tip to remove.
2360    /// * `min_score` - The minimum score threshold for k-mers to be considered part of a tip.
2361    ///
2362    /// # Returns
2363    ///
2364    /// The modified de Bruijn graph with tips removed.
2365    ///
2366    /// # Panics
2367    /// - This line could panic if `self.kmers.get(cn_kmer).unwrap().in_degree()` returns `None`, meaning the in-degree of the k-mer is not available.
2368    #[must_use]
2369    pub fn clean_branches(mut self, min_score: f32) -> Self {
2370        let bad_cn_kmers = self
2371            .kmers
2372            .keys()
2373            .cloned()
2374            .filter(|cn_kmer| self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score)
2375            .filter(|cn_kmer| {
2376                self.kmers.get(cn_kmer).unwrap().in_degree() == 1
2377                    && self.kmers.get(cn_kmer).unwrap().out_degree() == 1
2378            })
2379            .collect::<Vec<Vec<u8>>>();
2380
2381        let mut to_remove = HashSet::new();
2382        let mut bad_paths: usize = 0;
2383
2384        for bad_cn_kmer in bad_cn_kmers {
2385            if !to_remove.contains(&bad_cn_kmer) {
2386                let mut seen_kmers = HashSet::new();
2387                // let mut score_sum = 0.0;
2388                let mut score_prod = 1.0;
2389
2390                let mut fw_contig = bad_cn_kmer.clone();
2391                self.assemble_forward(&mut fw_contig, &bad_cn_kmer);
2392
2393                for kmer in fw_contig.windows(self.kmer_size) {
2394                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2395
2396                    if let Some(r) = self.kmers.get(&cn_kmer) {
2397                        if r.in_degree() == 1 && r.out_degree() == 1 {
2398                            seen_kmers.insert(cn_kmer.clone());
2399                            // score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2400                            score_prod *= self.scores.get(&cn_kmer).unwrap_or(&1.0);
2401                        } else {
2402                            break;
2403                        }
2404                    }
2405                }
2406
2407                let mut rc_contig = bad_cn_kmer.clone();
2408                self.assemble_backward(&mut rc_contig, &bad_cn_kmer);
2409
2410                for kmer in rc_contig.windows(self.kmer_size).rev().skip(1) {
2411                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2412
2413                    if let Some(r) = self.kmers.get(&cn_kmer) {
2414                        if r.in_degree() == 1 && r.out_degree() == 1 {
2415                            seen_kmers.insert(cn_kmer.clone());
2416                            // score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2417                            score_prod *= self.scores.get(&cn_kmer).unwrap_or(&1.0);
2418                        } else {
2419                            break;
2420                        }
2421                    }
2422                }
2423
2424                // let weight = score_sum / seen_kmers.len() as f32;
2425                let weight = score_prod.powf(1.0 / seen_kmers.len() as f32);
2426
2427                if weight < min_score {
2428                    to_remove.extend(seen_kmers);
2429                    bad_paths += 1;
2430                }
2431            }
2432        }
2433
2434        for cn_kmer in &to_remove {
2435            self.kmers.remove(cn_kmer);
2436            self.scores.remove(cn_kmer);
2437        }
2438
2439        // crate::elog!(" -- Removed {} bad paths ({} kmers)", bad_paths, to_remove.len());
2440
2441        self.infer_edges();
2442
2443        self
2444    }
2445
2446    /// Clean tangles from the de Bruijn graph.
2447    ///
2448    /// This function identifies and removes tangles from the de Bruijn graph based on a specified color,
2449    /// traversal limit, and minimum score threshold. A tangle is defined as a region in the graph where
2450    /// the in-degree and out-degree of a k-mer sum to 4 or more, indicating a complex branching structure.
2451    ///
2452    /// # Arguments
2453    ///
2454    /// * `color` - The color to filter k-mers by.
2455    /// * `limit` - The maximum number of nodes to traverse before giving up.
2456    /// * `min_score` - The minimum score threshold for k-mers to be considered part of a tangle.
2457    ///
2458    /// # Returns
2459    ///
2460    /// The modified de Bruijn graph with tangles removed.
2461    ///
2462    /// # Panics
2463    ///   - This line could panic if `g.node_weight(node)` returns `None`, meaning the node does not have an associated weight.
2464    ///   - This line could panic if `crate::utils::canonicalize_kmer(current_kmer)` encounters an unexpected input that it cannot process.
2465    ///   - This line could panic if `crate::utils::canonicalize_kmer(kmer)` encounters an unexpected input that it cannot process.
2466    #[must_use]
2467    pub fn clean_tangles(mut self, color: usize, limit: usize, min_score: f32) -> Self {
2468        let mut to_remove = HashSet::new();
2469        let mut bad_tangles: usize = 0;
2470
2471        let mut visited = HashSet::new();
2472
2473        let stopping_condition = |kmer: &[u8], _: usize, g: &LdBG| {
2474            g.scores
2475                .get(&crate::utils::canonicalize_kmer(kmer))
2476                .unwrap_or(&1.0)
2477                < &min_score
2478        };
2479
2480        for cn_kmer in self.kmers.keys() {
2481            if !visited.contains(cn_kmer)
2482                && self.sources.get(cn_kmer).unwrap_or(&vec![]) == &vec![color]
2483                && self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score
2484                && self.kmers.get(cn_kmer).unwrap().in_degree()
2485                    + self.kmers.get(cn_kmer).unwrap().out_degree()
2486                    >= 4
2487            {
2488                let g =
2489                    self.traverse_kmers_until_condition(&cn_kmer, color, limit, stopping_condition);
2490
2491                visited.insert(cn_kmer.clone());
2492                for node in g.node_indices() {
2493                    let current_kmer = g.node_weight(node).unwrap().as_bytes();
2494                    let current_cn_kmer = crate::utils::canonicalize_kmer(current_kmer);
2495
2496                    to_remove.insert(current_cn_kmer.clone());
2497
2498                    // Mark k-mers as visited, but avoid marking k-mers that are new potential starting points for tangles.
2499                    if !(self.scores.get(&current_cn_kmer).unwrap_or(&1.0) < &min_score
2500                        && self.kmers.get(&current_cn_kmer).unwrap().in_degree()
2501                            + self.kmers.get(&current_cn_kmer).unwrap().out_degree()
2502                            >= 4)
2503                    {
2504                        visited.insert(current_cn_kmer.clone());
2505                    }
2506                }
2507
2508                bad_tangles += 1;
2509            }
2510        }
2511
2512        for cn_kmer in &to_remove {
2513            self.kmers.remove(cn_kmer);
2514            self.scores.remove(cn_kmer);
2515        }
2516
2517        // crate::elog!(" -- Removed {} tangles in color {} ({} kmers)", bad_tangles, color, to_remove.len());
2518
2519        self.infer_edges();
2520
2521        self
2522    }
2523
2524    #[must_use]
2525    pub fn clean_hairballs(mut self) -> Self {
2526        for cn_kmer in self.noise.iter() {
2527            self.kmers.remove(cn_kmer);
2528            self.scores.remove(cn_kmer);
2529        }
2530
2531        // crate::elog!(" -- Removed repetitive noise ({} kmers)", self.noise.len());
2532
2533        self.infer_edges();
2534
2535        self
2536    }
2537
2538    /// This method will remove tips that have a score below the specified minimum score.
2539    /// A tip is defined as a region of the graph where there is only one path from the source to the sink.
2540    ///
2541    /// # Arguments
2542    ///
2543    /// * `limit` - The maximum number of nodes to traverse before giving up.
2544    /// * `min_score` - The minimum score for a tip to be kept.
2545    ///
2546    /// # Returns
2547    ///
2548    /// A new `LdBG` with the specified tips removed.
2549    ///
2550    /// # Panics
2551    /// If `self.kmers.get(cn_kmer)` returns `None`, the call to `unwrap()` will cause a panic.
2552    #[must_use]
2553    pub fn clean_tips(mut self, limit: usize, min_score: f32) -> Self {
2554        let mut to_remove = HashSet::new();
2555        let mut bad_paths: usize = 0;
2556
2557        let stopping_condition = |kmer: &[u8], _: usize, g: &LdBG| {
2558            g.prev_kmers(kmer).len() > 1 || g.next_kmers(kmer).len() > 1
2559        };
2560
2561        for cn_kmer in self.kmers.keys() {
2562            if self.kmers.get(cn_kmer).unwrap().in_degree() == 0
2563                && self.kmers.get(cn_kmer).unwrap().out_degree() > 1
2564            {
2565                let mut fw_contig = cn_kmer.clone();
2566                if let Ok(_) = self.assemble_forward_until_condition(
2567                    &mut fw_contig,
2568                    &cn_kmer,
2569                    limit,
2570                    stopping_condition,
2571                ) {
2572                    let score_sum = fw_contig
2573                        .kmers(self.kmer_size as u8)
2574                        .map(|kmer| {
2575                            self.scores
2576                                .get(&crate::utils::canonicalize_kmer(&kmer))
2577                                .unwrap_or(&1.0)
2578                        })
2579                        .sum::<f32>();
2580                    let weight = score_sum / (fw_contig.len() - self.kmer_size + 1) as f32;
2581
2582                    if weight < min_score {
2583                        for kmer in fw_contig.kmers(self.kmer_size as u8) {
2584                            to_remove.insert(crate::utils::canonicalize_kmer(&kmer));
2585                        }
2586
2587                        bad_paths += 1;
2588                    }
2589                }
2590            }
2591
2592            if self.kmers.get(cn_kmer).unwrap().out_degree() == 0
2593                && self.kmers.get(cn_kmer).unwrap().in_degree() > 1
2594            {
2595                let mut rv_contig = cn_kmer.clone();
2596                if let Ok(_) = self.assemble_backward_until_condition(
2597                    &mut rv_contig,
2598                    &cn_kmer,
2599                    limit,
2600                    stopping_condition,
2601                ) {
2602                    let score_sum = rv_contig
2603                        .kmers(self.kmer_size as u8)
2604                        .map(|kmer| {
2605                            self.scores
2606                                .get(&crate::utils::canonicalize_kmer(&kmer))
2607                                .unwrap_or(&1.0)
2608                        })
2609                        .sum::<f32>();
2610                    let weight = score_sum / (rv_contig.len() - self.kmer_size + 1) as f32;
2611
2612                    if weight < min_score {
2613                        for kmer in rv_contig.kmers(self.kmer_size as u8) {
2614                            to_remove.insert(crate::utils::canonicalize_kmer(&kmer));
2615                        }
2616
2617                        bad_paths += 1;
2618                    }
2619                }
2620            }
2621        }
2622
2623        for cn_kmer in &to_remove {
2624            self.kmers.remove(cn_kmer);
2625            self.scores.remove(cn_kmer);
2626        }
2627
2628        // crate::elog!(" -- Removed {} bad tips ({} kmers)", bad_paths, to_remove.len());
2629
2630        self.infer_edges();
2631
2632        self
2633    }
2634
2635    /// This method will remove bubbles that have a score below the specified minimum score.
2636    /// A bubble is defined as a region of the graph where there are two paths from the same
2637    /// source to the same sink.
2638    ///
2639    /// # Arguments
2640    ///
2641    /// * `min_score` - The minimum score for a bubble to be kept.
2642    ///
2643    /// # Returns
2644    ///
2645    /// A new `LdBG` with the specified bubbles removed.
2646    ///
2647    /// # Panics
2648    ///
2649    ///  - When calling `unwrap` on the result of `g.node_weight(*node)`. If the node does not exist in the graph, this will cause a panic.
2650    ///  - When calling `unwrap_or` on the result of `self.scores.get(&cn_kmer)`. If the canonical k-mer is not found in the scores, it will return the default value `1.0` instead of panicking.
2651    #[must_use]
2652    pub fn clean_superbubbles(mut self, color: usize, min_score: f32) -> Self {
2653        let mut to_remove = HashSet::new();
2654        let mut num_bad_paths: usize = 0;
2655
2656        let g = self.traverse_all_kmers();
2657        let bubbles = find_all_superbubbles(&g);
2658
2659        for ((in_node, out_node), interior) in bubbles.iter() {
2660            let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2661                &g,
2662                *in_node,
2663                *out_node,
2664                0,
2665                Some(interior.len()),
2666            );
2667            let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2668                &g,
2669                *out_node,
2670                *in_node,
2671                0,
2672                Some(interior.len()),
2673            );
2674
2675            let mut paths = paths_fwd
2676                .chain(paths_rev)
2677                .map(|path| {
2678                    let score = path
2679                        .iter()
2680                        .map(|node| {
2681                            let kmer = g.node_weight(*node).unwrap().as_bytes();
2682                            let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2683                            *self.scores.get(&cn_kmer).unwrap_or(&1.0)
2684                        })
2685                        .fold(1.0, |acc, x| acc * x)
2686                        .powf(1.0 / path.len() as f32);
2687
2688                    let mut bad_color_count = 0;
2689
2690                    for node in &path {
2691                        let kmer = g.node_weight(*node).unwrap().as_bytes();
2692                        let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2693
2694                        let seen_colors = self
2695                            .sources
2696                            .get(&cn_kmer)
2697                            .unwrap_or(&vec![])
2698                            .iter()
2699                            .cloned()
2700                            .collect::<HashSet<_>>();
2701
2702                        if seen_colors.len() == 1 && seen_colors.contains(&color) {
2703                            bad_color_count += 1;
2704                        }
2705                    }
2706
2707                    let is_bad_color = bad_color_count as f32 > 0.5 * (path.len() as f32);
2708
2709                    (
2710                        path,
2711                        if score.is_nan() || is_bad_color {
2712                            0.0
2713                        } else {
2714                            score
2715                        },
2716                    )
2717                })
2718                .collect::<Vec<(Vec<NodeIndex>, f32)>>();
2719
2720            paths.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
2721
2722            let good_paths = paths
2723                .iter()
2724                .take(1)
2725                .chain(
2726                    paths
2727                        .iter()
2728                        .skip(1)
2729                        .filter(|path| path.1 > min_score)
2730                        .take(1),
2731                )
2732                .collect::<Vec<_>>();
2733            let bad_paths = paths
2734                .iter()
2735                .filter(|path| !good_paths.contains(path))
2736                .collect::<Vec<_>>();
2737
2738            let to_keep = good_paths
2739                .iter()
2740                .map(|(path, _)| path)
2741                .flatten()
2742                .map(|node| {
2743                    crate::utils::canonicalize_kmer(g.node_weight(*node).unwrap().as_bytes())
2744                })
2745                .collect::<HashSet<Vec<u8>>>();
2746
2747            bad_paths.iter().for_each(|(path, _)| {
2748                for node in path {
2749                    let kmer = g.node_weight(*node).unwrap().as_bytes();
2750                    let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2751
2752                    if !to_keep.contains(&cn_kmer) {
2753                        to_remove.insert(cn_kmer.clone());
2754                    }
2755                }
2756            });
2757
2758            num_bad_paths += bad_paths.len();
2759        }
2760
2761        for cn_kmer in &to_remove {
2762            self.kmers.remove(cn_kmer);
2763            self.scores.remove(cn_kmer);
2764        }
2765
2766        // crate::elog!(" -- Removed {} bad bubble paths ({} kmers)", num_bad_paths, to_remove.len());
2767
2768        self.infer_edges();
2769
2770        self
2771    }
2772
2773    /// This method will remove contigs that are shorter than the specified minimum length
2774    /// and that are not connected to the rest of the graph.
2775    ///
2776    /// # Arguments
2777    ///
2778    /// * `min_contig_length` - The minimum length of a contig to keep.
2779    ///
2780    /// # Returns
2781    ///
2782    /// A new `LdBG` with the specified contigs removed.
2783    ///
2784    /// # Panics
2785    ///
2786    /// This method will panic if the k-mer size is not set.
2787    #[must_use]
2788    pub fn clean_contigs(mut self, min_contig_length: usize) -> Self {
2789        let mut to_remove = HashSet::new();
2790        let mut bad_contigs: usize = 0;
2791
2792        self.assemble_all().iter().for_each(|contig| {
2793            if contig.len() < min_contig_length {
2794                let mut process = true;
2795
2796                // Check if the first k-mer has no incoming edges
2797                let first_kmer = contig.windows(self.kmer_size).next().unwrap();
2798                let prev_kmers = self.prev_kmers(first_kmer);
2799                if prev_kmers.len() > 0 {
2800                    process = false;
2801                }
2802
2803                // Check if the last k-mer has no outgoing edges
2804                let last_kmer = contig.windows(self.kmer_size).last().unwrap();
2805                let next_kmers = self.next_kmers(last_kmer);
2806                if next_kmers.len() > 0 {
2807                    process = false;
2808                }
2809
2810                if process {
2811                    for kmer in contig.windows(self.kmer_size) {
2812                        to_remove.insert(crate::utils::canonicalize_kmer(kmer));
2813                    }
2814
2815                    bad_contigs += 1;
2816                }
2817            }
2818        });
2819
2820        for cn_kmer in &to_remove {
2821            self.kmers.remove(cn_kmer);
2822            self.scores.remove(cn_kmer);
2823        }
2824
2825        // crate::elog!(" -- Removed {} bad contigs ({} kmers)", bad_contigs, to_remove.len());
2826
2827        self.infer_edges();
2828
2829        self
2830    }
2831
2832    fn traverse_forward(
2833        &self,
2834        graph: &mut petgraph::Graph<String, f32>,
2835        visited: &mut HashMap<String, NodeIndex>,
2836        start_node: NodeIndex,
2837    ) {
2838        let mut fwd_stack = vec![start_node];
2839        while let Some(node) = fwd_stack.pop() {
2840            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2841
2842            for next_kmer in self.next_kmers(this_kmer) {
2843                let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2844                let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2845                    existing_node
2846                } else {
2847                    let new_node = graph.add_node(next_label.clone());
2848                    visited.insert(next_label.clone(), new_node);
2849                    fwd_stack.push(new_node);
2850                    new_node
2851                };
2852
2853                if !graph.contains_edge(node, next_node) {
2854                    graph.add_edge(
2855                        node,
2856                        next_node,
2857                        *self
2858                            .scores
2859                            .get(&crate::utils::canonicalize_kmer(&next_kmer))
2860                            .unwrap_or(&1.0),
2861                    );
2862                }
2863            }
2864        }
2865    }
2866
2867    fn traverse_forward_until(
2868        &self,
2869        graph: &mut petgraph::Graph<String, f32>,
2870        visited: &mut HashMap<String, NodeIndex>,
2871        start_node: NodeIndex,
2872        stop_node: &[u8],
2873    ) -> Result<()> {
2874        let mut found = false;
2875
2876        let mut fwd_stack = vec![start_node];
2877        while let Some(node) = fwd_stack.pop() {
2878            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2879
2880            for next_kmer in self.next_kmers(this_kmer) {
2881                let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2882                let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2883                    existing_node
2884                } else {
2885                    let new_node = graph.add_node(next_label.clone());
2886                    visited.insert(next_label.clone(), new_node);
2887
2888                    if graph.node_weight(new_node).unwrap().as_bytes() != stop_node {
2889                        fwd_stack.push(new_node);
2890                    } else {
2891                        found = true;
2892                    }
2893
2894                    new_node
2895                };
2896
2897                if !graph.contains_edge(node, next_node) {
2898                    graph.add_edge(
2899                        node,
2900                        next_node,
2901                        *self
2902                            .scores
2903                            .get(&crate::utils::canonicalize_kmer(&next_kmer))
2904                            .unwrap_or(&1.0),
2905                    );
2906                }
2907            }
2908        }
2909
2910        if found {
2911            Ok(())
2912        } else {
2913            Err(anyhow::anyhow!("Stop node not found"))
2914        }
2915    }
2916
2917    fn traverse_forward_until_condition<F>(
2918        &self,
2919        graph: &mut petgraph::Graph<String, f32>,
2920        visited: &mut HashMap<String, NodeIndex>,
2921        color: usize,
2922        start_node: NodeIndex,
2923        limit: usize,
2924        stopping_condition: F,
2925    ) -> Result<()>
2926    where
2927        F: Fn(&[u8], usize, &Self) -> bool,
2928    {
2929        let mut found = false;
2930
2931        // Use a vector of tuples (node, depth) instead of just nodes
2932        let mut fwd_stack = vec![(start_node, 0)];
2933        while let Some((node, depth)) = fwd_stack.pop() {
2934            if depth >= limit {
2935                continue; // Skip this node if we've reached the depth limit
2936            }
2937
2938            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2939
2940            for next_kmer in self.next_kmers(this_kmer) {
2941                let same_color = self
2942                    .sources
2943                    .get(&crate::utils::canonicalize_kmer(&next_kmer))
2944                    .unwrap_or(&vec![])
2945                    == &vec![color];
2946                if !same_color {
2947                    continue;
2948                }
2949
2950                let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2951                let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2952                    existing_node
2953                } else {
2954                    let new_node = graph.add_node(next_label.clone());
2955                    visited.insert(next_label.clone(), new_node);
2956
2957                    if !stopping_condition(&next_kmer, depth + 1, self) {
2958                        // Push the new node with an incremented depth
2959                        fwd_stack.push((new_node, depth + 1));
2960                    } else {
2961                        found = true;
2962                    }
2963
2964                    new_node
2965                };
2966
2967                if !graph.contains_edge(node, next_node) {
2968                    graph.add_edge(
2969                        node,
2970                        next_node,
2971                        *self
2972                            .scores
2973                            .get(&crate::utils::canonicalize_kmer(&next_kmer))
2974                            .unwrap_or(&1.0),
2975                    );
2976                }
2977            }
2978        }
2979
2980        if found {
2981            Ok(())
2982        } else {
2983            Err(anyhow::anyhow!("Stop condition not met within depth limit"))
2984        }
2985    }
2986
2987    fn traverse_backward(
2988        &self,
2989        graph: &mut petgraph::Graph<String, f32>,
2990        visited: &mut HashMap<String, NodeIndex>,
2991        start_node: NodeIndex,
2992    ) {
2993        let mut rev_stack = vec![start_node];
2994        while let Some(node) = rev_stack.pop() {
2995            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2996
2997            for prev_kmer in self.prev_kmers(this_kmer) {
2998                let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
2999                let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3000                    existing_node
3001                } else {
3002                    let new_node = graph.add_node(prev_label.clone());
3003                    visited.insert(prev_label.clone(), new_node);
3004                    rev_stack.push(new_node);
3005                    new_node
3006                };
3007
3008                if !graph.contains_edge(prev_node, node) {
3009                    graph.add_edge(
3010                        prev_node,
3011                        node,
3012                        *self
3013                            .scores
3014                            .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3015                            .unwrap_or(&1.0),
3016                    );
3017                }
3018            }
3019        }
3020    }
3021
3022    fn traverse_backward_until(
3023        &self,
3024        graph: &mut petgraph::Graph<String, f32>,
3025        visited: &mut HashMap<String, NodeIndex>,
3026        start_node: NodeIndex,
3027        stop_node: &[u8],
3028    ) -> Result<()> {
3029        let mut found = false;
3030
3031        let mut rev_stack = vec![start_node];
3032        while let Some(node) = rev_stack.pop() {
3033            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
3034
3035            for prev_kmer in self.prev_kmers(this_kmer) {
3036                let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
3037                let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3038                    existing_node
3039                } else {
3040                    let new_node = graph.add_node(prev_label.clone());
3041                    visited.insert(prev_label.clone(), new_node);
3042
3043                    if graph.node_weight(new_node).unwrap().as_bytes() != stop_node {
3044                        rev_stack.push(new_node);
3045                    } else {
3046                        found = true;
3047                    }
3048
3049                    new_node
3050                };
3051
3052                if !graph.contains_edge(prev_node, node) {
3053                    graph.add_edge(
3054                        prev_node,
3055                        node,
3056                        *self
3057                            .scores
3058                            .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3059                            .unwrap_or(&1.0),
3060                    );
3061                }
3062            }
3063        }
3064
3065        if found {
3066            Ok(())
3067        } else {
3068            Err(anyhow::anyhow!("Stop node not found"))
3069        }
3070    }
3071
3072    fn traverse_backward_until_condition<F>(
3073        &self,
3074        graph: &mut petgraph::Graph<String, f32>,
3075        visited: &mut HashMap<String, NodeIndex>,
3076        color: usize,
3077        start_node: NodeIndex,
3078        limit: usize,
3079        stopping_condition: F,
3080    ) -> Result<()>
3081    where
3082        F: Fn(&[u8], usize, &Self) -> bool,
3083    {
3084        let mut found = false;
3085
3086        // Use a vector of tuples (node, depth) instead of just nodes
3087        let mut rev_stack = vec![(start_node, 0)];
3088        while let Some((node, depth)) = rev_stack.pop() {
3089            if depth >= limit {
3090                continue; // Skip this node if we've reached the depth limit
3091            }
3092
3093            let this_kmer = graph.node_weight(node).unwrap().as_bytes();
3094
3095            for prev_kmer in self.prev_kmers(this_kmer) {
3096                let same_color = self
3097                    .sources
3098                    .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3099                    .unwrap_or(&vec![])
3100                    == &vec![color];
3101                if !same_color {
3102                    continue;
3103                }
3104
3105                let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
3106                let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3107                    existing_node
3108                } else {
3109                    let new_node = graph.add_node(prev_label.clone());
3110                    visited.insert(prev_label.clone(), new_node);
3111
3112                    if !stopping_condition(&prev_kmer, depth + 1, self) {
3113                        // Push the new node with an incremented depth
3114                        rev_stack.push((new_node, depth + 1));
3115                    } else {
3116                        found = true;
3117                    }
3118
3119                    new_node
3120                };
3121
3122                if !graph.contains_edge(prev_node, node) {
3123                    graph.add_edge(
3124                        prev_node,
3125                        node,
3126                        *self
3127                            .scores
3128                            .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3129                            .unwrap_or(&1.0),
3130                    );
3131                }
3132            }
3133        }
3134
3135        if found {
3136            Ok(())
3137        } else {
3138            Err(anyhow::anyhow!("Stop condition not met within depth limit"))
3139        }
3140    }
3141
3142    pub fn traverse_kmers_until_condition<F>(
3143        &self,
3144        start_kmer: &[u8],
3145        color: usize,
3146        limit: usize,
3147        stopping_condition: F,
3148    ) -> DiGraph<String, f32>
3149    where
3150        F: Fn(&[u8], usize, &Self) -> bool,
3151    {
3152        let mut graph = DiGraph::new();
3153        let mut visited = HashMap::<String, NodeIndex>::new(); // Map node labels to indices
3154
3155        let start_label = String::from_utf8_lossy(&start_kmer).to_string();
3156        let start_node = graph.add_node(start_label.clone());
3157        visited.insert(start_label.clone(), start_node);
3158
3159        let _ = self.traverse_forward_until_condition(
3160            &mut graph,
3161            &mut visited,
3162            color,
3163            start_node,
3164            limit,
3165            &stopping_condition,
3166        );
3167        let _ = self.traverse_backward_until_condition(
3168            &mut graph,
3169            &mut visited,
3170            color,
3171            start_node,
3172            limit,
3173            &stopping_condition,
3174        );
3175
3176        graph
3177    }
3178
3179    /// Traverse kmers starting from a given kmer and build a graph.
3180    #[must_use]
3181    pub fn traverse_kmers(&self, start_kmer: &[u8]) -> DiGraph<String, f32> {
3182        let mut graph = DiGraph::new();
3183        let mut visited = HashMap::<String, NodeIndex>::new(); // Map node labels to indices
3184
3185        let start_label = String::from_utf8_lossy(&start_kmer).to_string();
3186        let start_node = graph.add_node(start_label.clone());
3187        visited.insert(start_label.clone(), start_node);
3188
3189        self.traverse_forward(&mut graph, &mut visited, start_node);
3190        self.traverse_backward(&mut graph, &mut visited, start_node);
3191
3192        graph
3193    }
3194
3195    /// The `traverse_contigs` function traverses kmers starting from a given kmer and builds a
3196    /// directed graph of contigs. It marks all kmers in the start contig as visited,
3197    /// then traverses forward and backward to build the graph, ensuring that each kmer is only
3198    /// visited once. The function returns the constructed graph.
3199    ///
3200    /// # Arguments
3201    ///
3202    /// * `start_kmer` - A vector of bytes representing the start kmer.
3203    ///
3204    /// # Returns
3205    ///
3206    /// A directed graph of contigs.
3207    ///
3208    /// # Panics
3209    ///
3210    /// 1. **Unwrapping `Option` values**:
3211    ///    - If `graph.node_weight(node)` returns `None`, the call to `unwrap()` will panic.
3212    ///    - If `self.last_kmer(this_contig)` or `self.first_kmer(this_contig)` returns `None`, the call to `unwrap()` will panic.
3213    ///    - If `visited.get(&canonical_kmer)` returns `None`, the call to `unwrap()` will panic.
3214    ///
3215    /// 2. **Indexing operations**:
3216    ///    - If `contig.windows(self.kmer_size)` is called with a `kmer_size` larger than the length of `contig`, it will panic.
3217    ///
3218    #[must_use]
3219    pub fn traverse_contigs(&self, start_kmer: &[u8]) -> DiGraph<String, f32> {
3220        let mut graph = DiGraph::new();
3221        let mut visited = HashMap::new();
3222
3223        let start_contig = self.assemble(&start_kmer);
3224        let start_node = graph.add_node(String::from_utf8_lossy(&start_contig).to_string());
3225
3226        // Mark all k-mers in the start contig as visited
3227        for kmer in start_contig.windows(self.kmer_size) {
3228            visited.insert(crate::utils::canonicalize_kmer(kmer), start_node);
3229        }
3230
3231        // Traverse forward
3232        let mut fwd_stack = vec![start_node];
3233        while let Some(node) = fwd_stack.pop() {
3234            let this_contig = graph.node_weight(node).unwrap().as_bytes();
3235
3236            if let Some(last_kmer) = self.last_kmer(this_contig) {
3237                for next_kmer in self.next_kmers(&last_kmer) {
3238                    if !visited.contains_key(&crate::utils::canonicalize_kmer(&next_kmer)) {
3239                        let mut next_contig = next_kmer.clone();
3240                        self.assemble_forward(&mut next_contig, &next_kmer);
3241
3242                        let weight = next_contig
3243                            .windows(self.kmer_size)
3244                            .map(|x| {
3245                                self.scores
3246                                    .get(&crate::utils::canonicalize_kmer(x))
3247                                    .unwrap_or(&1.0)
3248                            })
3249                            .fold(f32::INFINITY, |acc, x| acc.min(*x));
3250
3251                        let next_node =
3252                            graph.add_node(String::from_utf8_lossy(&next_contig).to_string());
3253                        graph.add_edge(node, next_node, weight);
3254
3255                        // Mark all k-mers in the new contig as visited
3256                        for kmer in next_contig.windows(self.kmer_size) {
3257                            visited.insert(crate::utils::canonicalize_kmer(kmer), next_node);
3258                        }
3259
3260                        fwd_stack.push(next_node);
3261                    } else {
3262                        let next_node = *visited
3263                            .get(&crate::utils::canonicalize_kmer(&next_kmer))
3264                            .unwrap();
3265                        if !graph.contains_edge(node, next_node) {
3266                            let weight = *self
3267                                .scores
3268                                .get(&crate::utils::canonicalize_kmer(&next_kmer))
3269                                .unwrap_or(&1.0);
3270                            graph.add_edge(node, next_node, weight);
3271                        }
3272                    }
3273                }
3274            }
3275        }
3276
3277        // Traverse backward
3278        let mut rev_stack = vec![start_node];
3279        while let Some(node) = rev_stack.pop() {
3280            let this_contig = graph.node_weight(node).unwrap().as_bytes();
3281
3282            if let Some(first_kmer) = self.first_kmer(this_contig) {
3283                for prev_kmer in self.prev_kmers(&first_kmer) {
3284                    if !visited.contains_key(&crate::utils::canonicalize_kmer(&prev_kmer)) {
3285                        let mut prev_contig = prev_kmer.clone();
3286                        self.assemble_backward(&mut prev_contig, &prev_kmer);
3287
3288                        let weight = prev_contig
3289                            .windows(self.kmer_size)
3290                            .map(|x| {
3291                                self.scores
3292                                    .get(&crate::utils::canonicalize_kmer(x))
3293                                    .unwrap_or(&1.0)
3294                            })
3295                            .fold(f32::INFINITY, |acc, x| acc.min(*x));
3296
3297                        let prev_node =
3298                            graph.add_node(String::from_utf8_lossy(&prev_contig).to_string());
3299                        graph.add_edge(prev_node, node, weight); // Note the reversed edge direction
3300
3301                        // Mark all k-mers in the new contig as visited
3302                        for kmer in prev_contig.windows(self.kmer_size) {
3303                            visited.insert(crate::utils::canonicalize_kmer(kmer), prev_node);
3304                        }
3305
3306                        rev_stack.push(prev_node);
3307                    } else {
3308                        let prev_node = *visited
3309                            .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3310                            .unwrap();
3311                        if !graph.contains_edge(prev_node, node) {
3312                            let weight = *self
3313                                .scores
3314                                .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3315                                .unwrap_or(&1.0);
3316                            graph.add_edge(prev_node, node, weight); // Note the reversed edge direction
3317                        }
3318                    }
3319                }
3320            }
3321        }
3322
3323        graph
3324    }
3325
3326    /// Traverse all kmers in the graph and return a new graph with all kmers merged.
3327    /// This function is useful for collapsing kmers that are separated by bubbles.
3328    /// The new graph will contain all kmers as nodes and edges between kmers as weights.
3329    ///
3330    /// # Returns
3331    ///
3332    /// A new graph with all kmers merged.
3333    ///
3334    /// # Panics
3335    ///
3336    /// Panics if the graph is not a directed graph.
3337    #[must_use]
3338    pub fn traverse_all_kmers(&self) -> DiGraph<String, f32> {
3339        let cn_kmers = self
3340            .kmers
3341            .iter()
3342            .filter(|(_, record)| record.in_degree() <= 1 && record.out_degree() <= 1)
3343            .map(|(cn_kmer, _)| cn_kmer.clone())
3344            .collect::<Vec<Vec<u8>>>();
3345
3346        let mut visited = HashSet::new();
3347        let mut graph = DiGraph::new();
3348        let mut node_indices = HashMap::<String, NodeIndex>::new(); // Map node labels to indices
3349
3350        for cn_kmer in cn_kmers {
3351            let fw_kmer = cn_kmer.clone();
3352            let rc_kmer = fw_kmer.reverse_complement();
3353            for kmer in [fw_kmer, rc_kmer] {
3354                if !visited.contains(&kmer) {
3355                    let g = self.traverse_kmers(&kmer);
3356
3357                    // Mark kmers as visited
3358                    for node_label in g.node_weights() {
3359                        // visited.insert(crate::utils::canonicalize_kmer(node_label.as_bytes()));
3360                        visited.insert(node_label.as_bytes().to_vec());
3361                    }
3362
3363                    // Add nodes to the main graph, avoiding duplicates
3364                    for node_index in g.node_indices() {
3365                        let node_label = g.node_weight(node_index).unwrap().clone();
3366                        if !node_indices.contains_key(&node_label) {
3367                            let new_node_idx = graph.add_node(node_label.clone());
3368                            node_indices.insert(node_label.clone(), new_node_idx);
3369                        }
3370                    }
3371
3372                    // Add edges, mapping nodes correctly
3373                    for edge in g.edge_references() {
3374                        let (source, target) = (edge.source(), edge.target());
3375                        let source_label = g.node_weight(source).unwrap();
3376                        let target_label = g.node_weight(target).unwrap();
3377
3378                        let new_source = *node_indices
3379                            .get(source_label)
3380                            .expect("Source node should exist");
3381                        let new_target = *node_indices
3382                            .get(target_label)
3383                            .expect("Target node should exist");
3384
3385                        if !graph.contains_edge(new_source, new_target) {
3386                            graph.add_edge(new_source, new_target, *edge.weight());
3387                        }
3388                    }
3389                }
3390            }
3391        }
3392
3393        graph
3394    }
3395
3396    /// Traverse all contigs in the graph and return a new graph with all contigs merged.
3397    /// This function is useful for collapsing contigs that are separated by bubbles.
3398    /// The new graph will contain all contigs as nodes and edges between contigs as weights.
3399    ///
3400    /// # Returns
3401    ///
3402    /// A new graph with all contigs merged.
3403    ///
3404    /// # Panics
3405    ///
3406    /// Panics if the graph is not a directed graph.
3407    #[must_use]
3408    pub fn traverse_all_contigs(&self) -> DiGraph<String, f32> {
3409        let cn_kmers = self
3410            .kmers
3411            .iter()
3412            .filter(|(_, record)| record.in_degree() <= 1 && record.out_degree() <= 1)
3413            .map(|(cn_kmer, _)| cn_kmer.clone())
3414            .collect::<Vec<Vec<u8>>>();
3415
3416        let mut visited = HashSet::new();
3417        let mut graph = DiGraph::new();
3418
3419        for cn_kmer in cn_kmers {
3420            if !visited.contains(&cn_kmer) {
3421                let g = self.traverse_contigs(&cn_kmer);
3422
3423                g.node_weights().for_each(|node| {
3424                    for kmer in node.as_bytes().kmers(self.kmer_size as u8) {
3425                        visited.insert(crate::utils::canonicalize_kmer(&kmer));
3426                    }
3427                });
3428
3429                // Add all nodes from g to graph
3430                for node_index in g.node_indices() {
3431                    let node_weight = g.node_weight(node_index).unwrap();
3432                    graph.add_node(node_weight.clone());
3433                }
3434
3435                // Add all edges from g to graph
3436                for edge in g.edge_references() {
3437                    let (source, target) = (edge.source(), edge.target());
3438                    let source_weight = g.node_weight(source).unwrap();
3439                    let target_weight = g.node_weight(target).unwrap();
3440
3441                    let new_source = graph
3442                        .node_indices()
3443                        .find(|&i| graph[i] == *source_weight)
3444                        .unwrap();
3445                    let new_target = graph
3446                        .node_indices()
3447                        .find(|&i| graph[i] == *target_weight)
3448                        .unwrap();
3449
3450                    graph.add_edge(new_source, new_target, *edge.weight());
3451                }
3452            }
3453        }
3454
3455        graph
3456    }
3457}
3458
3459fn traverse_read_graph(graph: &petgraph::Graph<String, f64>, min_length: usize) -> Vec<Vec<u8>> {
3460    // Populate corrected_seqs with all contiguous stretches of the graph more than 1 node long
3461    let mut corrected_seqs = Vec::new();
3462    let mut visited = HashSet::new();
3463    for node in graph.node_indices() {
3464        if !visited.contains(&node) {
3465            let mut current_seq = Vec::new();
3466            let mut current_node = node;
3467
3468            loop {
3469                visited.insert(current_node);
3470                if current_seq.is_empty() {
3471                    current_seq.push(graph[current_node].clone());
3472                } else {
3473                    current_seq.push(graph[current_node].chars().last().unwrap().to_string());
3474                }
3475
3476                let outgoing_edges: Vec<_> = graph
3477                    .edges_directed(current_node, petgraph::Direction::Outgoing)
3478                    .collect();
3479                if outgoing_edges.len() == 1 {
3480                    let next_node = outgoing_edges[0].target();
3481                    if !visited.contains(&next_node) {
3482                        current_node = next_node;
3483                    } else {
3484                        break;
3485                    }
3486                } else {
3487                    break;
3488                }
3489            }
3490
3491            if current_seq.len() > min_length {
3492                let contig = current_seq.join("").as_bytes().to_vec();
3493                corrected_seqs.push(contig);
3494            }
3495        }
3496    }
3497    corrected_seqs
3498}
3499
3500fn navigate_backward(
3501    node: NodeIndex,
3502    steps: u8,
3503    graph: &petgraph::Graph<String, f64>,
3504) -> Option<NodeIndex> {
3505    let mut current_node = node;
3506    let mut steps_backward = 0;
3507
3508    while steps_backward < steps {
3509        let incoming_edges: Vec<_> = graph
3510            .edges_directed(current_node, petgraph::Direction::Incoming)
3511            .collect();
3512        if incoming_edges.is_empty() {
3513            return None;
3514        }
3515        current_node = incoming_edges[0].source();
3516        steps_backward += 1;
3517    }
3518
3519    if steps_backward == steps {
3520        Some(current_node)
3521    } else {
3522        None
3523    }
3524}
3525
3526fn navigate_forward(
3527    node: NodeIndex,
3528    steps: u8,
3529    graph: &petgraph::Graph<String, f64>,
3530) -> Option<NodeIndex> {
3531    let mut current_node = node;
3532    let mut steps_forward = 0;
3533
3534    while steps_forward < steps {
3535        let outgoing_edges: Vec<_> = graph
3536            .edges_directed(current_node, petgraph::Direction::Outgoing)
3537            .collect();
3538        if outgoing_edges.is_empty() {
3539            return None;
3540        }
3541        current_node = outgoing_edges[0].source();
3542        steps_forward += 1;
3543    }
3544
3545    if steps_forward == steps {
3546        Some(current_node)
3547    } else {
3548        None
3549    }
3550}
3551
3552/// Find a superbubble in a directed graph.
3553/// See reference implementation at
3554/// [bubble_gun](https://github.com/fawaz-dabbaghieh/bubble_gun/blob/53b8d68d0c9d0c35252da4dc1b4bbb6af57b4631/BubbleGun/find_bubbles.py#L29)
3555/// for more details.
3556///
3557/// # Arguments
3558///
3559/// * `graph` - A directed graph.
3560/// * `s` - The starting node.
3561/// * `direction` - The direction to traverse the graph.
3562///
3563/// # Returns
3564///
3565/// A tuple containing the starting node, ending node, and the nodes inside the superbubble.
3566///
3567/// # Panics
3568///
3569/// Panics if the graph is not a directed graph.
3570#[must_use]
3571pub fn find_superbubble(
3572    graph: &DiGraph<String, f32>,
3573    s: NodeIndex,
3574    direction: petgraph::Direction,
3575) -> Option<(NodeIndex, NodeIndex, Vec<NodeIndex>)> {
3576    let mut seen = HashSet::new();
3577    let mut visited = HashSet::new();
3578    let mut nodes_inside = Vec::new();
3579
3580    // seen.insert((s.index(), direction));
3581    seen.insert(s.index());
3582
3583    let mut stack = vec![(s, direction)];
3584    while !stack.is_empty() {
3585        let (v, v_direction) = stack.pop().unwrap();
3586        visited.insert(v.index());
3587
3588        nodes_inside.push(v);
3589
3590        // seen.remove(&(v.index(), v_direction));
3591        seen.remove(&v.index());
3592
3593        let children = match v_direction {
3594            petgraph::Direction::Incoming => graph
3595                .neighbors_directed(v, petgraph::Direction::Incoming)
3596                .collect::<Vec<_>>(),
3597            petgraph::Direction::Outgoing => graph
3598                .neighbors_directed(v, petgraph::Direction::Outgoing)
3599                .collect::<Vec<_>>(),
3600        };
3601
3602        if children.is_empty() {
3603            break;
3604        }
3605
3606        for u in children {
3607            let (u_child_direction, u_parents) = if v_direction == petgraph::Direction::Outgoing {
3608                (
3609                    petgraph::Direction::Outgoing,
3610                    graph
3611                        .neighbors_directed(u, petgraph::Direction::Incoming)
3612                        .collect::<Vec<_>>(),
3613                )
3614            } else {
3615                (
3616                    petgraph::Direction::Incoming,
3617                    graph
3618                        .neighbors_directed(u, petgraph::Direction::Outgoing)
3619                        .collect::<Vec<_>>(),
3620                )
3621            };
3622
3623            if u.index() == s.index() {
3624                stack.clear();
3625                break;
3626            }
3627
3628            seen.insert(u.index());
3629
3630            if u_parents.iter().all(|&n| visited.contains(&n.index())) {
3631                stack.push((u, u_child_direction));
3632            }
3633        }
3634
3635        if stack.len() == 1 && seen.len() == 1 {
3636            let (t, _) = stack.pop().unwrap();
3637            nodes_inside.push(t);
3638
3639            if nodes_inside.len() == 2 {
3640                break;
3641            }
3642
3643            nodes_inside.retain(|&x| x != s && x != t);
3644            return Some((s, t, nodes_inside));
3645        }
3646    }
3647
3648    None
3649}
3650
3651#[must_use]
3652pub fn find_all_superbubbles_old(
3653    graph: &petgraph::Graph<String, f32>,
3654) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec<NodeIndex>> {
3655    let mut bubbles = LinkedHashMap::new();
3656
3657    let mut visited: HashSet<NodeIndex> = HashSet::new();
3658    for n in graph.node_indices() {
3659        // if !visited.contains(&n) {
3660        for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] {
3661            let bubble = find_superbubble(&graph, n, d);
3662
3663            if let Some(bubble) = bubble {
3664                visited.extend(bubble.2.iter().cloned());
3665
3666                let key_fwd = (bubble.0, bubble.1);
3667                let key_rev = (bubble.1, bubble.0);
3668
3669                if !bubbles.contains_key(&key_fwd) && !bubbles.contains_key(&key_rev) {
3670                    bubbles.insert((bubble.0, bubble.1), bubble.2);
3671                }
3672            }
3673        }
3674        // }
3675    }
3676
3677    bubbles
3678}
3679
3680pub fn find_all_superbubbles(
3681    graph: &petgraph::Graph<String, f32>,
3682) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec<NodeIndex>> {
3683    let mut bubbles = LinkedHashMap::new();
3684    let mut processed_pairs = HashSet::new();
3685
3686    for n in graph.node_indices() {
3687        for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] {
3688            if let Some((start, end, interior)) = find_superbubble(graph, n, d) {
3689                // Create both possible orderings of the pair
3690                let pair_forward = (start.index(), end.index());
3691                let pair_reverse = (end.index(), start.index());
3692
3693                // Only process this bubble if we haven't seen it before
3694                if !processed_pairs.contains(&pair_forward)
3695                    && !processed_pairs.contains(&pair_reverse)
3696                {
3697                    bubbles.insert((start, end), interior);
3698                    processed_pairs.insert(pair_forward);
3699                    processed_pairs.insert(pair_reverse);
3700                }
3701            }
3702        }
3703    }
3704
3705    bubbles
3706}
3707
3708#[cfg(test)]
3709mod tests {
3710    use super::*;
3711    use crate::edges::Edges;
3712
3713    use petgraph::data::DataMap;
3714    use rand::{Rng, SeedableRng};
3715
3716    use std::collections::BTreeMap;
3717    use std::env;
3718    use std::fs::File;
3719    use std::io::{BufRead, BufReader, Write};
3720    use std::process::{Command, Stdio};
3721
3722    use flate2::read::GzDecoder;
3723
3724    use proptest::prelude::*;
3725
3726    /// Canonical example genome from https://academic.oup.com/bioinformatics/article/34/15/2556/4938484
3727    fn get_test_genome() -> Vec<u8> {
3728        "ACTGATTTCGATGCGATGCGATGCCACGGTGG".as_bytes().to_vec()
3729    }
3730
3731    /// Generate a genome sequence with tandem repeats.
3732    ///
3733    /// # Arguments
3734    ///
3735    /// * `flank_length` - The length of the flanking regions.
3736    /// * `repeat_length` - The length of the repeat region.
3737    /// * `num_repeats` - The number of repeats.
3738    /// * `seed` - The seed for the random number generator.
3739    ///
3740    /// # Returns
3741    ///
3742    /// A vector containing the genome sequence with tandem repeats.
3743    fn generate_genome_with_tandem_repeats(
3744        flank_length: usize,
3745        repeat_length: usize,
3746        num_repeats: usize,
3747        seed: u64,
3748    ) -> Vec<u8> {
3749        let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
3750        let alphabet = b"ACGT";
3751
3752        let mut left_flank = Vec::with_capacity(flank_length);
3753        for _ in 0..flank_length {
3754            let idx = rng.gen_range(0..4);
3755            left_flank.push(alphabet[idx]);
3756        }
3757
3758        let mut repeat = Vec::with_capacity(repeat_length);
3759        for _ in 0..repeat_length {
3760            let idx = rng.gen_range(0..4);
3761            repeat.push(alphabet[idx]);
3762        }
3763
3764        let mut right_flank = Vec::with_capacity(flank_length);
3765        for _ in 0..flank_length {
3766            let idx = rng.gen_range(0..4);
3767            right_flank.push(alphabet[idx]);
3768        }
3769
3770        let mut genome = Vec::new();
3771        genome.extend_from_slice(&left_flank);
3772        for _ in 0..num_repeats {
3773            genome.extend_from_slice(&repeat);
3774        }
3775        genome.extend_from_slice(&right_flank);
3776
3777        genome
3778    }
3779
3780    /// Assemble using the reference implementation, McCortex.
3781    fn assemble_with_mccortex(
3782        k: usize,
3783        input_genome: &Vec<u8>,
3784        build_links: bool,
3785        use_cache: bool,
3786    ) -> (BTreeMap<String, String>, Links) {
3787        let current_dir = env::current_dir().unwrap();
3788        let test_dir = current_dir.join("tests/test_data");
3789        let test_dir_str = test_dir.to_str().unwrap();
3790
3791        let copied_genome = input_genome.clone();
3792        let md5_hash = format!("{:x}", md5::compute(&copied_genome));
3793
3794        let genome_path = test_dir.join(format!("test.{}.k_{}.l_{}.fa", md5_hash, k, build_links));
3795        let links_path = test_dir.join(format!(
3796            "links.{}.k_{}.l_{}.ctp.gz",
3797            md5_hash, k, build_links
3798        ));
3799        let contigs_path =
3800            test_dir.join(format!("contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links));
3801
3802        if !use_cache || !contigs_path.exists() || !links_path.exists() {
3803            let mut genome_file = File::create(genome_path).expect("Unable to create file");
3804
3805            writeln!(genome_file, ">genome").expect("Unable to write to file");
3806            writeln!(genome_file, "{}", String::from_utf8(copied_genome).unwrap())
3807                .expect("Unable to write to file");
3808
3809            Command::new("docker")
3810                .args(&[
3811                    "run",
3812                    "-v",
3813                    &format!("{}:/data", test_dir_str),
3814                    "-it",
3815                    "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3816                    "mccortex31",
3817                    "build",
3818                    "-f",
3819                    "-m",
3820                    "1g",
3821                    "-k",
3822                    &format!("{}", k),
3823                    "-s",
3824                    "test",
3825                    "-1",
3826                    &format!("/data/test.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3827                    &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3828                ])
3829                .stdout(Stdio::null())
3830                .stderr(Stdio::null())
3831                .status()
3832                .expect("failed to execute process");
3833
3834            if build_links {
3835                Command::new("docker")
3836                    .args(&[
3837                        "run",
3838                        "-v",
3839                        &format!("{}:/data", test_dir_str),
3840                        "-it",
3841                        "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3842                        "mccortex31",
3843                        "thread",
3844                        "-f",
3845                        "-m",
3846                        "1g",
3847                        "-w",
3848                        "-o",
3849                        &format!("/data/links.{}.k_{}.l_{}.ctp.gz", md5_hash, k, build_links),
3850                        "-1",
3851                        &format!("/data/test.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3852                        &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3853                    ])
3854                    .stdout(Stdio::null())
3855                    .stderr(Stdio::null())
3856                    .status()
3857                    .expect("failed to execute process");
3858
3859                Command::new("docker")
3860                    .args(&[
3861                        "run",
3862                        "-v",
3863                        &format!("{}:/data", test_dir_str),
3864                        "-it",
3865                        "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3866                        "mccortex31",
3867                        "contigs",
3868                        "-f",
3869                        "-m",
3870                        "1g",
3871                        "-p",
3872                        &format!("/data/links.{}.k_{}.l_{}.ctp.gz", md5_hash, k, build_links),
3873                        "-o",
3874                        &format!("/data/contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3875                        &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3876                    ])
3877                    .stdout(Stdio::null())
3878                    .stderr(Stdio::null())
3879                    .status()
3880                    .expect("failed to execute process");
3881            } else {
3882                Command::new("docker")
3883                    .args(&[
3884                        "run",
3885                        "-v",
3886                        &format!("{}:/data", test_dir_str),
3887                        "-it",
3888                        "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3889                        "mccortex31",
3890                        "contigs",
3891                        "-f",
3892                        "-m",
3893                        "1g",
3894                        "-o",
3895                        &format!("/data/contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3896                        &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3897                    ])
3898                    .stdout(Stdio::null())
3899                    .stderr(Stdio::null())
3900                    .status()
3901                    .expect("failed to execute process");
3902            }
3903        }
3904
3905        let contig_map = parse_mccortex_fasta(contigs_path);
3906        let links = parse_mccortex_ctp_file(links_path);
3907
3908        (contig_map, links)
3909    }
3910
3911    /// Parse the McCortex contigs file, grabbing both the assembly seed and assembled sequence.
3912    fn parse_mccortex_fasta(contigs_path: PathBuf) -> BTreeMap<String, String> {
3913        let contigs = bio::io::fasta::Reader::from_file(contigs_path).unwrap();
3914        let contig_map: BTreeMap<_, _> = contigs
3915            .records()
3916            .into_iter()
3917            .filter_map(Result::ok)
3918            .map(|r| {
3919                let pieces = r
3920                    .desc()
3921                    .unwrap()
3922                    .split_whitespace()
3923                    .flat_map(|s| s.split("=").collect::<Vec<&str>>())
3924                    .collect::<Vec<&str>>();
3925
3926                let seed = pieces.chunks(2).find(|pair| pair[0] == "seed").unwrap()[1].to_string();
3927
3928                let seq = String::from_utf8_lossy(r.seq()).to_string();
3929
3930                (seed, seq)
3931            })
3932            .collect();
3933
3934        contig_map
3935    }
3936
3937    /// Parse the McCortex links file.
3938    fn parse_mccortex_ctp_file(links_path: PathBuf) -> Links {
3939        let mut links = Links::new();
3940
3941        let file = File::open(links_path).expect("Unable to open file");
3942        let decoder = GzDecoder::new(file);
3943        let reader = BufReader::new(decoder);
3944
3945        let lines: Vec<Vec<String>> = reader
3946            .lines()
3947            .map(|line| {
3948                line.unwrap()
3949                    .split_whitespace()
3950                    .map(|s| s.to_string())
3951                    .collect()
3952            })
3953            .collect();
3954
3955        for i in 0..lines.len() - 1 {
3956            let pieces1 = &lines[i];
3957
3958            if pieces1.len() < 1 {
3959                continue;
3960            }
3961
3962            if let Some(first_char_1) = pieces1.get(0).unwrap().chars().next() {
3963                if "ACGT".contains(first_char_1) {
3964                    let anchor_kmer = pieces1.get(0).unwrap().as_bytes();
3965                    let cn_anchor_vec = crate::utils::canonicalize_kmer(anchor_kmer);
3966                    let cn_anchor_kmer = cn_anchor_vec.as_bytes();
3967
3968                    for j in i + 1..lines.len() {
3969                        let pieces2 = &lines[j];
3970
3971                        if pieces2.len() < 1 {
3972                            continue;
3973                        }
3974
3975                        if let Some(first_char_2) = pieces2.get(0).unwrap().chars().next() {
3976                            if "ACGT".contains(first_char_2) {
3977                                break;
3978                            } else if "FR".contains(first_char_2) {
3979                                let mut link = Link::new(pieces2.get(0).unwrap() == "F");
3980
3981                                let junctions = pieces2.get(3).unwrap();
3982                                for junction in junctions.chars() {
3983                                    link.push_back(junction as u8);
3984                                }
3985
3986                                // Add link to links map.
3987                                if !links.contains_key(cn_anchor_kmer) {
3988                                    links.insert(cn_anchor_kmer.to_owned(), HashMap::new());
3989                                }
3990
3991                                if !links.get(cn_anchor_kmer).unwrap().contains_key(&link) {
3992                                    links.get_mut(cn_anchor_kmer).unwrap().insert(link, 1);
3993                                } else {
3994                                    let linkcov =
3995                                        *links.get_mut(cn_anchor_kmer).unwrap().get(&link).unwrap();
3996
3997                                    links
3998                                        .get_mut(cn_anchor_kmer)
3999                                        .unwrap()
4000                                        .insert(link, linkcov.saturating_add(1));
4001                                }
4002                            }
4003                        }
4004                    }
4005                }
4006            }
4007        }
4008
4009        links
4010    }
4011
4012    #[test]
4013    fn test_traverse_kmers() {
4014        let genome = get_test_genome();
4015        let fwd_seqs = vec![genome];
4016
4017        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4018        let graph = g.traverse_kmers(b"ATTTC");
4019
4020        // println!("{}", Dot::with_config(&graph, &[petgraph::dot::Config::EdgeNoLabel]));
4021
4022        // Check if the graph is consistent with the expected structure
4023        assert_eq!(graph.node_count(), 21);
4024        assert_eq!(graph.edge_count(), 21);
4025
4026        // Check specific nodes and their labels
4027        let node_labels: Vec<_> = graph.node_weights().cloned().collect();
4028        assert!(node_labels.contains(&"ATTTC".to_string()));
4029        assert!(node_labels.contains(&"TTTCG".to_string()));
4030        assert!(node_labels.contains(&"TTCGA".to_string()));
4031        assert!(node_labels.contains(&"TCGAT".to_string()));
4032        assert!(node_labels.contains(&"CGATG".to_string()));
4033        assert!(node_labels.contains(&"GATGC".to_string()));
4034        assert!(node_labels.contains(&"ATGCC".to_string()));
4035        assert!(node_labels.contains(&"ATGCG".to_string()));
4036        assert!(node_labels.contains(&"TGCGA".to_string()));
4037        assert!(node_labels.contains(&"GCGAT".to_string()));
4038        assert!(node_labels.contains(&"TGCCA".to_string()));
4039        assert!(node_labels.contains(&"GCCAC".to_string()));
4040        assert!(node_labels.contains(&"CCACG".to_string()));
4041        assert!(node_labels.contains(&"CACGG".to_string()));
4042        assert!(node_labels.contains(&"ACGGT".to_string()));
4043        assert!(node_labels.contains(&"CGGTG".to_string()));
4044        assert!(node_labels.contains(&"GGTGG".to_string()));
4045        assert!(node_labels.contains(&"GATTT".to_string()));
4046        assert!(node_labels.contains(&"TGATT".to_string()));
4047        assert!(node_labels.contains(&"CTGAT".to_string()));
4048        assert!(node_labels.contains(&"ACTGA".to_string()));
4049
4050        // Check specific edges
4051        let edges = graph.edge_indices().collect::<Vec<_>>();
4052        assert_eq!(edges.len(), 21);
4053
4054        // Helper function to find node index by label
4055        let find_node = |label: &str| graph.node_indices().find(|&i| graph[i] == label).unwrap();
4056
4057        // Check some specific edges
4058        assert!(graph.contains_edge(find_node("ATTTC"), find_node("TTTCG")));
4059        assert!(graph.contains_edge(find_node("TTTCG"), find_node("TTCGA")));
4060        assert!(graph.contains_edge(find_node("GATGC"), find_node("ATGCC")));
4061        assert!(graph.contains_edge(find_node("GATGC"), find_node("ATGCG")));
4062        assert!(graph.contains_edge(find_node("CGATG"), find_node("GATGC")));
4063        assert!(graph.contains_edge(find_node("GCCAC"), find_node("CCACG")));
4064        assert!(graph.contains_edge(find_node("ACTGA"), find_node("CTGAT")));
4065    }
4066
4067    // TODO: This test is not working yet
4068    fn test_traverse_all_kmers() {
4069        let genome = get_test_genome();
4070        let fwd_seqs = vec![genome];
4071
4072        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4073        let graph = g.traverse_all_kmers();
4074
4075        // Write graph as GFA to a string
4076        let mut gfa_output = Vec::new();
4077        crate::utils::write_gfa(&mut gfa_output, &graph).unwrap();
4078
4079        // Print GFA string (commented out for test)
4080        let gfa_string = String::from_utf8(gfa_output).unwrap();
4081        println!("{}", gfa_string);
4082    }
4083
4084    // TODO: This test is not working yet
4085    fn test_traverse_all_contigs() {
4086        let genome = get_test_genome();
4087        let fwd_seqs = vec![genome];
4088
4089        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4090        let graph = g.traverse_all_contigs();
4091
4092        // Write graph as GFA to a string
4093        let mut gfa_output = Vec::new();
4094        crate::utils::write_gfa(&mut gfa_output, &graph).unwrap();
4095
4096        // Print GFA string (commented out for test)
4097        let gfa_string = String::from_utf8(gfa_output).unwrap();
4098        println!("{}", gfa_string);
4099    }
4100
4101    #[test]
4102    fn test_traverse_contigs() {
4103        let genome = get_test_genome();
4104        let fwd_seqs = vec![genome];
4105
4106        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4107        let graph = g.traverse_contigs(b"CCACG");
4108
4109        // Uncomment to manually verify structure of the graph
4110        // use petgraph::dot::Dot;
4111        // println!("{}", Dot::with_config(&graph, &[petgraph::dot::Config::EdgeNoLabel]));
4112
4113        // Check if the graph is consistent with the expected structure
4114        assert_eq!(graph.node_count(), 3);
4115        assert_eq!(graph.edge_count(), 4);
4116
4117        // Check specific nodes and their labels
4118        let node_labels: Vec<_> = graph.node_weights().cloned().collect();
4119        assert!(node_labels.contains(&"CGATGCCACGGTGG".to_string()));
4120        assert!(node_labels.contains(&"ACTGATTTCGAT".to_string()));
4121        assert!(node_labels.contains(&"CGATGCGAT".to_string()));
4122
4123        // Check specific edges
4124        let edges = graph.edge_indices().collect::<Vec<_>>();
4125        assert_eq!(edges.len(), 4);
4126
4127        // Helper function to find node index by label
4128        let find_node = |label: &str| graph.node_indices().find(|&i| graph[i] == label).unwrap();
4129
4130        // Check some specific edges
4131        assert!(graph.contains_edge(find_node("ACTGATTTCGAT"), find_node("CGATGCCACGGTGG")));
4132        assert!(graph.contains_edge(find_node("ACTGATTTCGAT"), find_node("CGATGCGAT")));
4133        assert!(graph.contains_edge(find_node("CGATGCGAT"), find_node("CGATGCCACGGTGG")));
4134        assert!(graph.contains_edge(find_node("CGATGCGAT"), find_node("CGATGCGAT")));
4135    }
4136
4137    #[test]
4138    fn test_prev_kmers() {
4139        let genome = get_test_genome();
4140        let fwd_seqs = vec![genome];
4141
4142        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4143
4144        let prev_kmers_1 = g.prev_kmers(b"TTTCG");
4145        assert_eq!(prev_kmers_1.len(), 1);
4146        assert_eq!(prev_kmers_1[0], b"ATTTC");
4147
4148        let prev_kmers_2: HashSet<Vec<u8>> = g.prev_kmers(b"CGATG").into_iter().collect();
4149        assert_eq!(prev_kmers_2.len(), 2);
4150        assert!(prev_kmers_2.contains(&b"TCGAT".to_vec()));
4151        assert!(prev_kmers_2.contains(&b"GCGAT".to_vec()));
4152
4153        let prev_kmers_3 = g.prev_kmers(b"ACTGA");
4154        assert_eq!(prev_kmers_3.len(), 0);
4155    }
4156
4157    #[test]
4158    fn test_next_kmers() {
4159        let genome = get_test_genome();
4160        let fwd_seqs = vec![genome];
4161
4162        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4163
4164        let next_kmers_1 = g.next_kmers(b"TTTCG");
4165        assert_eq!(next_kmers_1.len(), 1);
4166        assert_eq!(next_kmers_1[0], b"TTCGA");
4167
4168        let next_kmers_2: HashSet<Vec<u8>> = g.next_kmers(b"GATGC").into_iter().collect();
4169        assert_eq!(next_kmers_2.len(), 2);
4170        assert!(next_kmers_2.contains(&b"ATGCG".to_vec()));
4171        assert!(next_kmers_2.contains(&b"ATGCC".to_vec()));
4172
4173        let next_kmers_3 = g.next_kmers(b"GGTGG");
4174        assert_eq!(next_kmers_3.len(), 0);
4175    }
4176
4177    #[test]
4178    fn test_from_sequences() {
4179        let genome = get_test_genome();
4180        let fwd_seqs = vec![genome];
4181
4182        let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4183
4184        let mut exp_graph = KmerGraph::new();
4185        exp_graph.insert(
4186            b"AAATC".to_vec(),
4187            Record::new(1, Some(Edges::from_string("..g.A..."))),
4188        );
4189        exp_graph.insert(
4190            b"AATCA".to_vec(),
4191            Record::new(1, Some(Edges::from_string("a.....G."))),
4192        );
4193        exp_graph.insert(
4194            b"ACCGT".to_vec(),
4195            Record::new(1, Some(Edges::from_string(".c....G."))),
4196        );
4197        exp_graph.insert(
4198            b"ACTGA".to_vec(),
4199            Record::new(1, Some(Edges::from_string(".......T"))),
4200        );
4201        exp_graph.insert(
4202            b"ATCAG".to_vec(),
4203            Record::new(1, Some(Edges::from_string("a......T"))),
4204        );
4205        exp_graph.insert(
4206            b"ATCGA".to_vec(),
4207            Record::new(1, Some(Edges::from_string(".c..A..."))),
4208        );
4209        exp_graph.insert(
4210            b"ATCGC".to_vec(),
4211            Record::new(2, Some(Edges::from_string(".c..A..."))),
4212        );
4213        exp_graph.insert(
4214            b"ATGCC".to_vec(),
4215            Record::new(1, Some(Edges::from_string("..g.A..."))),
4216        );
4217        exp_graph.insert(
4218            b"ATGCG".to_vec(),
4219            Record::new(2, Some(Edges::from_string("..g.A..."))),
4220        );
4221        exp_graph.insert(
4222            b"ATTTC".to_vec(),
4223            Record::new(1, Some(Edges::from_string("..g...G."))),
4224        );
4225        exp_graph.insert(
4226            b"CACCG".to_vec(),
4227            Record::new(1, Some(Edges::from_string(".c.....T"))),
4228        );
4229        exp_graph.insert(
4230            b"CACGG".to_vec(),
4231            Record::new(1, Some(Edges::from_string(".c.....T"))),
4232        );
4233        exp_graph.insert(
4234            b"CATCG".to_vec(),
4235            Record::new(3, Some(Edges::from_string("..g.AC.."))),
4236        );
4237        exp_graph.insert(
4238            b"CCACC".to_vec(),
4239            Record::new(1, Some(Edges::from_string("......G."))),
4240        );
4241        exp_graph.insert(
4242            b"CCACG".to_vec(),
4243            Record::new(1, Some(Edges::from_string("..g...G."))),
4244        );
4245        exp_graph.insert(
4246            b"CGAAA".to_vec(),
4247            Record::new(1, Some(Edges::from_string("...t...T"))),
4248        );
4249        exp_graph.insert(
4250            b"GATGC".to_vec(),
4251            Record::new(3, Some(Edges::from_string(".c...CG."))),
4252        );
4253        exp_graph.insert(
4254            b"GCCAC".to_vec(),
4255            Record::new(1, Some(Edges::from_string("...t..G."))),
4256        );
4257        exp_graph.insert(
4258            b"TCGAA".to_vec(),
4259            Record::new(1, Some(Edges::from_string("a...A..."))),
4260        );
4261        exp_graph.insert(
4262            b"TCGCA".to_vec(),
4263            Record::new(2, Some(Edges::from_string("a......T"))),
4264        );
4265        exp_graph.insert(
4266            b"TGCCA".to_vec(),
4267            Record::new(1, Some(Edges::from_string("a....C.."))),
4268        );
4269
4270        let mut exp_links = Links::new();
4271        exp_links.insert(
4272            b"ATGCC".to_vec(),
4273            HashMap::from([(Link::from_junctions(false, b"CCA"), 1)]),
4274        );
4275        exp_links.insert(
4276            b"ATCGC".to_vec(),
4277            HashMap::from([
4278                (Link::from_junctions(false, b"GC"), 1),
4279                (Link::from_junctions(false, b"C"), 1),
4280            ]),
4281        );
4282        exp_links.insert(
4283            b"ATGCG".to_vec(),
4284            HashMap::from([
4285                (Link::from_junctions(false, b"CA"), 1),
4286                (Link::from_junctions(false, b"A"), 1),
4287            ]),
4288        );
4289        exp_links.insert(
4290            b"ATCGA".to_vec(),
4291            HashMap::from([(Link::from_junctions(false, b"GGC"), 1)]),
4292        );
4293
4294        for (kmer, record) in g.kmers {
4295            assert!(exp_graph.contains_key(kmer.as_bytes()));
4296
4297            let exp_record = exp_graph.get(kmer.as_bytes()).unwrap();
4298            assert!(record.coverage() == exp_record.coverage());
4299            assert!(record.incoming_edges() == exp_record.incoming_edges());
4300            assert!(record.outgoing_edges() == exp_record.outgoing_edges());
4301        }
4302
4303        for (kmer, link_map) in &g.links {
4304            assert!(exp_links.contains_key(kmer));
4305            assert!(link_map.len() == exp_links.get(kmer).unwrap().len());
4306            for (link, count) in link_map {
4307                assert!(exp_links.get(kmer).unwrap().contains_key(link));
4308                assert!(count == exp_links.get(kmer).unwrap().get(link).unwrap());
4309            }
4310        }
4311    }
4312
4313    #[test]
4314    fn test_assemble() {
4315        let fw_genome = get_test_genome();
4316        let rc_genome = fw_genome.reverse_complement();
4317
4318        let fwd_seqs = vec![fw_genome.clone()];
4319
4320        let g =
4321            LdBG::from_sequences(String::from("test"), 5, &fwd_seqs).build_links(&fwd_seqs, false);
4322
4323        // assembly outside cycle should recapitulate entire genome
4324        assert!(fw_genome == g.assemble(b"ACTGA"));
4325        assert!(fw_genome == g.assemble(b"TTCGA"));
4326
4327        assert!(fw_genome == g.assemble(b"TGCCA"));
4328        assert!(fw_genome == g.assemble(b"GGTGG"));
4329
4330        // assembly outside cycle should recapitulate entire genome
4331        assert!(rc_genome == g.assemble(b"ACTGA".to_vec().reverse_complement().as_bytes()));
4332        assert!(rc_genome == g.assemble(b"TTCGA".to_vec().reverse_complement().as_bytes()));
4333        assert!(rc_genome == g.assemble(b"TGCCA".to_vec().reverse_complement().as_bytes()));
4334        assert!(rc_genome == g.assemble(b"GGTGG".to_vec().reverse_complement().as_bytes()));
4335    }
4336
4337    #[test]
4338    fn test_assemble_until() {
4339        let fw_genome = get_test_genome();
4340        let fwd_seqs = vec![fw_genome.clone()];
4341
4342        let g =
4343            LdBG::from_sequences(String::from("test"), 5, &fwd_seqs).build_links(&fwd_seqs, false);
4344
4345        let mut contig1 = b"ACTGA".to_vec();
4346        let _ = g.assemble_forward_until(
4347            &mut contig1,
4348            b"ACTGA",
4349            &HashSet::from([b"TTCGA".to_vec()]),
4350            100,
4351        );
4352        assert_eq!(contig1, b"ACTGATTTCGA".to_vec());
4353
4354        let mut contig2 = b"GGTGG".to_vec();
4355        let _ = g.assemble_backward_until(
4356            &mut contig2,
4357            b"GGTGG",
4358            &HashSet::from([b"TGCCA".to_vec()]),
4359            100,
4360        );
4361        assert_eq!(contig2, b"TGCCACGGTGG".to_vec());
4362    }
4363
4364    #[test]
4365    fn test_correct_seq() {
4366        let genome = get_test_genome();
4367        let g = LdBG::from_sequence("test".to_string(), 5, &genome);
4368
4369        let mut uncorrected_seq_map = BTreeMap::new();
4370
4371        // Things that should get corrected successfully.
4372        uncorrected_seq_map.insert(
4373            b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec(),
4374            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4375        );
4376        uncorrected_seq_map.insert(
4377            b"ACTGAATTCGATGCGATGCGATGCCACGGTGG".to_vec(),
4378            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4379        );
4380        uncorrected_seq_map.insert(
4381            b"ACTGATATCGATGCGATGCGATGCCACGGTGG".to_vec(),
4382            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4383        );
4384        uncorrected_seq_map.insert(
4385            b"ACTGATTTCGATGCGATGCGATGCCATGGTGG".to_vec(),
4386            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4387        );
4388        uncorrected_seq_map.insert(
4389            b"ACTGATTTCGATGCGATGCGATGCCATCGGTGG".to_vec(),
4390            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4391        );
4392        uncorrected_seq_map.insert(
4393            b"ACTGATTTCGATGCGATGCGATGCCGGTGG".to_vec(),
4394            vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4395        );
4396
4397        let g1 = g.traverse_all_kmers();
4398
4399        for (uncorrected_seq, expected_seqs) in uncorrected_seq_map.iter() {
4400            let corrected_seqs = g.correct_seq(&g1, &uncorrected_seq);
4401
4402            assert_eq!(corrected_seqs.len(), expected_seqs.len());
4403            // TODO: Uncomment this when we have a way to compare sequences
4404            // for (corrected_seq, expected_seq) in corrected_seqs.iter().zip(expected_seqs) {
4405            // assert_eq!(*corrected_seq, *expected_seq);
4406            // }
4407        }
4408    }
4409
4410    proptest! {
4411        #[test]
4412        fn test_assemble_random_genomes(
4413            repeat_length in (3..18usize).prop_filter("Increment by 3", |v| v % 3 == 0),
4414            num_repeats in (2..=3usize),
4415            k in (11..=17usize).prop_filter("Must be odd", |v| v % 2 == 1)
4416        ) {
4417            let random_genome = generate_genome_with_tandem_repeats(500, repeat_length, num_repeats, 0);
4418
4419            let g = LdBG::from_sequences(String::from("test"), k, &vec!(random_genome.clone()))
4420                .build_links(&vec!(random_genome.clone()), false);
4421
4422            let hd_links = g.links.clone();
4423
4424            let (mc_contigs, mc_links) = assemble_with_mccortex(k, &random_genome, true, true);
4425            let mc_links: BTreeMap<Vec<u8>, BTreeMap<Link, u16>> = mc_links.into_iter()
4426                .map(|(k, v)| (k, v.into_iter().collect()))
4427                .collect();
4428
4429            let all_kmers: HashSet<_> = mc_links.keys().chain(hd_links.keys()).collect();
4430
4431            for kmer in all_kmers {
4432                let mc_links_map = mc_links.get(kmer);
4433                let hd_links_map = hd_links.get(kmer);
4434
4435                match (mc_links_map, hd_links_map) {
4436                    (Some(mc_map), Some(hd_map)) => {
4437                        assert_eq!(mc_map.len(), hd_map.len(), "Maps have different lengths for kmer: {}", String::from_utf8_lossy(kmer));
4438                        for mc_link in mc_map.keys() {
4439                            assert!(hd_map.contains_key(mc_link), "Link {} not in hd map for kmer: {}", mc_link, String::from_utf8_lossy(kmer));
4440                        }
4441                    }
4442                    (Some(mc_map), None) => {
4443                        for (link, count) in mc_map {
4444                            println!("mc only: {} {} {}", String::from_utf8_lossy(kmer), link, count);
4445                        }
4446                        panic!("There are links unique to reference implementation.");
4447                    }
4448                    (None, Some(hd_map)) => {
4449                        for (link, count) in hd_map {
4450                            println!("hd only: {} {} {}", String::from_utf8_lossy(kmer), link, count);
4451                        }
4452                        panic!("There are links unique to skydive implementation.");
4453                    }
4454                    (None, None) => {}
4455                }
4456            }
4457
4458            for (seed, mc_contig) in mc_contigs {
4459                let hd_contig = String::from_utf8(g.assemble(seed.as_bytes())).expect("Invalid UTF-8 sequence");
4460
4461                assert_eq!(mc_contig, hd_contig);
4462            }
4463        }
4464    }
4465
4466    // From "BubbleGun: enumerating bubbles and superbubbles in genome graphs", Dabbaghie et al. 2022
4467    // https://academic.oup.com/bioinformatics/article/38/17/4217/6633304?login=false
4468    // https://github.com/fawaz-dabbaghieh/bubble_gun/blob/master/example/paper_example2.gfa
4469    fn create_bubblegun_graph() -> petgraph::Graph<String, f32> {
4470        // Create a new directed graph
4471        let mut graph = petgraph::Graph::<String, f32>::new();
4472
4473        // Create a HashMap to store node indices
4474        let mut node_indices = std::collections::HashMap::new();
4475
4476        // Add nodes to the graph
4477        let gfa_nodes = vec![
4478            ("1", "CCCAACAAGTG"),
4479            ("7", "AACAAGTGTACTCATTG"),
4480            ("25", "ACTCATTGACG"),
4481            ("31", "CATTGACGTAAATTTGGTGCGGGCCTCAAGGTGTCCA"),
4482            ("89", "GGTGTCCATTGGGGTC"),
4483            ("105", "TTGGGGTCAGCACAAAT"),
4484            ("123", "GCACAAATTGCCA"),
4485            ("163", "CATTGACGAGGCACCC"),
4486            ("179", "AGGCACCCGTGCATTAG"),
4487            ("197", "TGCATTAGGCAGGGTGT"),
4488            ("215", "CAGGGTGTCCA"),
4489            ("237", "TTGGGGTCTGCACAAAT"),
4490            ("271", "AACAAGTGAACTCATTG"),
4491            ("311", "AGGCACCCCTGCATTAG"),
4492            ("427", "CATTGACGCAACCGGCATTGAATACACAGGGTGT"),
4493        ];
4494
4495        for (id, seq) in gfa_nodes {
4496            let node_index = graph.add_node(seq.to_string());
4497            node_indices.insert(id.to_string(), node_index);
4498        }
4499
4500        // Add edges to the graph
4501        let gfa_edges = vec![
4502            ("1", "7"),
4503            ("1", "271"),
4504            ("7", "25"),
4505            ("25", "31"),
4506            ("25", "427"),
4507            ("25", "163"),
4508            ("31", "89"),
4509            ("89", "237"),
4510            ("89", "105"),
4511            ("105", "123"),
4512            ("163", "179"),
4513            ("163", "311"),
4514            ("179", "197"),
4515            ("197", "215"),
4516            ("215", "89"),
4517            ("237", "123"),
4518            ("271", "25"),
4519            ("311", "197"),
4520            ("427", "215"),
4521        ];
4522
4523        for (from, to) in gfa_edges {
4524            if let (Some(&from_index), Some(&to_index)) =
4525                (node_indices.get(from), node_indices.get(to))
4526            {
4527                graph.add_edge(from_index, to_index, 1.0);
4528            }
4529        }
4530        graph
4531    }
4532
4533    #[test]
4534    fn test_find_all_superbubbles() {
4535        let graph = create_bubblegun_graph();
4536
4537        let expected_bubbles = HashMap::from([
4538            ((b"CCCAACAAGTG".to_vec(), b"ACTCATTGACG".to_vec()), 2usize),
4539            // ((b"ACTCATTGACG".to_vec(),      b"CCCAACAAGTG".to_vec()),      2usize),
4540            (
4541                (b"ACTCATTGACG".to_vec(), b"GGTGTCCATTGGGGTC".to_vec()),
4542                7usize,
4543            ),
4544            // ((b"GGTGTCCATTGGGGTC".to_vec(), b"ACTCATTGACG".to_vec()),      7usize),
4545            (
4546                (b"GGTGTCCATTGGGGTC".to_vec(), b"GCACAAATTGCCA".to_vec()),
4547                2usize,
4548            ),
4549            // ((b"GCACAAATTGCCA".to_vec(),    b"GGTGTCCATTGGGGTC".to_vec()), 2usize),
4550        ]);
4551
4552        let bubbles = find_all_superbubbles(&graph);
4553        assert_eq!(bubbles.len(), expected_bubbles.len());
4554
4555        for ((in_node, out_node), interior) in bubbles {
4556            let in_seq = graph.node_weight(in_node).unwrap().as_bytes().to_vec();
4557            let out_seq = graph.node_weight(out_node).unwrap().as_bytes().to_vec();
4558
4559            let key_fwd = (in_seq.clone(), out_seq.clone());
4560            let key_rev = (out_seq.clone(), in_seq.clone());
4561
4562            assert!(
4563                expected_bubbles.contains_key(&key_fwd) || expected_bubbles.contains_key(&key_rev)
4564            );
4565
4566            if expected_bubbles.contains_key(&key_fwd) {
4567                assert_eq!(expected_bubbles.get(&key_fwd).unwrap(), &interior.len());
4568            } else {
4569                assert_eq!(expected_bubbles.get(&key_rev).unwrap(), &interior.len());
4570            }
4571        }
4572    }
4573
4574    #[test]
4575    fn test_find_all_superbubbles_2() {
4576        let graph = crate::utils::read_gfa("/Users/kiran/repositories/hidive/test.gfa").unwrap();
4577
4578        let bubbles = find_all_superbubbles(&graph);
4579
4580        let start = b"GAGGGAACAGCGACTTC".to_vec();
4581        let end = b"TGGACACAGGCACCTGG".to_vec();
4582        for ((in_node, out_node), interior) in bubbles {
4583            let in_seq = graph.node_weight(in_node).unwrap().as_bytes().to_vec();
4584            let out_seq = graph.node_weight(out_node).unwrap().as_bytes().to_vec();
4585
4586            if in_seq == start && out_seq == end {
4587                println!("Found bubble: {}", interior.len());
4588
4589                for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
4590                    &graph,
4591                    in_node,
4592                    out_node,
4593                    0,
4594                    Some(interior.len() + 10),
4595                ) {
4596                    println!("out path: {}", path.len());
4597                }
4598
4599                for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
4600                    &graph,
4601                    out_node,
4602                    in_node,
4603                    0,
4604                    Some(interior.len() + 10),
4605                ) {
4606                    println!("in path:  {}", path.len());
4607
4608                    for node in path {
4609                        println!(" - {}", graph.node_weight(node).unwrap());
4610                    }
4611                }
4612            }
4613        }
4614    }
4615}