1use anyhow::Result;
2use itertools::Itertools;
3use linked_hash_map::LinkedHashMap;
4
5use std::collections::{BTreeMap, HashMap, HashSet};
6use std::fs::File;
7use std::io::Write;
8
9use parquet::data_type::AsBytes;
10
11use needletail::sequence::complement;
12use needletail::Sequence;
13use std::path::PathBuf;
14
15use petgraph::graph::{DiGraph, NodeIndex};
16use petgraph::visit::EdgeRef;
17
18use indicatif::{ParallelProgressIterator, ProgressIterator};
19use rayon::iter::IntoParallelRefIterator;
20use rayon::iter::ParallelIterator;
21
22use gbdt::decision_tree::Data;
23use gbdt::gradient_boost::GBDT;
24
25use bio::alignment::distance::levenshtein;
26
27use crate::edges::Edges;
28use crate::link::Link;
29use crate::record::Record;
30use crate::wmec::WMECData;
31
32type KmerGraph = HashMap<Vec<u8>, Record>;
33type KmerScores = HashMap<Vec<u8>, f32>;
34type Links = HashMap<Vec<u8>, HashMap<Link, u16>>;
35type Sources = HashMap<Vec<u8>, Vec<usize>>;
36type KmerNoise = HashSet<Vec<u8>>;
37
38#[derive(Debug, Clone)]
40pub struct LdBG {
41 pub name: String,
42 pub kmer_size: usize,
43 pub kmers: KmerGraph,
44 pub scores: KmerScores,
45 pub links: Links,
46 pub sources: Sources,
47 pub noise: KmerNoise,
48 pub verbose: bool,
49}
50
51impl LdBG {
52 #[must_use]
53 pub fn new(name: String, kmer_size: usize) -> Self {
54 LdBG {
55 name,
56 kmer_size,
57 kmers: KmerGraph::new(),
58 scores: KmerScores::new(),
59 links: Links::new(),
60 sources: Sources::new(),
61 noise: KmerNoise::new(),
62 verbose: false,
63 }
64 }
65
66 #[must_use]
82 pub fn from_file(name: String, kmer_size: usize, seq_path: &PathBuf) -> Self {
83 let reader = bio::io::fasta::Reader::from_file(seq_path).unwrap();
84 let all_reads: Vec<bio::io::fasta::Record> = reader.records().map(|r| r.unwrap()).collect();
85
86 let fwd_seqs: Vec<Vec<u8>> = all_reads
87 .iter()
88 .map(|r| r.seq().as_bytes().to_vec())
89 .collect();
90
91 let kmers = Self::build_graph(kmer_size, &fwd_seqs);
92 let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
93 let links: Links = Links::new();
94 let sources: Sources = Sources::new();
95 let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
96
97 LdBG {
98 name,
99 kmer_size,
100 kmers,
101 scores,
102 links,
103 sources,
104 noise,
105 verbose: false,
106 }
107 }
108
109 #[must_use]
125 pub fn from_files(name: String, kmer_size: usize, seq_paths: &Vec<PathBuf>) -> Self {
126 let fwd_seqs = seq_paths
127 .iter()
128 .map(|p| {
129 let reader = bio::io::fasta::Reader::from_file(p).expect("Failed to open file");
130 reader
131 .records()
132 .filter_map(|r| r.ok())
133 .map(|r| r.seq().to_vec())
134 .collect::<Vec<Vec<u8>>>()
135 })
136 .flatten()
137 .collect();
138
139 let kmers = Self::build_graph(kmer_size, &fwd_seqs);
140 let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
141 let links: Links = Links::new();
142 let sources: Sources = Sources::new();
143 let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
144
145 LdBG {
146 name,
147 kmer_size,
148 kmers,
149 scores,
150 links,
151 sources,
152 noise,
153 verbose: false,
154 }
155 }
156
157 #[must_use]
169 pub fn from_sequence(name: String, kmer_size: usize, fwd_seq: &Vec<u8>) -> Self {
170 let fwd_seqs = vec![fwd_seq.clone()];
171
172 let kmers = Self::build_graph(kmer_size, &fwd_seqs);
173 let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
174 let links: Links = Links::new();
175 let sources: Sources = Sources::new();
176 let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
177
178 LdBG {
179 name,
180 kmer_size,
181 kmers,
182 scores,
183 links,
184 sources,
185 noise,
186 verbose: false,
187 }
188 }
189
190 #[must_use]
202 pub fn from_sequences(name: String, kmer_size: usize, fwd_seqs: &Vec<Vec<u8>>) -> Self {
203 let kmers = Self::build_graph(kmer_size, fwd_seqs);
204 let scores: KmerScores = kmers.keys().map(|k| (k.clone(), 1.0)).collect();
205 let links: Links = Links::new();
206 let sources: Sources = Sources::new();
207 let noise: KmerNoise = Self::find_noise(&fwd_seqs, kmer_size);
208
209 LdBG {
210 name,
211 kmer_size,
212 kmers,
213 scores,
214 links,
215 sources,
216 noise,
217 verbose: false,
218 }
219 }
220
221 #[must_use]
227 pub fn name(&self) -> &String {
228 &self.name
229 }
230
231 #[must_use]
232 pub fn verbose(mut self, verbose: bool) -> Self {
233 self.verbose = verbose;
234 self
235 }
236
237 fn find_noise(fwd_seqs: &Vec<Vec<u8>>, kmer_size: usize) -> HashSet<Vec<u8>> {
238 let mut noise: KmerNoise = KmerNoise::new();
239
240 for seq in fwd_seqs {
241 let mut kmer_positions = HashMap::new();
242 for (i, fw_kmer) in seq.windows(kmer_size).enumerate() {
243 let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer);
244 kmer_positions
245 .entry(cn_kmer)
246 .or_insert_with(Vec::new)
247 .push(i);
248 }
249
250 for (cn_kmer, positions) in kmer_positions {
251 if positions.len() > 1 {
252 noise.insert(cn_kmer);
253 }
254 }
255
256 for range in sdust::symmetric_dust(seq) {
257 if range.len() >= 1 {
258 for fw_kmer in seq[range.start..range.end].windows(kmer_size) {
259 let cn_kmer = crate::utils::canonicalize_kmer(fw_kmer);
260
261 noise.insert(cn_kmer);
262 }
263 }
264 }
265 }
266
267 noise
268 }
269
270 fn build_graph(k: usize, fwd_seqs: &Vec<Vec<u8>>) -> KmerGraph {
281 let mut graph: KmerGraph = KmerGraph::new();
282
283 for fwd_seq in fwd_seqs {
285 if fwd_seq.len() < k + 1
286 || fwd_seq
287 .iter()
288 .any(|&b| !matches!(b, b'A' | b'C' | b'G' | b'T'))
289 {
290 continue;
291 }
292
293 for i in 0..fwd_seq.len() - k + 1 {
295 let fw_kmer = &fwd_seq[i..i + k];
296
297 let prev = fwd_seq.get(i.wrapping_sub(1));
298 let fw_prev_base = *prev.unwrap_or(&b'.');
299
300 let next = fwd_seq.get(i + k);
301 let fw_next_base = *next.unwrap_or(&b'.');
302
303 Self::add_record_to_graph(&mut graph, fw_kmer, fw_prev_base, fw_next_base);
304 }
305 }
306
307 graph
308 }
309
310 #[must_use]
321 pub fn build_links(mut self, fwd_seqs: &Vec<Vec<u8>>, correct: bool) -> Self {
322 let progress_bar = if self.verbose {
323 crate::utils::default_bounded_progress_bar("Building links", fwd_seqs.len() as u64)
324 } else {
325 crate::utils::default_hidden_progress_bar()
326 };
327
328 let g = self.traverse_all_kmers();
329
330 let links: Links = fwd_seqs
331 .par_iter()
332 .progress_with(progress_bar)
333 .map(|fwd_seq| {
334 if correct {
335 self.correct_seq(&g, fwd_seq)
336 } else {
337 fwd_seq.clone()
338 }
339 })
340 .map(|fwd_seq| {
341 let mut local_links = Links::new();
342
343 let fw_seq = fwd_seq.clone();
344 let rc_seq = fw_seq.reverse_complement();
345
346 LdBG::add_record_to_links(&mut local_links, &fw_seq, self.kmer_size, &self.kmers);
347 LdBG::add_record_to_links(&mut local_links, &rc_seq, self.kmer_size, &self.kmers);
348
349 local_links
350 })
351 .reduce(Links::new, |mut acc, local_links| {
352 for (k, v) in local_links {
353 acc.entry(k).or_default().extend(v);
354 }
355 acc
356 });
357
358 self.links = links;
359
360 self
361 }
362
363 fn add_record_to_graph(
372 graph: &mut KmerGraph,
373 fw_kmer: &[u8],
374 fw_prev_base: u8,
375 fw_next_base: u8,
376 ) {
377 let rc_kmer_vec = fw_kmer.reverse_complement();
378 let rc_kmer = rc_kmer_vec.as_bytes();
379
380 let (cn_kmer, can_prev_base, can_next_base, is_forward) = if fw_kmer < rc_kmer {
381 (fw_kmer, fw_prev_base, fw_next_base, true)
382 } else {
383 (
384 rc_kmer,
385 complement(fw_next_base),
386 complement(fw_prev_base),
387 false,
388 )
389 };
390
391 if !graph.contains_key(cn_kmer) {
393 graph.insert(cn_kmer.to_owned(), Record::new(0, None));
394 }
395
396 graph.get_mut(cn_kmer).unwrap().increment_coverage();
398
399 if is_forward {
401 graph.get_mut(cn_kmer).unwrap().increment_fw_coverage();
402 } else {
403 graph.get_mut(cn_kmer).unwrap().increment_rc_coverage();
404 }
405
406 graph
408 .get_mut(cn_kmer)
409 .unwrap()
410 .set_incoming_edge(can_prev_base);
411
412 graph
414 .get_mut(cn_kmer)
415 .unwrap()
416 .set_outgoing_edge(can_next_base);
417 }
418
419 fn add_record_to_links(links: &mut Links, seq: &[u8], k: usize, graph: &KmerGraph) {
430 if seq.len() < k + 1 {
431 return;
432 }
433
434 let range = (0..seq.len() - k + 1).collect::<Vec<_>>();
435
436 for i in range {
438 let fw_kmer = &seq[i..i + k];
439
440 if LdBG::has_junction(graph, fw_kmer, true) {
441 if let Some((anchor_kmer_vec, index)) =
442 Self::find_anchor_kmer(i, seq, k, graph, true)
443 {
444 let anchor_kmer = anchor_kmer_vec.as_bytes();
445
446 let cn_anchor_kmer_vec = crate::utils::canonicalize_kmer(anchor_kmer);
447 let cn_anchor_kmer = cn_anchor_kmer_vec.as_bytes();
448
449 let mut link = Link::new(anchor_kmer == cn_anchor_kmer);
451
452 let sub_range = (index..seq.len() - k).collect::<Vec<_>>();
453
454 for j in sub_range {
455 let next_kmer = &seq[j..j + k];
456
457 let has_junction = LdBG::has_junction(graph, next_kmer, false);
458 if has_junction {
459 let choice = seq[j + k];
460 link.push_back(choice);
461 }
462 }
463
464 if !link.junctions.is_empty() {
465 if !links.contains_key(cn_anchor_kmer) {
467 links.insert(cn_anchor_kmer.to_owned(), HashMap::new());
468 }
469
470 if !links.get(cn_anchor_kmer).unwrap().contains_key(&link) {
471 links.get_mut(cn_anchor_kmer).unwrap().insert(link, 1);
472 } else {
473 let linkcov =
474 *links.get_mut(cn_anchor_kmer).unwrap().get(&link).unwrap();
475
476 links
477 .get_mut(cn_anchor_kmer)
478 .unwrap()
479 .insert(link, linkcov.saturating_add(1));
480 }
481 }
482 }
483 }
484 }
485 }
486
487 fn find_anchor_kmer(
501 index: usize,
502 seq: &[u8],
503 k: usize,
504 graph: &KmerGraph,
505 reverse: bool,
506 ) -> Option<(Vec<u8>, usize)> {
507 if index > 0 {
508 let mut index = index - 1;
509 loop {
510 if index == 0 {
511 break;
512 }
513
514 let anchor_kmer = &seq[index..index + k];
515
516 if !LdBG::has_junction(graph, anchor_kmer, !reverse)
517 || LdBG::has_junction(graph, anchor_kmer, reverse)
518 {
519 return Some((anchor_kmer.to_vec(), index));
520 }
521
522 index -= 1;
523 }
524 }
525
526 None
527 }
528
529 fn has_junction(graph: &KmerGraph, kmer: &[u8], reverse: bool) -> bool {
541 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
542
543 if let Some(r) = graph.get(&cn_kmer) {
544 let is_canonical = String::from_utf8_lossy(kmer) == String::from_utf8_lossy(&cn_kmer);
545 let (in_degree, out_degree) = (r.in_degree(), r.out_degree());
546
547 return if is_canonical {
548 if !reverse {
549 out_degree > 1
550 } else {
551 in_degree > 1
552 }
553 } else if !reverse {
554 in_degree > 1
555 } else {
556 out_degree > 1
557 };
558 }
559
560 false
561 }
562
563 fn next_kmers(&self, kmer: &[u8]) -> Vec<Vec<u8>> {
564 let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
565 let cn_kmer = cn_kmer_vec.as_bytes();
566
567 let ru = self.kmers.get(cn_kmer);
568 if ru.is_none() {
569 return vec![];
570 }
571
572 let r = ru.unwrap();
573
574 let next_kmers = if cn_kmer == kmer {
575 r.outgoing_edges()
576 .iter()
577 .map(|&e| {
578 let mut next_kmer = kmer[1..].to_owned();
579 next_kmer.push(e);
580 next_kmer
581 })
582 .collect::<Vec<Vec<u8>>>()
583 } else {
584 r.incoming_edges()
585 .iter()
586 .map(|&e| {
587 let mut next_kmer = kmer[1..].to_owned();
588 next_kmer.push(complement(e));
589 next_kmer
590 })
591 .collect::<Vec<Vec<u8>>>()
592 };
593
594 next_kmers
595 }
596
597 fn prev_kmers(&self, kmer: &[u8]) -> Vec<Vec<u8>> {
598 let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
599 let cn_kmer = cn_kmer_vec.as_bytes();
600
601 let ru = self.kmers.get(cn_kmer);
602 if ru.is_none() {
603 return vec![];
604 }
605
606 let r = ru.unwrap();
607
608 let prev_kmers = if cn_kmer == kmer {
609 r.incoming_edges()
610 .iter()
611 .map(|&e| {
612 let mut prev_kmer = kmer[0..kmer.len() - 1].to_owned();
613 prev_kmer.insert(0, e);
614 prev_kmer
615 })
616 .collect::<Vec<Vec<u8>>>()
617 } else {
618 r.outgoing_edges()
619 .iter()
620 .map(|&e| {
621 let mut prev_kmer = kmer[0..kmer.len() - 1].to_owned();
622 prev_kmer.insert(0, complement(e));
623 prev_kmer
624 })
625 .collect::<Vec<Vec<u8>>>()
626 };
627
628 prev_kmers
629 }
630
631 fn next_kmer(&self, kmer: &[u8], links_in_scope: &mut Vec<Link>) -> Option<Vec<u8>> {
642 let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
643 let cn_kmer = cn_kmer_vec.as_bytes();
644
645 let ru = self.kmers.get(cn_kmer);
646 let r = ru?;
647
648 let next_base = if cn_kmer == kmer {
649 match r.out_degree() {
650 0 => return None,
651 1 => r.outgoing_edges()[0],
652 _ => {
653 links_in_scope.retain(|link| {
654 if let Some(&next_char) = link.front() {
655 r.outgoing_edges().contains(&next_char)
656 } else {
657 false
658 }
659 });
660
661 let consensus_junction_choice = *links_in_scope.first()?.front().unwrap();
662
663 if r.outgoing_edges().contains(&consensus_junction_choice) {
664 for link in links_in_scope.iter_mut() {
665 link.pop_front();
666 }
667 links_in_scope.retain(|link| !link.is_empty());
668 consensus_junction_choice
669 } else {
670 return None;
671 }
672 }
673 }
674 } else {
675 match r.in_degree() {
676 0 => return None,
677 1 => complement(r.incoming_edges()[0]),
678 _ => {
679 links_in_scope.retain(|link| {
680 if let Some(&next_char) = link.front() {
681 r.incoming_edges().contains(&complement(next_char))
682 } else {
683 false
684 }
685 });
686
687 let consensus_junction_choice = *links_in_scope.first()?.front().unwrap();
688
689 if r.incoming_edges()
690 .contains(&complement(consensus_junction_choice))
691 {
692 for link in links_in_scope.iter_mut() {
693 link.pop_front();
694 }
695 links_in_scope.retain(|link| !link.is_empty());
696 consensus_junction_choice
697 } else {
698 return None;
699 }
700 }
701 }
702 };
703
704 let next_kmer = [&kmer[1..kmer.len()], &[next_base]].concat();
705
706 Some(next_kmer)
707 }
708
709 fn prev_kmer(&self, kmer: &[u8], links_in_scope: &mut Vec<Link>) -> Option<Vec<u8>> {
720 let cn_kmer_vec = crate::utils::canonicalize_kmer(kmer).clone();
721 let cn_kmer = cn_kmer_vec.as_bytes();
722
723 let ru = self.kmers.get(cn_kmer);
724 let r = ru?;
725
726 let prev_base = if cn_kmer == kmer {
727 match r.in_degree() {
728 0 => return None,
729 1 => r.incoming_edges()[0],
730 _ => {
731 links_in_scope.retain(|link| {
732 if let Some(&next_char) = link.front() {
733 r.incoming_edges().contains(&next_char)
734 } else {
735 false
736 }
737 });
738
739 let consensus_junction_choice = match links_in_scope.first()?.front() {
740 Some(choice) => *choice,
741 None => return None,
742 };
743
744 if r.incoming_edges().contains(&consensus_junction_choice) {
745 for link in links_in_scope.iter_mut() {
746 link.pop_front();
747 }
748 links_in_scope.retain(|link| !link.is_empty());
749 consensus_junction_choice
750 } else {
751 return None;
752 }
753 }
754 }
755 } else {
756 match r.out_degree() {
757 0 => return None,
758 1 => complement(r.outgoing_edges()[0]),
759 _ => {
760 links_in_scope.retain(|link| {
761 if let Some(&next_char) = link.front() {
762 r.outgoing_edges().contains(&complement(next_char))
763 } else {
764 false
765 }
766 });
767
768 let consensus_junction_choice = match links_in_scope.first()?.front() {
769 Some(choice) => *choice,
770 None => return None,
771 };
772
773 if r.outgoing_edges()
774 .contains(&complement(consensus_junction_choice))
775 {
776 for link in links_in_scope.iter_mut() {
777 link.pop_front();
778 }
779 links_in_scope.retain(|link| !link.is_empty());
780 consensus_junction_choice
781 } else {
782 return None;
783 }
784 }
785 }
786 };
787
788 let prev_kmer = [&[prev_base], &kmer[0..kmer.len() - 1]].concat();
789
790 Some(prev_kmer)
791 }
792
793 pub fn remove(&mut self, kmer: &[u8]) -> Option<Record> {
794 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
795 self.kmers.remove(&cn_kmer)
796 }
797
798 #[must_use]
812 pub fn score_kmers(mut self, model_path: &PathBuf) -> Self {
813 let gbdt = GBDT::load_model(model_path.to_str().unwrap()).unwrap();
814
815 self.scores = self
816 .kmers
817 .keys()
818 .map(|cn_kmer| {
819 let lcov = self.kmers.get(cn_kmer).unwrap().coverage();
820 let scov: u16 = 0;
821 let compressed_len = crate::utils::homopolymer_compressed(cn_kmer).len();
822
823 let data = Data::new_test_data(
824 vec![
825 lcov as f32,
826 scov as f32,
827 (cn_kmer.len() - compressed_len) as f32,
828 ],
829 Some(0.0),
830 );
831 let prediction = *gbdt.predict(&vec![data]).first().unwrap_or(&0.0);
832
833 (cn_kmer.clone(), prediction)
834 })
835 .collect();
836
837 self
838 }
839
840 pub fn infer_edges(&mut self) {
841 let mut kmers = KmerGraph::new();
842
843 self.kmers.iter().for_each(|(cn_kmer, record)| {
844 let mut new_record = Record::new(record.coverage(), Some(Edges::empty()));
845
846 for edge_base in vec![b'A', b'C', b'G', b'T'] {
847 let next_kmer = [&cn_kmer[1..], &[edge_base]].concat();
848 let next_cn_kmer = crate::utils::canonicalize_kmer(&next_kmer);
849
850 if self.kmers.contains_key(&next_cn_kmer) {
851 new_record.set_outgoing_edge(edge_base);
852 }
853
854 let prev_kmer = [&[edge_base], &cn_kmer[0..cn_kmer.len() - 1]].concat();
855 let prev_cn_kmer = crate::utils::canonicalize_kmer(&prev_kmer);
856
857 if self.kmers.contains_key(&prev_cn_kmer) {
858 new_record.set_incoming_edge(edge_base);
859 }
860 }
861
862 kmers.insert(cn_kmer.clone(), new_record);
863 });
864
865 self.kmers = kmers;
866 }
867
868 #[must_use]
869 pub fn correct_seqs(&self, seqs: &Vec<Vec<u8>>) -> Vec<Vec<u8>> {
870 let progress_bar = if self.verbose {
871 crate::utils::default_bounded_progress_bar("Correcting reads", seqs.len() as u64)
872 } else {
873 crate::utils::default_hidden_progress_bar()
874 };
875
876 let g = self.traverse_all_kmers();
877
878 seqs.par_iter()
879 .progress_with(progress_bar)
880 .map(|seq| self.correct_seq(&g, seq))
881 .collect::<Vec<Vec<u8>>>()
882 }
883
884 #[must_use]
885 pub fn correct_seq(&self, g: &DiGraph<String, f32>, seq: &[u8]) -> Vec<u8> {
886 let mut g_seq = g.clone();
887
888 let a_kmer_set = seq
889 .windows(self.kmer_size)
890 .map(|kmer| kmer.to_vec())
891 .collect::<HashSet<_>>();
892
893 let count = seq
894 .windows(self.kmer_size)
895 .filter_map(|fw_kmer| {
896 g.node_indices()
897 .find(|&n| *g[n] == String::from_utf8_lossy(fw_kmer).to_string())
898 })
899 .count();
900
901 if count == 0 {
902 return seq.to_vec();
903 }
904
905 for i in 0..seq.len() - self.kmer_size {
907 let current_kmer = &seq[i..i + self.kmer_size];
908 let next_kmer = &seq[i + 1..i + 1 + self.kmer_size];
909
910 let current_node = g_seq
911 .node_indices()
912 .find(|&n| *g_seq[n] == String::from_utf8_lossy(current_kmer).to_string())
913 .unwrap_or_else(|| {
914 g_seq.add_node(String::from_utf8_lossy(current_kmer).to_string())
915 });
916 let next_node = g_seq
917 .node_indices()
918 .find(|&n| *g_seq[n] == String::from_utf8_lossy(next_kmer).to_string())
919 .unwrap_or_else(|| g_seq.add_node(String::from_utf8_lossy(next_kmer).to_string()));
920
921 if !g_seq.contains_edge(current_node, next_node) {
922 g_seq.add_edge(current_node, next_node, 1.0);
923 }
924 }
925
926 let sb = find_all_superbubbles(&g_seq);
927
928 let mut replacements = Vec::new();
929
930 for ((in_node, out_node), interior) in sb {
931 let kmer_in = g_seq[in_node].as_bytes();
932 let kmer_out = g_seq[out_node].as_bytes();
933
934 if a_kmer_set.contains(&kmer_in.to_vec()) && a_kmer_set.contains(&kmer_out.to_vec()) {
942 let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
945 &g_seq,
946 in_node,
947 out_node,
948 0,
949 Some(interior.len()),
950 );
951 let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
952 &g_seq,
953 out_node,
954 in_node,
955 0,
956 Some(interior.len()),
957 );
958
959 let mut paths = paths_fwd
960 .chain(paths_rev)
961 .map(|path| {
962 let score = path
963 .iter()
964 .map(|node| {
965 let kmer = g_seq.node_weight(*node).unwrap().as_bytes();
966 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
967 self.scores.get(&cn_kmer).unwrap_or(&0.0)
968 })
969 .fold(1.0, |acc, x| acc * x)
970 .powf(1.0 / path.len() as f32);
971
972 let seen_kmer_count = path
973 .iter()
974 .filter(|node| {
975 let kmer = g_seq.node_weight(**node).unwrap().as_bytes();
976 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
977 a_kmer_set.contains(&cn_kmer.to_vec())
978 })
979 .count();
980
981 (path, score, seen_kmer_count)
982 })
983 .collect::<Vec<(Vec<NodeIndex>, f32, usize)>>();
984
985 paths.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
986
987 for (path, score, seen_kmer_count) in &paths {
988 let mut contig = String::new();
989
990 for node in path {
991 if contig.is_empty() {
992 contig.push_str(&String::from_utf8_lossy(g_seq[*node].as_bytes()));
993 } else {
994 contig.push_str(&String::from_utf8_lossy(
995 &g_seq[*node].as_bytes()[self.kmer_size - 1..],
996 ));
997 }
998 }
999
1000 }
1002
1003 let read_path = paths.first().cloned().unwrap();
1004 let read_kmers = read_path
1005 .0
1006 .iter()
1007 .map(|node| g_seq[*node].as_bytes().to_vec())
1008 .collect::<Vec<_>>();
1009
1010 if paths.len() > 1 && read_path.1 < 0.1 {
1025 let replacement_path = paths.get(1).cloned().unwrap();
1026 let replacement_kmers = replacement_path
1027 .0
1028 .iter()
1029 .map(|node| g_seq[*node].as_bytes().to_vec())
1030 .collect::<Vec<_>>();
1031
1032 replacements.push((read_kmers, replacement_kmers));
1033
1034 }
1036 }
1037 }
1038
1039 let mut b = seq
1040 .windows(self.kmer_size)
1041 .map(|kmer| kmer.to_vec())
1042 .collect::<Vec<_>>();
1043
1044 for (read_path, replacement_path) in replacements {
1045 let start_pos = b.iter().position(|kmer| kmer == &read_path[0]);
1046 let end_pos = b
1047 .iter()
1048 .position(|kmer| kmer == &read_path[read_path.len() - 1]);
1049
1050 if let (Some(start_pos), Some(end_pos)) = (start_pos, end_pos) {
1051 let mut q1 = String::new();
1052 for kmer in read_path.iter() {
1053 if q1.is_empty() {
1054 q1.push_str(&String::from_utf8_lossy(kmer));
1055 } else {
1056 q1.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1057 }
1058 }
1059
1060 let mut r1 = String::new();
1061 for kmer in replacement_path.iter() {
1062 if r1.is_empty() {
1063 r1.push_str(&String::from_utf8_lossy(kmer));
1064 } else {
1065 r1.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1066 }
1067 }
1068
1069 if start_pos <= end_pos {
1072 b.splice(start_pos..=end_pos, replacement_path);
1073 }
1074 }
1075 }
1076
1077 let mut c = String::new();
1078
1079 for kmer in b {
1080 if c.is_empty() {
1081 c.push_str(&String::from_utf8_lossy(&kmer));
1082 } else {
1083 c.push_str(&String::from_utf8_lossy(&kmer[self.kmer_size - 1..]));
1084 }
1085 }
1086
1087 c.as_bytes().to_vec()
1116 }
1117
1118 #[must_use]
1119 pub fn correct_seq_old(&self, seq: &[u8]) -> Vec<Vec<u8>> {
1120 let (mut graph, node_indices) = self.initialize_read_graph(seq);
1121
1122 for window in node_indices.windows(2) {
1123 let (from, to) = (window[0], window[1]);
1124
1125 let from_kmer = graph.node_weight(from).unwrap().as_bytes();
1126 let to_kmer = graph.node_weight(to).unwrap().as_bytes();
1127
1128 let from_cn_kmer = crate::utils::canonicalize_kmer(from_kmer);
1129 let to_cn_kmer = crate::utils::canonicalize_kmer(to_kmer);
1130
1131 let from_next_kmers = self.next_kmers(&from_kmer);
1132 let to_prev_kmers = self.prev_kmers(&to_kmer);
1133
1134 if self.kmers.contains_key(&from_cn_kmer)
1135 && self.kmers.contains_key(&to_cn_kmer)
1136 && from_next_kmers.contains(&to_kmer.to_vec())
1137 && to_prev_kmers.contains(&from_kmer.to_vec())
1138 {
1139 graph.add_edge(from, to, 1.0);
1140 } else {
1141 if let Ok((contig, forward)) = self.try_simple_paths(from_kmer, to_kmer) {
1142 self.add_to_read_graph(contig, &mut graph, forward);
1143 } else if let Ok((contig, forward)) = self.try_alternate_paths(&graph, from, to, 5)
1144 {
1145 self.add_to_read_graph(contig, &mut graph, forward);
1146 } else if let Ok((contig, forward)) =
1147 self.try_overlapping_paths(&graph, from, to, 5)
1148 {
1149 self.add_to_read_graph(contig, &mut graph, forward);
1150 } else if let Ok((contig, forward)) =
1151 self.try_exploratory_paths(&graph, from, to, seq, 5, 100)
1152 {
1153 self.add_to_read_graph(contig, &mut graph, forward);
1154 }
1155 }
1156 }
1157
1158 let initial_segments = traverse_read_graph(&graph, 0);
1159
1160 if let Ok(connecting_segments) = self.try_connecting_all_segments(&initial_segments) {
1161 for (contig, forward) in connecting_segments.into_iter() {
1162 self.add_to_read_graph(contig, &mut graph, forward);
1163 }
1164 }
1165
1166 let corrected_segments = traverse_read_graph(&graph, self.kmer_size);
1175
1176 if corrected_segments.len() > 1 {
1177 vec![self.combine_read_segments(seq, &corrected_segments)]
1178 } else {
1179 corrected_segments
1180 }
1181 }
1182
1183 fn combine_read_segments(&self, seq: &[u8], corrected_segments: &Vec<Vec<u8>>) -> Vec<u8> {
1184 use spoa::AlignmentType;
1185 let mut la = spoa::AlignmentEngine::new(AlignmentType::kSW, 5, -10, -16, -12, -20, -8);
1186
1187 let mut sg = spoa::Graph::new();
1188 for lr_seq in vec![seq.to_vec()].iter().chain(corrected_segments.iter()) {
1189 let seq_cstr = std::ffi::CString::new(lr_seq.clone()).unwrap();
1190 let seq_qual = std::ffi::CString::new(vec![b'I'; lr_seq.len()]).unwrap();
1191 let a = la.align(seq_cstr.as_ref(), &sg);
1192
1193 sg.add_alignment(&a, seq_cstr.as_ref(), seq_qual.as_ref());
1194 }
1195
1196 let msa_cstrs = sg.multiple_sequence_alignment(false);
1197 let msa_strings = msa_cstrs
1198 .iter()
1199 .map(|cstr| cstr.to_str().unwrap().to_string())
1200 .map(|msa_string| {
1201 let leading_dashes = msa_string.chars().take_while(|&c| c == '-').count();
1202 let trailing_dashes = msa_string.chars().rev().take_while(|&c| c == '-').count();
1203 let middle = &msa_string[leading_dashes..msa_string.len() - trailing_dashes];
1204 format!(
1205 "{}{}{}",
1206 " ".repeat(leading_dashes),
1207 middle,
1208 " ".repeat(trailing_dashes)
1209 )
1210 })
1211 .collect::<Vec<String>>();
1212
1213 let mut combined_seq = vec![];
1214 for column in 0..msa_strings[0].len() {
1215 let mut pileup = vec![];
1216 for msa_string in msa_strings.iter().skip(1).chain(msa_strings.iter().take(1)) {
1217 let char_at_column = msa_string.chars().nth(column).unwrap();
1218
1219 if char_at_column != ' ' {
1220 pileup.push(char_at_column);
1221 }
1222 }
1223
1224 combined_seq.push(pileup[0] as u8);
1225 }
1226
1227 combined_seq.retain(|&x| x != b'-');
1228
1229 combined_seq
1230 }
1231
1232 fn remove_short_branches(&self, graph: &mut petgraph::Graph<String, f64>, min_size: usize) {
1233 let mut nodes_to_remove = HashSet::new();
1235 for node in graph.node_indices() {
1236 let out_degree = graph.edges_directed(node, petgraph::Outgoing).count();
1237 if out_degree > 1 {
1238 for edge in graph.edges_directed(node, petgraph::Outgoing) {
1239 let mut branch = vec![edge.target()];
1240 let mut current = edge.target();
1241 let mut branch_length = 0;
1242
1243 while branch_length < 2 * self.kmer_size {
1244 let out_edges: Vec<_> =
1245 graph.edges_directed(current, petgraph::Outgoing).collect();
1246 if out_edges.len() != 1 {
1247 break;
1248 }
1249 current = out_edges[0].target();
1250 branch.push(current);
1251 branch_length += 1;
1252 }
1253
1254 if branch_length < min_size {
1255 nodes_to_remove.extend(branch);
1256 }
1257 }
1258 }
1259 }
1260
1261 for node in nodes_to_remove {
1263 graph.remove_node(node);
1264 }
1265 }
1266
1267 fn try_simple_paths(&self, from_kmer: &[u8], to_kmer: &[u8]) -> Result<(Vec<u8>, bool)> {
1268 let mut forward_contig = from_kmer.to_vec();
1269 let mut backward_contig = to_kmer.to_vec();
1270
1271 if let Ok(_) = self.assemble_forward_until(
1272 &mut forward_contig,
1273 &from_kmer,
1274 &HashSet::from([to_kmer.to_vec()]),
1275 100,
1276 ) {
1277 return Ok((forward_contig, true));
1278 } else if let Ok(_) = self.assemble_backward_until(
1279 &mut backward_contig,
1280 &to_kmer,
1281 &HashSet::from([from_kmer.to_vec()]),
1282 100,
1283 ) {
1284 return Ok((backward_contig, false));
1285 }
1286
1287 Err(anyhow::anyhow!("No simple path found"))
1288 }
1289
1290 fn try_alternate_paths(
1291 &self,
1292 graph: &petgraph::Graph<String, f64>,
1293 from: NodeIndex,
1294 to: NodeIndex,
1295 window: u8,
1296 ) -> Result<(Vec<u8>, bool)> {
1297 for scan_left in 0..=window {
1298 for scan_right in 0..=window {
1299 if let (Some(new_from), Some(new_to)) = (
1300 navigate_backward(from, scan_left, graph),
1301 navigate_forward(to, scan_right, graph),
1302 ) {
1303 let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1304 let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1305
1306 let mut forward_contig = new_from_kmer.to_vec();
1307 let mut backward_contig = new_to_kmer.to_vec();
1308
1309 if let Ok(_) = self.assemble_forward_until(
1310 &mut forward_contig,
1311 &new_from_kmer,
1312 &HashSet::from([new_to_kmer.to_vec()]),
1313 100,
1314 ) {
1315 return Ok((forward_contig, true));
1316 } else if let Ok(_) = self.assemble_backward_until(
1317 &mut backward_contig,
1318 &new_to_kmer,
1319 &HashSet::from([new_from_kmer.to_vec()]),
1320 100,
1321 ) {
1322 return Ok((backward_contig, false));
1323 }
1324 }
1325 }
1326 }
1327
1328 return Err(anyhow::anyhow!("No alternate path found"));
1329 }
1330
1331 fn try_overlapping_paths(
1332 &self,
1333 graph: &petgraph::Graph<String, f64>,
1334 from: NodeIndex,
1335 to: NodeIndex,
1336 window: u8,
1337 ) -> Result<(Vec<u8>, bool)> {
1338 for scan_left in 0..=window {
1339 for scan_right in 0..=window {
1340 if let (Some(new_from), Some(new_to)) = (
1341 navigate_backward(from, scan_left, graph),
1342 navigate_forward(to, scan_right, graph),
1343 ) {
1344 let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1345 let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1346
1347 let mut forward_contig = new_from_kmer.to_vec();
1348 let mut backward_contig = new_to_kmer.to_vec();
1349
1350 let seqs = vec![forward_contig.clone(), backward_contig.clone()];
1351 let l = LdBG::from_sequences("contig".to_string(), self.kmer_size, &seqs)
1352 .build_links(&seqs, false);
1353
1354 if let Ok(_) = l.assemble_forward_until(
1355 &mut forward_contig,
1356 &new_from_kmer,
1357 &HashSet::from([new_to_kmer.to_vec()]),
1358 100,
1359 ) {
1360 return Ok((forward_contig, true));
1361 } else if let Ok(_) = l.assemble_backward_until(
1362 &mut backward_contig,
1363 &new_to_kmer,
1364 &HashSet::from([new_from_kmer.to_vec()]),
1365 100,
1366 ) {
1367 return Ok((backward_contig, false));
1368 }
1369 }
1370 }
1371 }
1372
1373 return Err(anyhow::anyhow!("No overlapping path found"));
1374 }
1375
1376 fn try_exploratory_paths(
1377 &self,
1378 graph: &petgraph::Graph<String, f64>,
1379 from: NodeIndex,
1380 to: NodeIndex,
1381 seq: &[u8],
1382 window: u8,
1383 max_intermediate_nodes: usize,
1384 ) -> Result<(Vec<u8>, bool)> {
1385 for scan_left in 0..=window {
1386 for scan_right in 0..=window {
1387 if let (Some(new_from), Some(new_to)) = (
1388 navigate_backward(from, scan_left, graph),
1389 navigate_forward(to, scan_right, graph),
1390 ) {
1391 let new_from_kmer = graph.node_weight(new_from).unwrap().as_bytes();
1392 let new_to_kmer = graph.node_weight(new_to).unwrap().as_bytes();
1393
1394 if let Some(substring) =
1395 self.find_read_substring(seq, new_from_kmer, new_to_kmer)
1396 {
1397 let mut subgraph = DiGraph::new();
1398 let mut visited = HashMap::<String, NodeIndex>::new();
1399
1400 let start_label = String::from_utf8_lossy(&new_from_kmer).to_string();
1401 let start_node = subgraph.add_node(start_label.clone());
1402 visited.insert(start_label.clone(), start_node);
1403
1404 if let Ok(_) = self.traverse_forward_until(
1405 &mut subgraph,
1406 &mut visited,
1407 start_node,
1408 new_to_kmer,
1409 ) {
1410 let start_label = String::from_utf8_lossy(&new_from_kmer).to_string();
1411 let end_label = String::from_utf8_lossy(&new_to_kmer).to_string();
1412
1413 let start_node = subgraph
1415 .node_indices()
1416 .find(|&n| subgraph[n] == start_label);
1417 let end_node =
1418 subgraph.node_indices().find(|&n| subgraph[n] == end_label);
1419
1420 if let (Some(start), Some(end)) = (start_node, end_node) {
1421 let mut best_contig = String::new();
1422 let mut min_ldist = u32::MAX;
1423
1424 for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
1426 &subgraph,
1427 start,
1428 end,
1429 0,
1430 Some(max_intermediate_nodes),
1431 ) {
1432 let path_labels: Vec<&String> =
1434 path.iter().map(|&n| &subgraph[n]).collect();
1435
1436 let mut new_contig = String::new();
1438 for kmer in path_labels.iter() {
1439 if new_contig.is_empty() {
1440 new_contig.push_str(kmer);
1441 } else {
1442 new_contig.push_str(
1443 &kmer.chars().last().unwrap().to_string(),
1444 );
1445 }
1446 }
1447
1448 let ldist =
1449 levenshtein(&new_contig.as_bytes().to_vec(), &substring);
1450 if ldist < min_ldist {
1451 min_ldist = ldist;
1452 best_contig = new_contig;
1453 }
1454 }
1455
1456 if min_ldist < (substring.len() / 10) as u32 {
1457 return Ok((best_contig.as_bytes().to_vec(), true));
1458 }
1459 }
1460 }
1461 }
1462 }
1463 }
1464 }
1465
1466 return Err(anyhow::anyhow!("No alternate path found"));
1467 }
1468
1469 fn try_connecting_all_segments(&self, segments: &[Vec<u8>]) -> Result<Vec<(Vec<u8>, bool)>> {
1470 let mut connecting_segments = Vec::new();
1471
1472 for segment1 in segments.iter() {
1473 for segment2 in segments.iter() {
1474 if segment1 == segment2 {
1475 continue;
1476 }
1477
1478 if let Ok(result) = self.try_connecting_segments(segment1, segment2) {
1479 connecting_segments.push(result);
1480 }
1481 }
1482 }
1483
1484 if connecting_segments.len() == 0 {
1485 return Err(anyhow::anyhow!("No connecting path found"));
1486 }
1487
1488 Ok(connecting_segments)
1489 }
1490
1491 fn try_connecting_segments(&self, segment1: &[u8], segment2: &[u8]) -> Result<(Vec<u8>, bool)> {
1492 let start_kmers = segment1
1493 .windows(self.kmer_size)
1494 .skip(segment1.len().saturating_sub(2 * self.kmer_size))
1495 .map(|kmer| kmer.to_vec())
1496 .collect::<HashSet<Vec<u8>>>();
1497 let end_kmers = segment2
1498 .windows(self.kmer_size)
1499 .take(10)
1500 .map(|kmer| kmer.to_vec())
1501 .collect::<HashSet<Vec<u8>>>();
1502
1503 for start_kmer in start_kmers.iter() {
1504 let mut forward_contig = start_kmer.clone();
1505 if let Ok(_) =
1506 self.assemble_forward_until(&mut forward_contig, &start_kmer, &end_kmers, 100)
1507 {
1508 return Ok((forward_contig, true));
1509 }
1510 }
1511
1512 for end_kmer in end_kmers.iter() {
1513 let mut backward_contig = end_kmer.clone();
1514 if let Ok(_) =
1515 self.assemble_backward_until(&mut backward_contig, &end_kmer, &start_kmers, 100)
1516 {
1517 return Ok((backward_contig, false));
1518 }
1519 }
1520
1521 return Err(anyhow::anyhow!("No connecting path found"));
1522 }
1523
1524 fn find_read_substring(&self, seq: &[u8], from_kmer: &[u8], to_kmer: &[u8]) -> Option<Vec<u8>> {
1525 seq.windows(from_kmer.len())
1526 .position(|window| window == from_kmer)
1527 .and_then(|start_pos| {
1528 seq[start_pos + from_kmer.len()..]
1529 .windows(to_kmer.len())
1530 .position(|window| window == to_kmer)
1531 .map(|end_pos| {
1532 seq[start_pos..start_pos + from_kmer.len() + end_pos + to_kmer.len()]
1533 .to_vec()
1534 })
1535 })
1536 }
1537
1538 fn add_to_read_graph(
1539 &self,
1540 contig: Vec<u8>,
1541 graph: &mut petgraph::Graph<String, f64>,
1542 forward: bool,
1543 ) {
1544 if forward {
1545 self.add_to_read_graph_forward(&contig, graph);
1546 } else {
1547 self.add_to_read_graph_backward(&contig, graph);
1548 }
1549 }
1550
1551 fn add_to_read_graph_backward(
1552 &self,
1553 backward_contig: &[u8],
1554 graph: &mut petgraph::Graph<String, f64>,
1555 ) {
1556 for i in (0..backward_contig.len() - self.kmer_size).rev() {
1558 let current_kmer = &backward_contig[i..i + self.kmer_size];
1559 let prev_kmer = &backward_contig[i + 1..i + 1 + self.kmer_size];
1560
1561 let prev_node = graph
1562 .node_indices()
1563 .find(|&n| *graph[n] == String::from_utf8_lossy(current_kmer).to_string())
1564 .unwrap_or_else(|| {
1565 graph.add_node(String::from_utf8_lossy(current_kmer).to_string())
1566 });
1567 let current_node = graph
1568 .node_indices()
1569 .find(|&n| *graph[n] == String::from_utf8_lossy(prev_kmer).to_string())
1570 .unwrap_or_else(|| graph.add_node(String::from_utf8_lossy(prev_kmer).to_string()));
1571
1572 if !graph.contains_edge(prev_node, current_node) {
1573 graph.add_edge(prev_node, current_node, 1.0);
1574 }
1575 }
1576 }
1577
1578 fn add_to_read_graph_forward(
1579 &self,
1580 forward_contig: &[u8],
1581 graph: &mut petgraph::Graph<String, f64>,
1582 ) {
1583 for i in 0..forward_contig.len() - self.kmer_size {
1585 let current_kmer = &forward_contig[i..i + self.kmer_size];
1586 let next_kmer = &forward_contig[i + 1..i + 1 + self.kmer_size];
1587
1588 let current_node = graph
1589 .node_indices()
1590 .find(|&n| *graph[n] == String::from_utf8_lossy(current_kmer).to_string())
1591 .unwrap_or_else(|| {
1592 graph.add_node(String::from_utf8_lossy(current_kmer).to_string())
1593 });
1594 let next_node = graph
1595 .node_indices()
1596 .find(|&n| *graph[n] == String::from_utf8_lossy(next_kmer).to_string())
1597 .unwrap_or_else(|| graph.add_node(String::from_utf8_lossy(next_kmer).to_string()));
1598
1599 if !graph.contains_edge(current_node, next_node) {
1600 graph.add_edge(current_node, next_node, 1.0);
1601 }
1602 }
1603 }
1604
1605 fn initialize_read_graph(&self, seq: &[u8]) -> (petgraph::Graph<String, f64>, Vec<NodeIndex>) {
1606 let mut graph = DiGraph::<String, f64>::new();
1607
1608 let mut node_indices = Vec::new();
1609 for kmer in seq.windows(self.kmer_size) {
1610 if self
1611 .kmers
1612 .contains_key(&crate::utils::canonicalize_kmer(kmer))
1613 {
1614 let node_index = graph.add_node(String::from_utf8_lossy(kmer).to_string());
1615 node_indices.push(node_index);
1616 }
1617 }
1618 (graph, node_indices)
1619 }
1620
1621 fn assemble_forward(&self, contig: &mut Vec<u8>, start_kmer: &[u8]) {
1628 let mut links_in_scope: Vec<Link> = Vec::new();
1629 let mut used_links = HashSet::new();
1630 let mut last_kmer = start_kmer.to_owned();
1631 let mut visited = HashSet::new();
1632
1633 loop {
1634 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1635
1636 match self.next_kmer(&last_kmer, &mut links_in_scope) {
1637 Some(this_kmer) => {
1638 if links_in_scope.len() == 0 {
1639 if visited.contains(&this_kmer) {
1640 break;
1641 }
1642
1643 visited.insert(this_kmer.clone());
1644 }
1645
1646 contig.push(this_kmer[this_kmer.len() - 1]);
1647 last_kmer = this_kmer;
1648 }
1649 None => {
1650 break;
1651 }
1652 }
1653 }
1654 }
1655
1656 fn assemble_forward_until(
1665 &self,
1666 contig: &mut Vec<u8>,
1667 start_kmer: &[u8],
1668 stop_kmers: &HashSet<Vec<u8>>,
1669 limit: usize,
1670 ) -> Result<Vec<u8>> {
1671 let mut links_in_scope: Vec<Link> = Vec::new();
1672 let mut used_links = HashSet::new();
1673 let mut last_kmer = start_kmer.to_owned();
1674 let mut nodes_traversed = 0;
1675
1676 loop {
1677 if stop_kmers.contains(&last_kmer) {
1678 return Ok(last_kmer);
1679 }
1680
1681 if nodes_traversed >= limit {
1682 return Err(anyhow::anyhow!(
1683 "Reached limit without finding a stopping k-mer"
1684 ));
1685 }
1686
1687 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1688
1689 match self.next_kmer(&last_kmer, &mut links_in_scope) {
1690 Some(this_kmer) => {
1691 contig.push(this_kmer[this_kmer.len() - 1]);
1692 last_kmer = this_kmer;
1693 nodes_traversed += 1;
1694 }
1695 None => {
1696 break;
1697 }
1698 }
1699 }
1700
1701 Err(anyhow::anyhow!("No stopping k-mer found"))
1702 }
1703
1704 fn assemble_forward_until_condition<F>(
1705 &self,
1706 contig: &mut Vec<u8>,
1707 start_kmer: &[u8],
1708 limit: usize,
1709 stopping_condition: F,
1710 ) -> Result<Vec<u8>>
1711 where
1712 F: Fn(&[u8], usize, &Self) -> bool,
1713 {
1714 let initial_contig_length = contig.len();
1715
1716 let mut links_in_scope: Vec<Link> = Vec::new();
1717 let mut used_links = HashSet::new();
1718 let mut last_kmer = start_kmer.to_owned();
1719
1720 loop {
1721 if stopping_condition(&last_kmer, contig.len() - initial_contig_length, self) {
1722 return Ok(last_kmer);
1723 }
1724
1725 if contig.len() - initial_contig_length >= limit {
1726 return Err(anyhow::anyhow!(
1727 "Reached limit without finding a stopping k-mer"
1728 ));
1729 }
1730
1731 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1732
1733 match self.next_kmer(&last_kmer, &mut links_in_scope) {
1734 Some(this_kmer) => {
1735 contig.push(this_kmer[this_kmer.len() - 1]);
1736 last_kmer = this_kmer;
1737 }
1738 None => {
1739 break;
1740 }
1741 }
1742 }
1743
1744 Err(anyhow::anyhow!("Stopping condition not satisfied"))
1745 }
1746
1747 fn assemble_forward_limit(
1748 &self,
1749 contig: &mut Vec<u8>,
1750 start_kmer: &[u8],
1751 limit: usize,
1752 ) -> Result<Vec<u8>> {
1753 let mut links_in_scope: Vec<Link> = Vec::new();
1754 let mut used_links = HashSet::new();
1755 let mut last_kmer = start_kmer.to_owned();
1756 loop {
1757 if contig.len() >= limit {
1758 return Ok(last_kmer);
1759 }
1760
1761 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, true);
1762
1763 match self.next_kmer(&last_kmer, &mut links_in_scope) {
1764 Some(this_kmer) => {
1765 contig.push(this_kmer[this_kmer.len() - 1]);
1766 last_kmer = this_kmer;
1767 }
1768 None => {
1769 break;
1770 }
1771 }
1772 }
1773
1774 Err(anyhow::anyhow!("No stopping k-mer found"))
1775 }
1776
1777 fn assemble_backward(&self, contig: &mut Vec<u8>, start_kmer: &[u8]) {
1784 let mut links_in_scope: Vec<Link> = Vec::new();
1785 let mut used_links = HashSet::new();
1786 let mut last_kmer = start_kmer.to_owned();
1787 let mut visited = HashSet::new();
1788
1789 loop {
1790 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1791
1792 match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1793 Some(this_kmer) => {
1794 if links_in_scope.len() == 0 {
1795 if visited.contains(&this_kmer) {
1796 break;
1797 }
1798
1799 visited.insert(this_kmer.clone());
1800 }
1801
1802 contig.insert(0, this_kmer[0]);
1803 last_kmer = this_kmer;
1804 }
1805 None => {
1806 break;
1807 }
1808 }
1809 }
1810 }
1811
1812 fn assemble_backward_until(
1821 &self,
1822 contig: &mut Vec<u8>,
1823 start_kmer: &[u8],
1824 stop_kmers: &HashSet<Vec<u8>>,
1825 limit: usize,
1826 ) -> Result<Vec<u8>> {
1827 let mut links_in_scope: Vec<Link> = Vec::new();
1828 let mut used_links = HashSet::new();
1829 let mut last_kmer = start_kmer.to_owned();
1830 let mut nodes_traversed = 0;
1831
1832 loop {
1833 if stop_kmers.contains(&last_kmer) {
1834 return Ok(last_kmer);
1835 }
1836
1837 if nodes_traversed >= limit {
1838 return Err(anyhow::anyhow!(
1839 "Limit reached before finding a stopping k-mer"
1840 ));
1841 }
1842
1843 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1844
1845 match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1846 Some(this_kmer) => {
1847 contig.insert(0, this_kmer[0]);
1848 last_kmer = this_kmer;
1849 nodes_traversed += 1;
1850 }
1851 None => {
1852 break;
1853 }
1854 }
1855 }
1856
1857 Err(anyhow::anyhow!("No stopping k-mer found"))
1858 }
1859
1860 fn assemble_backward_until_condition<F>(
1861 &self,
1862 contig: &mut Vec<u8>,
1863 start_kmer: &[u8],
1864 limit: usize,
1865 stopping_condition: F,
1866 ) -> Result<Vec<u8>>
1867 where
1868 F: Fn(&[u8], usize, &Self) -> bool,
1869 {
1870 let initial_contig_length = contig.len();
1871
1872 let mut links_in_scope: Vec<Link> = Vec::new();
1873 let mut used_links = HashSet::new();
1874 let mut last_kmer = start_kmer.to_owned();
1875
1876 loop {
1877 if stopping_condition(&last_kmer, contig.len() - initial_contig_length, self) {
1878 return Ok(last_kmer);
1879 }
1880
1881 if contig.len() - initial_contig_length >= limit {
1882 return Err(anyhow::anyhow!(
1883 "Reached limit without finding a stopping k-mer"
1884 ));
1885 }
1886
1887 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1888
1889 match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1890 Some(this_kmer) => {
1891 contig.insert(0, this_kmer[0]);
1892 last_kmer = this_kmer;
1893 }
1894 None => {
1895 break;
1896 }
1897 }
1898 }
1899
1900 Err(anyhow::anyhow!("Stopping condition not satisfied"))
1901 }
1902
1903 fn assemble_backward_limit(
1904 &self,
1905 contig: &mut Vec<u8>,
1906 start_kmer: &[u8],
1907 limit: usize,
1908 ) -> Result<Vec<u8>> {
1909 let mut links_in_scope: Vec<Link> = Vec::new();
1910 let mut used_links = HashSet::new();
1911 let mut last_kmer = start_kmer.to_owned();
1912 loop {
1913 if contig.len() >= limit {
1914 return Ok(last_kmer);
1915 }
1916
1917 self.update_links(&mut links_in_scope, &last_kmer, &mut used_links, false);
1918
1919 match self.prev_kmer(&last_kmer, &mut links_in_scope) {
1920 Some(this_kmer) => {
1921 contig.insert(0, this_kmer[0]);
1922 last_kmer = this_kmer;
1923 }
1924 None => {
1925 break;
1926 }
1927 }
1928 }
1929
1930 Err(anyhow::anyhow!("No stopping k-mer found"))
1931 }
1932
1933 #[must_use]
1947 pub fn assemble(&self, kmer: &[u8]) -> Vec<u8> {
1948 let mut contig: Vec<u8> = kmer.to_vec();
1949
1950 assert!(
1951 kmer.len() == self.kmer_size,
1952 "kmer length {} does not match expected length {}",
1953 kmer.len(),
1954 self.kmer_size
1955 );
1956
1957 self.assemble_forward(&mut contig, kmer);
1958 self.assemble_backward(&mut contig, kmer);
1959
1960 contig
1961 }
1962
1963 #[must_use]
1974 pub fn assemble_all(&self) -> Vec<Vec<u8>> {
1975 let mut contigs = Vec::new();
1976
1977 let mut used_kmers = HashSet::new();
1978 let k = self.kmer_size;
1979
1980 let progress_bar = if self.verbose {
1981 crate::utils::default_bounded_progress_bar(
1982 "Assembling contigs",
1983 self.kmers.len() as u64,
1984 )
1985 } else {
1986 crate::utils::default_hidden_progress_bar()
1987 };
1988
1989 for cn_kmer in self.kmers.keys().progress_with(progress_bar) {
1990 if !used_kmers.contains(cn_kmer) {
1991 let r = self.kmers.get(cn_kmer).unwrap();
1992
1993 if r.in_degree() == 1 && r.out_degree() == 1 {
1994 let contig = self.assemble(cn_kmer);
1995 for kmer_in_contig in contig.windows(k) {
1996 used_kmers.insert(crate::utils::canonicalize_kmer(kmer_in_contig));
1997 }
1998 contigs.push(contig);
1999 } else if r.in_degree() == 0 && r.out_degree() == 0 {
2000 contigs.push(cn_kmer.clone());
2001 }
2002
2003 used_kmers.insert(cn_kmer.clone());
2004 }
2005 }
2006
2007 contigs
2008 }
2009
2010 #[must_use]
2024 pub fn assemble_at_bubbles(&self) -> Vec<Vec<u8>> {
2025 let g = self.traverse_all_kmers();
2026 let bubbles = find_all_superbubbles(&g);
2027
2028 let mut all_contigs = Vec::new();
2029 let mut visited = HashSet::new();
2030
2031 for ((in_node, out_node), interior) in bubbles {
2032 let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2033 &g,
2034 in_node,
2035 out_node,
2036 0,
2037 Some(interior.len()),
2038 );
2039 let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2040 &g,
2041 out_node,
2042 in_node,
2043 0,
2044 Some(interior.len()),
2045 );
2046
2047 let paths: Vec<Vec<NodeIndex>> = paths_fwd.chain(paths_rev).collect();
2048
2049 let mut unique_nodes = Vec::new();
2050
2051 for (i, path) in paths.iter().enumerate() {
2052 let other_paths: HashSet<_> = paths
2053 .iter()
2054 .enumerate()
2055 .filter(|&(j, _)| j != i)
2056 .flat_map(|(_, p)| p)
2057 .collect();
2058
2059 if let Some(unique_node) = path.iter().find(|&node| !other_paths.contains(node)) {
2060 let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
2061 let cn_kmer = crate::utils::canonicalize_kmer(&kmer);
2062
2063 if !visited.contains(&cn_kmer) {
2064 unique_nodes.push(*unique_node);
2065 }
2066 }
2067 }
2068
2069 let contigs = unique_nodes
2070 .iter()
2071 .map(|unique_node| {
2072 let kmer = g.node_weight(*unique_node).unwrap().as_bytes().to_vec();
2073 self.assemble(&kmer)
2074 })
2075 .collect::<Vec<_>>();
2076
2077 all_contigs.extend(contigs.clone());
2078
2079 for contig in contigs {
2080 for kmer in contig.windows(self.kmer_size) {
2081 visited.insert(crate::utils::canonicalize_kmer(kmer));
2082 }
2083 }
2084 }
2085
2086 all_contigs
2087 }
2088
2089 fn update_links(
2195 &self,
2196 links_in_scope: &mut Vec<Link>,
2197 last_kmer: &Vec<u8>,
2198 used_links: &mut HashSet<Link>,
2199 forward: bool,
2200 ) {
2201 let cn_kmer_vec = crate::utils::canonicalize_kmer(last_kmer.as_bytes()).clone();
2202 let cn_kmer = cn_kmer_vec.as_bytes();
2203
2204 if self.links.contains_key(cn_kmer.as_bytes()) {
2205 let available_links = self.links.get(cn_kmer.as_bytes()).unwrap().clone();
2206
2207 let record_orientation_matches_kmer = last_kmer == cn_kmer;
2208
2209 for jv in available_links {
2210 let link_goes_forward = record_orientation_matches_kmer == jv.0.is_forward();
2211 let new_link = if link_goes_forward {
2212 jv.0.clone()
2213 } else {
2214 jv.0.complement()
2215 };
2216
2217 if forward == link_goes_forward && !used_links.contains(&new_link) {
2218 used_links.insert(new_link.clone());
2219 links_in_scope.push(new_link);
2220 }
2221 }
2222 }
2223 }
2224
2225 fn first_kmer(&self, contig: &[u8]) -> Option<Vec<u8>> {
2235 if contig.len() < self.kmer_size as usize {
2236 return None;
2237 }
2238
2239 Some(contig[0..self.kmer_size].to_vec())
2240 }
2241
2242 fn last_kmer(&self, contig: &[u8]) -> Option<Vec<u8>> {
2243 if contig.len() < self.kmer_size as usize {
2244 return None;
2245 }
2246
2247 Some(contig[contig.len() - self.kmer_size as usize..].to_vec())
2248 }
2249
2250 #[must_use]
2251 pub fn clean(self, threshold: f32) -> Self {
2252 let kmer_size = self.kmer_size;
2253
2254 self.clean_hairballs()
2255 .clean_tangles(0, 500, threshold)
2256 .clean_tangles(1, 500, threshold)
2257 .clean_tips(3 * kmer_size, threshold)
2258 .clean_branches(threshold)
2259 .clean_superbubbles(1, threshold)
2260 .clean_branches(threshold)
2261 .clean_contigs(100)
2262 }
2263
2264 #[must_use]
2281 pub fn clean_color_specific_paths(mut self, color: usize, min_score: f32) -> Self {
2282 let bad_cn_kmers = self
2283 .kmers
2284 .keys()
2285 .cloned()
2286 .filter(|cn_kmer| self.sources.get(cn_kmer).unwrap_or(&vec![]) == &vec![color])
2287 .filter(|cn_kmer| self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score)
2288 .filter(|cn_kmer| {
2289 self.kmers.get(cn_kmer).unwrap().in_degree() == 1
2290 && self.kmers.get(cn_kmer).unwrap().out_degree() == 1
2291 })
2292 .collect::<Vec<Vec<u8>>>();
2293
2294 let mut to_remove = HashSet::new();
2295 let mut bad_paths: usize = 0;
2296
2297 for bad_cn_kmer in bad_cn_kmers {
2298 if !to_remove.contains(&bad_cn_kmer) {
2299 let mut seen_kmers = HashSet::new();
2300 let mut score_sum = 0.0;
2301
2302 let mut fw_contig = bad_cn_kmer.clone();
2303 self.assemble_forward(&mut fw_contig, &bad_cn_kmer);
2304
2305 for kmer in fw_contig.windows(self.kmer_size) {
2306 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2307
2308 if let Some(r) = self.kmers.get(&cn_kmer) {
2309 if r.in_degree() == 1 && r.out_degree() == 1 {
2310 seen_kmers.insert(cn_kmer.clone());
2311 score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2312 } else {
2313 break;
2314 }
2315 }
2316 }
2317
2318 let mut rc_contig = bad_cn_kmer.clone();
2319 self.assemble_backward(&mut rc_contig, &bad_cn_kmer);
2320
2321 for kmer in rc_contig.windows(self.kmer_size).rev().skip(1) {
2322 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2323
2324 if let Some(r) = self.kmers.get(&cn_kmer) {
2325 if r.in_degree() == 1 && r.out_degree() == 1 {
2326 seen_kmers.insert(cn_kmer.clone());
2327 score_sum += self.scores.get(&cn_kmer).unwrap_or(&1.0);
2328 } else {
2329 break;
2330 }
2331 }
2332 }
2333
2334 let weight = score_sum / seen_kmers.len() as f32;
2335
2336 if weight < min_score {
2337 to_remove.extend(seen_kmers);
2338 bad_paths += 1;
2339 }
2340 }
2341 }
2342
2343 for cn_kmer in &to_remove {
2344 self.kmers.remove(cn_kmer);
2345 self.scores.remove(cn_kmer);
2346 }
2347
2348 self.infer_edges();
2351
2352 self
2353 }
2354
2355 #[must_use]
2369 pub fn clean_branches(mut self, min_score: f32) -> Self {
2370 let bad_cn_kmers = self
2371 .kmers
2372 .keys()
2373 .cloned()
2374 .filter(|cn_kmer| self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score)
2375 .filter(|cn_kmer| {
2376 self.kmers.get(cn_kmer).unwrap().in_degree() == 1
2377 && self.kmers.get(cn_kmer).unwrap().out_degree() == 1
2378 })
2379 .collect::<Vec<Vec<u8>>>();
2380
2381 let mut to_remove = HashSet::new();
2382 let mut bad_paths: usize = 0;
2383
2384 for bad_cn_kmer in bad_cn_kmers {
2385 if !to_remove.contains(&bad_cn_kmer) {
2386 let mut seen_kmers = HashSet::new();
2387 let mut score_prod = 1.0;
2389
2390 let mut fw_contig = bad_cn_kmer.clone();
2391 self.assemble_forward(&mut fw_contig, &bad_cn_kmer);
2392
2393 for kmer in fw_contig.windows(self.kmer_size) {
2394 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2395
2396 if let Some(r) = self.kmers.get(&cn_kmer) {
2397 if r.in_degree() == 1 && r.out_degree() == 1 {
2398 seen_kmers.insert(cn_kmer.clone());
2399 score_prod *= self.scores.get(&cn_kmer).unwrap_or(&1.0);
2401 } else {
2402 break;
2403 }
2404 }
2405 }
2406
2407 let mut rc_contig = bad_cn_kmer.clone();
2408 self.assemble_backward(&mut rc_contig, &bad_cn_kmer);
2409
2410 for kmer in rc_contig.windows(self.kmer_size).rev().skip(1) {
2411 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2412
2413 if let Some(r) = self.kmers.get(&cn_kmer) {
2414 if r.in_degree() == 1 && r.out_degree() == 1 {
2415 seen_kmers.insert(cn_kmer.clone());
2416 score_prod *= self.scores.get(&cn_kmer).unwrap_or(&1.0);
2418 } else {
2419 break;
2420 }
2421 }
2422 }
2423
2424 let weight = score_prod.powf(1.0 / seen_kmers.len() as f32);
2426
2427 if weight < min_score {
2428 to_remove.extend(seen_kmers);
2429 bad_paths += 1;
2430 }
2431 }
2432 }
2433
2434 for cn_kmer in &to_remove {
2435 self.kmers.remove(cn_kmer);
2436 self.scores.remove(cn_kmer);
2437 }
2438
2439 self.infer_edges();
2442
2443 self
2444 }
2445
2446 #[must_use]
2467 pub fn clean_tangles(mut self, color: usize, limit: usize, min_score: f32) -> Self {
2468 let mut to_remove = HashSet::new();
2469 let mut bad_tangles: usize = 0;
2470
2471 let mut visited = HashSet::new();
2472
2473 let stopping_condition = |kmer: &[u8], _: usize, g: &LdBG| {
2474 g.scores
2475 .get(&crate::utils::canonicalize_kmer(kmer))
2476 .unwrap_or(&1.0)
2477 < &min_score
2478 };
2479
2480 for cn_kmer in self.kmers.keys() {
2481 if !visited.contains(cn_kmer)
2482 && self.sources.get(cn_kmer).unwrap_or(&vec![]) == &vec![color]
2483 && self.scores.get(cn_kmer).unwrap_or(&1.0) < &min_score
2484 && self.kmers.get(cn_kmer).unwrap().in_degree()
2485 + self.kmers.get(cn_kmer).unwrap().out_degree()
2486 >= 4
2487 {
2488 let g =
2489 self.traverse_kmers_until_condition(&cn_kmer, color, limit, stopping_condition);
2490
2491 visited.insert(cn_kmer.clone());
2492 for node in g.node_indices() {
2493 let current_kmer = g.node_weight(node).unwrap().as_bytes();
2494 let current_cn_kmer = crate::utils::canonicalize_kmer(current_kmer);
2495
2496 to_remove.insert(current_cn_kmer.clone());
2497
2498 if !(self.scores.get(¤t_cn_kmer).unwrap_or(&1.0) < &min_score
2500 && self.kmers.get(¤t_cn_kmer).unwrap().in_degree()
2501 + self.kmers.get(¤t_cn_kmer).unwrap().out_degree()
2502 >= 4)
2503 {
2504 visited.insert(current_cn_kmer.clone());
2505 }
2506 }
2507
2508 bad_tangles += 1;
2509 }
2510 }
2511
2512 for cn_kmer in &to_remove {
2513 self.kmers.remove(cn_kmer);
2514 self.scores.remove(cn_kmer);
2515 }
2516
2517 self.infer_edges();
2520
2521 self
2522 }
2523
2524 #[must_use]
2525 pub fn clean_hairballs(mut self) -> Self {
2526 for cn_kmer in self.noise.iter() {
2527 self.kmers.remove(cn_kmer);
2528 self.scores.remove(cn_kmer);
2529 }
2530
2531 self.infer_edges();
2534
2535 self
2536 }
2537
2538 #[must_use]
2553 pub fn clean_tips(mut self, limit: usize, min_score: f32) -> Self {
2554 let mut to_remove = HashSet::new();
2555 let mut bad_paths: usize = 0;
2556
2557 let stopping_condition = |kmer: &[u8], _: usize, g: &LdBG| {
2558 g.prev_kmers(kmer).len() > 1 || g.next_kmers(kmer).len() > 1
2559 };
2560
2561 for cn_kmer in self.kmers.keys() {
2562 if self.kmers.get(cn_kmer).unwrap().in_degree() == 0
2563 && self.kmers.get(cn_kmer).unwrap().out_degree() > 1
2564 {
2565 let mut fw_contig = cn_kmer.clone();
2566 if let Ok(_) = self.assemble_forward_until_condition(
2567 &mut fw_contig,
2568 &cn_kmer,
2569 limit,
2570 stopping_condition,
2571 ) {
2572 let score_sum = fw_contig
2573 .kmers(self.kmer_size as u8)
2574 .map(|kmer| {
2575 self.scores
2576 .get(&crate::utils::canonicalize_kmer(&kmer))
2577 .unwrap_or(&1.0)
2578 })
2579 .sum::<f32>();
2580 let weight = score_sum / (fw_contig.len() - self.kmer_size + 1) as f32;
2581
2582 if weight < min_score {
2583 for kmer in fw_contig.kmers(self.kmer_size as u8) {
2584 to_remove.insert(crate::utils::canonicalize_kmer(&kmer));
2585 }
2586
2587 bad_paths += 1;
2588 }
2589 }
2590 }
2591
2592 if self.kmers.get(cn_kmer).unwrap().out_degree() == 0
2593 && self.kmers.get(cn_kmer).unwrap().in_degree() > 1
2594 {
2595 let mut rv_contig = cn_kmer.clone();
2596 if let Ok(_) = self.assemble_backward_until_condition(
2597 &mut rv_contig,
2598 &cn_kmer,
2599 limit,
2600 stopping_condition,
2601 ) {
2602 let score_sum = rv_contig
2603 .kmers(self.kmer_size as u8)
2604 .map(|kmer| {
2605 self.scores
2606 .get(&crate::utils::canonicalize_kmer(&kmer))
2607 .unwrap_or(&1.0)
2608 })
2609 .sum::<f32>();
2610 let weight = score_sum / (rv_contig.len() - self.kmer_size + 1) as f32;
2611
2612 if weight < min_score {
2613 for kmer in rv_contig.kmers(self.kmer_size as u8) {
2614 to_remove.insert(crate::utils::canonicalize_kmer(&kmer));
2615 }
2616
2617 bad_paths += 1;
2618 }
2619 }
2620 }
2621 }
2622
2623 for cn_kmer in &to_remove {
2624 self.kmers.remove(cn_kmer);
2625 self.scores.remove(cn_kmer);
2626 }
2627
2628 self.infer_edges();
2631
2632 self
2633 }
2634
2635 #[must_use]
2652 pub fn clean_superbubbles(mut self, color: usize, min_score: f32) -> Self {
2653 let mut to_remove = HashSet::new();
2654 let mut num_bad_paths: usize = 0;
2655
2656 let g = self.traverse_all_kmers();
2657 let bubbles = find_all_superbubbles(&g);
2658
2659 for ((in_node, out_node), interior) in bubbles.iter() {
2660 let paths_fwd = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2661 &g,
2662 *in_node,
2663 *out_node,
2664 0,
2665 Some(interior.len()),
2666 );
2667 let paths_rev = petgraph::algo::all_simple_paths::<Vec<_>, _>(
2668 &g,
2669 *out_node,
2670 *in_node,
2671 0,
2672 Some(interior.len()),
2673 );
2674
2675 let mut paths = paths_fwd
2676 .chain(paths_rev)
2677 .map(|path| {
2678 let score = path
2679 .iter()
2680 .map(|node| {
2681 let kmer = g.node_weight(*node).unwrap().as_bytes();
2682 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2683 *self.scores.get(&cn_kmer).unwrap_or(&1.0)
2684 })
2685 .fold(1.0, |acc, x| acc * x)
2686 .powf(1.0 / path.len() as f32);
2687
2688 let mut bad_color_count = 0;
2689
2690 for node in &path {
2691 let kmer = g.node_weight(*node).unwrap().as_bytes();
2692 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2693
2694 let seen_colors = self
2695 .sources
2696 .get(&cn_kmer)
2697 .unwrap_or(&vec![])
2698 .iter()
2699 .cloned()
2700 .collect::<HashSet<_>>();
2701
2702 if seen_colors.len() == 1 && seen_colors.contains(&color) {
2703 bad_color_count += 1;
2704 }
2705 }
2706
2707 let is_bad_color = bad_color_count as f32 > 0.5 * (path.len() as f32);
2708
2709 (
2710 path,
2711 if score.is_nan() || is_bad_color {
2712 0.0
2713 } else {
2714 score
2715 },
2716 )
2717 })
2718 .collect::<Vec<(Vec<NodeIndex>, f32)>>();
2719
2720 paths.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
2721
2722 let good_paths = paths
2723 .iter()
2724 .take(1)
2725 .chain(
2726 paths
2727 .iter()
2728 .skip(1)
2729 .filter(|path| path.1 > min_score)
2730 .take(1),
2731 )
2732 .collect::<Vec<_>>();
2733 let bad_paths = paths
2734 .iter()
2735 .filter(|path| !good_paths.contains(path))
2736 .collect::<Vec<_>>();
2737
2738 let to_keep = good_paths
2739 .iter()
2740 .map(|(path, _)| path)
2741 .flatten()
2742 .map(|node| {
2743 crate::utils::canonicalize_kmer(g.node_weight(*node).unwrap().as_bytes())
2744 })
2745 .collect::<HashSet<Vec<u8>>>();
2746
2747 bad_paths.iter().for_each(|(path, _)| {
2748 for node in path {
2749 let kmer = g.node_weight(*node).unwrap().as_bytes();
2750 let cn_kmer = crate::utils::canonicalize_kmer(kmer);
2751
2752 if !to_keep.contains(&cn_kmer) {
2753 to_remove.insert(cn_kmer.clone());
2754 }
2755 }
2756 });
2757
2758 num_bad_paths += bad_paths.len();
2759 }
2760
2761 for cn_kmer in &to_remove {
2762 self.kmers.remove(cn_kmer);
2763 self.scores.remove(cn_kmer);
2764 }
2765
2766 self.infer_edges();
2769
2770 self
2771 }
2772
2773 #[must_use]
2788 pub fn clean_contigs(mut self, min_contig_length: usize) -> Self {
2789 let mut to_remove = HashSet::new();
2790 let mut bad_contigs: usize = 0;
2791
2792 self.assemble_all().iter().for_each(|contig| {
2793 if contig.len() < min_contig_length {
2794 let mut process = true;
2795
2796 let first_kmer = contig.windows(self.kmer_size).next().unwrap();
2798 let prev_kmers = self.prev_kmers(first_kmer);
2799 if prev_kmers.len() > 0 {
2800 process = false;
2801 }
2802
2803 let last_kmer = contig.windows(self.kmer_size).last().unwrap();
2805 let next_kmers = self.next_kmers(last_kmer);
2806 if next_kmers.len() > 0 {
2807 process = false;
2808 }
2809
2810 if process {
2811 for kmer in contig.windows(self.kmer_size) {
2812 to_remove.insert(crate::utils::canonicalize_kmer(kmer));
2813 }
2814
2815 bad_contigs += 1;
2816 }
2817 }
2818 });
2819
2820 for cn_kmer in &to_remove {
2821 self.kmers.remove(cn_kmer);
2822 self.scores.remove(cn_kmer);
2823 }
2824
2825 self.infer_edges();
2828
2829 self
2830 }
2831
2832 fn traverse_forward(
2833 &self,
2834 graph: &mut petgraph::Graph<String, f32>,
2835 visited: &mut HashMap<String, NodeIndex>,
2836 start_node: NodeIndex,
2837 ) {
2838 let mut fwd_stack = vec![start_node];
2839 while let Some(node) = fwd_stack.pop() {
2840 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2841
2842 for next_kmer in self.next_kmers(this_kmer) {
2843 let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2844 let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2845 existing_node
2846 } else {
2847 let new_node = graph.add_node(next_label.clone());
2848 visited.insert(next_label.clone(), new_node);
2849 fwd_stack.push(new_node);
2850 new_node
2851 };
2852
2853 if !graph.contains_edge(node, next_node) {
2854 graph.add_edge(
2855 node,
2856 next_node,
2857 *self
2858 .scores
2859 .get(&crate::utils::canonicalize_kmer(&next_kmer))
2860 .unwrap_or(&1.0),
2861 );
2862 }
2863 }
2864 }
2865 }
2866
2867 fn traverse_forward_until(
2868 &self,
2869 graph: &mut petgraph::Graph<String, f32>,
2870 visited: &mut HashMap<String, NodeIndex>,
2871 start_node: NodeIndex,
2872 stop_node: &[u8],
2873 ) -> Result<()> {
2874 let mut found = false;
2875
2876 let mut fwd_stack = vec![start_node];
2877 while let Some(node) = fwd_stack.pop() {
2878 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2879
2880 for next_kmer in self.next_kmers(this_kmer) {
2881 let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2882 let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2883 existing_node
2884 } else {
2885 let new_node = graph.add_node(next_label.clone());
2886 visited.insert(next_label.clone(), new_node);
2887
2888 if graph.node_weight(new_node).unwrap().as_bytes() != stop_node {
2889 fwd_stack.push(new_node);
2890 } else {
2891 found = true;
2892 }
2893
2894 new_node
2895 };
2896
2897 if !graph.contains_edge(node, next_node) {
2898 graph.add_edge(
2899 node,
2900 next_node,
2901 *self
2902 .scores
2903 .get(&crate::utils::canonicalize_kmer(&next_kmer))
2904 .unwrap_or(&1.0),
2905 );
2906 }
2907 }
2908 }
2909
2910 if found {
2911 Ok(())
2912 } else {
2913 Err(anyhow::anyhow!("Stop node not found"))
2914 }
2915 }
2916
2917 fn traverse_forward_until_condition<F>(
2918 &self,
2919 graph: &mut petgraph::Graph<String, f32>,
2920 visited: &mut HashMap<String, NodeIndex>,
2921 color: usize,
2922 start_node: NodeIndex,
2923 limit: usize,
2924 stopping_condition: F,
2925 ) -> Result<()>
2926 where
2927 F: Fn(&[u8], usize, &Self) -> bool,
2928 {
2929 let mut found = false;
2930
2931 let mut fwd_stack = vec![(start_node, 0)];
2933 while let Some((node, depth)) = fwd_stack.pop() {
2934 if depth >= limit {
2935 continue; }
2937
2938 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2939
2940 for next_kmer in self.next_kmers(this_kmer) {
2941 let same_color = self
2942 .sources
2943 .get(&crate::utils::canonicalize_kmer(&next_kmer))
2944 .unwrap_or(&vec![])
2945 == &vec![color];
2946 if !same_color {
2947 continue;
2948 }
2949
2950 let next_label = String::from_utf8_lossy(&next_kmer).to_string();
2951 let next_node = if let Some(&existing_node) = visited.get(&next_label) {
2952 existing_node
2953 } else {
2954 let new_node = graph.add_node(next_label.clone());
2955 visited.insert(next_label.clone(), new_node);
2956
2957 if !stopping_condition(&next_kmer, depth + 1, self) {
2958 fwd_stack.push((new_node, depth + 1));
2960 } else {
2961 found = true;
2962 }
2963
2964 new_node
2965 };
2966
2967 if !graph.contains_edge(node, next_node) {
2968 graph.add_edge(
2969 node,
2970 next_node,
2971 *self
2972 .scores
2973 .get(&crate::utils::canonicalize_kmer(&next_kmer))
2974 .unwrap_or(&1.0),
2975 );
2976 }
2977 }
2978 }
2979
2980 if found {
2981 Ok(())
2982 } else {
2983 Err(anyhow::anyhow!("Stop condition not met within depth limit"))
2984 }
2985 }
2986
2987 fn traverse_backward(
2988 &self,
2989 graph: &mut petgraph::Graph<String, f32>,
2990 visited: &mut HashMap<String, NodeIndex>,
2991 start_node: NodeIndex,
2992 ) {
2993 let mut rev_stack = vec![start_node];
2994 while let Some(node) = rev_stack.pop() {
2995 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
2996
2997 for prev_kmer in self.prev_kmers(this_kmer) {
2998 let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
2999 let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3000 existing_node
3001 } else {
3002 let new_node = graph.add_node(prev_label.clone());
3003 visited.insert(prev_label.clone(), new_node);
3004 rev_stack.push(new_node);
3005 new_node
3006 };
3007
3008 if !graph.contains_edge(prev_node, node) {
3009 graph.add_edge(
3010 prev_node,
3011 node,
3012 *self
3013 .scores
3014 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3015 .unwrap_or(&1.0),
3016 );
3017 }
3018 }
3019 }
3020 }
3021
3022 fn traverse_backward_until(
3023 &self,
3024 graph: &mut petgraph::Graph<String, f32>,
3025 visited: &mut HashMap<String, NodeIndex>,
3026 start_node: NodeIndex,
3027 stop_node: &[u8],
3028 ) -> Result<()> {
3029 let mut found = false;
3030
3031 let mut rev_stack = vec![start_node];
3032 while let Some(node) = rev_stack.pop() {
3033 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
3034
3035 for prev_kmer in self.prev_kmers(this_kmer) {
3036 let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
3037 let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3038 existing_node
3039 } else {
3040 let new_node = graph.add_node(prev_label.clone());
3041 visited.insert(prev_label.clone(), new_node);
3042
3043 if graph.node_weight(new_node).unwrap().as_bytes() != stop_node {
3044 rev_stack.push(new_node);
3045 } else {
3046 found = true;
3047 }
3048
3049 new_node
3050 };
3051
3052 if !graph.contains_edge(prev_node, node) {
3053 graph.add_edge(
3054 prev_node,
3055 node,
3056 *self
3057 .scores
3058 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3059 .unwrap_or(&1.0),
3060 );
3061 }
3062 }
3063 }
3064
3065 if found {
3066 Ok(())
3067 } else {
3068 Err(anyhow::anyhow!("Stop node not found"))
3069 }
3070 }
3071
3072 fn traverse_backward_until_condition<F>(
3073 &self,
3074 graph: &mut petgraph::Graph<String, f32>,
3075 visited: &mut HashMap<String, NodeIndex>,
3076 color: usize,
3077 start_node: NodeIndex,
3078 limit: usize,
3079 stopping_condition: F,
3080 ) -> Result<()>
3081 where
3082 F: Fn(&[u8], usize, &Self) -> bool,
3083 {
3084 let mut found = false;
3085
3086 let mut rev_stack = vec![(start_node, 0)];
3088 while let Some((node, depth)) = rev_stack.pop() {
3089 if depth >= limit {
3090 continue; }
3092
3093 let this_kmer = graph.node_weight(node).unwrap().as_bytes();
3094
3095 for prev_kmer in self.prev_kmers(this_kmer) {
3096 let same_color = self
3097 .sources
3098 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3099 .unwrap_or(&vec![])
3100 == &vec![color];
3101 if !same_color {
3102 continue;
3103 }
3104
3105 let prev_label = String::from_utf8_lossy(&prev_kmer).to_string();
3106 let prev_node = if let Some(&existing_node) = visited.get(&prev_label) {
3107 existing_node
3108 } else {
3109 let new_node = graph.add_node(prev_label.clone());
3110 visited.insert(prev_label.clone(), new_node);
3111
3112 if !stopping_condition(&prev_kmer, depth + 1, self) {
3113 rev_stack.push((new_node, depth + 1));
3115 } else {
3116 found = true;
3117 }
3118
3119 new_node
3120 };
3121
3122 if !graph.contains_edge(prev_node, node) {
3123 graph.add_edge(
3124 prev_node,
3125 node,
3126 *self
3127 .scores
3128 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3129 .unwrap_or(&1.0),
3130 );
3131 }
3132 }
3133 }
3134
3135 if found {
3136 Ok(())
3137 } else {
3138 Err(anyhow::anyhow!("Stop condition not met within depth limit"))
3139 }
3140 }
3141
3142 pub fn traverse_kmers_until_condition<F>(
3143 &self,
3144 start_kmer: &[u8],
3145 color: usize,
3146 limit: usize,
3147 stopping_condition: F,
3148 ) -> DiGraph<String, f32>
3149 where
3150 F: Fn(&[u8], usize, &Self) -> bool,
3151 {
3152 let mut graph = DiGraph::new();
3153 let mut visited = HashMap::<String, NodeIndex>::new(); let start_label = String::from_utf8_lossy(&start_kmer).to_string();
3156 let start_node = graph.add_node(start_label.clone());
3157 visited.insert(start_label.clone(), start_node);
3158
3159 let _ = self.traverse_forward_until_condition(
3160 &mut graph,
3161 &mut visited,
3162 color,
3163 start_node,
3164 limit,
3165 &stopping_condition,
3166 );
3167 let _ = self.traverse_backward_until_condition(
3168 &mut graph,
3169 &mut visited,
3170 color,
3171 start_node,
3172 limit,
3173 &stopping_condition,
3174 );
3175
3176 graph
3177 }
3178
3179 #[must_use]
3181 pub fn traverse_kmers(&self, start_kmer: &[u8]) -> DiGraph<String, f32> {
3182 let mut graph = DiGraph::new();
3183 let mut visited = HashMap::<String, NodeIndex>::new(); let start_label = String::from_utf8_lossy(&start_kmer).to_string();
3186 let start_node = graph.add_node(start_label.clone());
3187 visited.insert(start_label.clone(), start_node);
3188
3189 self.traverse_forward(&mut graph, &mut visited, start_node);
3190 self.traverse_backward(&mut graph, &mut visited, start_node);
3191
3192 graph
3193 }
3194
3195 #[must_use]
3219 pub fn traverse_contigs(&self, start_kmer: &[u8]) -> DiGraph<String, f32> {
3220 let mut graph = DiGraph::new();
3221 let mut visited = HashMap::new();
3222
3223 let start_contig = self.assemble(&start_kmer);
3224 let start_node = graph.add_node(String::from_utf8_lossy(&start_contig).to_string());
3225
3226 for kmer in start_contig.windows(self.kmer_size) {
3228 visited.insert(crate::utils::canonicalize_kmer(kmer), start_node);
3229 }
3230
3231 let mut fwd_stack = vec![start_node];
3233 while let Some(node) = fwd_stack.pop() {
3234 let this_contig = graph.node_weight(node).unwrap().as_bytes();
3235
3236 if let Some(last_kmer) = self.last_kmer(this_contig) {
3237 for next_kmer in self.next_kmers(&last_kmer) {
3238 if !visited.contains_key(&crate::utils::canonicalize_kmer(&next_kmer)) {
3239 let mut next_contig = next_kmer.clone();
3240 self.assemble_forward(&mut next_contig, &next_kmer);
3241
3242 let weight = next_contig
3243 .windows(self.kmer_size)
3244 .map(|x| {
3245 self.scores
3246 .get(&crate::utils::canonicalize_kmer(x))
3247 .unwrap_or(&1.0)
3248 })
3249 .fold(f32::INFINITY, |acc, x| acc.min(*x));
3250
3251 let next_node =
3252 graph.add_node(String::from_utf8_lossy(&next_contig).to_string());
3253 graph.add_edge(node, next_node, weight);
3254
3255 for kmer in next_contig.windows(self.kmer_size) {
3257 visited.insert(crate::utils::canonicalize_kmer(kmer), next_node);
3258 }
3259
3260 fwd_stack.push(next_node);
3261 } else {
3262 let next_node = *visited
3263 .get(&crate::utils::canonicalize_kmer(&next_kmer))
3264 .unwrap();
3265 if !graph.contains_edge(node, next_node) {
3266 let weight = *self
3267 .scores
3268 .get(&crate::utils::canonicalize_kmer(&next_kmer))
3269 .unwrap_or(&1.0);
3270 graph.add_edge(node, next_node, weight);
3271 }
3272 }
3273 }
3274 }
3275 }
3276
3277 let mut rev_stack = vec![start_node];
3279 while let Some(node) = rev_stack.pop() {
3280 let this_contig = graph.node_weight(node).unwrap().as_bytes();
3281
3282 if let Some(first_kmer) = self.first_kmer(this_contig) {
3283 for prev_kmer in self.prev_kmers(&first_kmer) {
3284 if !visited.contains_key(&crate::utils::canonicalize_kmer(&prev_kmer)) {
3285 let mut prev_contig = prev_kmer.clone();
3286 self.assemble_backward(&mut prev_contig, &prev_kmer);
3287
3288 let weight = prev_contig
3289 .windows(self.kmer_size)
3290 .map(|x| {
3291 self.scores
3292 .get(&crate::utils::canonicalize_kmer(x))
3293 .unwrap_or(&1.0)
3294 })
3295 .fold(f32::INFINITY, |acc, x| acc.min(*x));
3296
3297 let prev_node =
3298 graph.add_node(String::from_utf8_lossy(&prev_contig).to_string());
3299 graph.add_edge(prev_node, node, weight); for kmer in prev_contig.windows(self.kmer_size) {
3303 visited.insert(crate::utils::canonicalize_kmer(kmer), prev_node);
3304 }
3305
3306 rev_stack.push(prev_node);
3307 } else {
3308 let prev_node = *visited
3309 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3310 .unwrap();
3311 if !graph.contains_edge(prev_node, node) {
3312 let weight = *self
3313 .scores
3314 .get(&crate::utils::canonicalize_kmer(&prev_kmer))
3315 .unwrap_or(&1.0);
3316 graph.add_edge(prev_node, node, weight); }
3318 }
3319 }
3320 }
3321 }
3322
3323 graph
3324 }
3325
3326 #[must_use]
3338 pub fn traverse_all_kmers(&self) -> DiGraph<String, f32> {
3339 let cn_kmers = self
3340 .kmers
3341 .iter()
3342 .filter(|(_, record)| record.in_degree() <= 1 && record.out_degree() <= 1)
3343 .map(|(cn_kmer, _)| cn_kmer.clone())
3344 .collect::<Vec<Vec<u8>>>();
3345
3346 let mut visited = HashSet::new();
3347 let mut graph = DiGraph::new();
3348 let mut node_indices = HashMap::<String, NodeIndex>::new(); for cn_kmer in cn_kmers {
3351 let fw_kmer = cn_kmer.clone();
3352 let rc_kmer = fw_kmer.reverse_complement();
3353 for kmer in [fw_kmer, rc_kmer] {
3354 if !visited.contains(&kmer) {
3355 let g = self.traverse_kmers(&kmer);
3356
3357 for node_label in g.node_weights() {
3359 visited.insert(node_label.as_bytes().to_vec());
3361 }
3362
3363 for node_index in g.node_indices() {
3365 let node_label = g.node_weight(node_index).unwrap().clone();
3366 if !node_indices.contains_key(&node_label) {
3367 let new_node_idx = graph.add_node(node_label.clone());
3368 node_indices.insert(node_label.clone(), new_node_idx);
3369 }
3370 }
3371
3372 for edge in g.edge_references() {
3374 let (source, target) = (edge.source(), edge.target());
3375 let source_label = g.node_weight(source).unwrap();
3376 let target_label = g.node_weight(target).unwrap();
3377
3378 let new_source = *node_indices
3379 .get(source_label)
3380 .expect("Source node should exist");
3381 let new_target = *node_indices
3382 .get(target_label)
3383 .expect("Target node should exist");
3384
3385 if !graph.contains_edge(new_source, new_target) {
3386 graph.add_edge(new_source, new_target, *edge.weight());
3387 }
3388 }
3389 }
3390 }
3391 }
3392
3393 graph
3394 }
3395
3396 #[must_use]
3408 pub fn traverse_all_contigs(&self) -> DiGraph<String, f32> {
3409 let cn_kmers = self
3410 .kmers
3411 .iter()
3412 .filter(|(_, record)| record.in_degree() <= 1 && record.out_degree() <= 1)
3413 .map(|(cn_kmer, _)| cn_kmer.clone())
3414 .collect::<Vec<Vec<u8>>>();
3415
3416 let mut visited = HashSet::new();
3417 let mut graph = DiGraph::new();
3418
3419 for cn_kmer in cn_kmers {
3420 if !visited.contains(&cn_kmer) {
3421 let g = self.traverse_contigs(&cn_kmer);
3422
3423 g.node_weights().for_each(|node| {
3424 for kmer in node.as_bytes().kmers(self.kmer_size as u8) {
3425 visited.insert(crate::utils::canonicalize_kmer(&kmer));
3426 }
3427 });
3428
3429 for node_index in g.node_indices() {
3431 let node_weight = g.node_weight(node_index).unwrap();
3432 graph.add_node(node_weight.clone());
3433 }
3434
3435 for edge in g.edge_references() {
3437 let (source, target) = (edge.source(), edge.target());
3438 let source_weight = g.node_weight(source).unwrap();
3439 let target_weight = g.node_weight(target).unwrap();
3440
3441 let new_source = graph
3442 .node_indices()
3443 .find(|&i| graph[i] == *source_weight)
3444 .unwrap();
3445 let new_target = graph
3446 .node_indices()
3447 .find(|&i| graph[i] == *target_weight)
3448 .unwrap();
3449
3450 graph.add_edge(new_source, new_target, *edge.weight());
3451 }
3452 }
3453 }
3454
3455 graph
3456 }
3457}
3458
3459fn traverse_read_graph(graph: &petgraph::Graph<String, f64>, min_length: usize) -> Vec<Vec<u8>> {
3460 let mut corrected_seqs = Vec::new();
3462 let mut visited = HashSet::new();
3463 for node in graph.node_indices() {
3464 if !visited.contains(&node) {
3465 let mut current_seq = Vec::new();
3466 let mut current_node = node;
3467
3468 loop {
3469 visited.insert(current_node);
3470 if current_seq.is_empty() {
3471 current_seq.push(graph[current_node].clone());
3472 } else {
3473 current_seq.push(graph[current_node].chars().last().unwrap().to_string());
3474 }
3475
3476 let outgoing_edges: Vec<_> = graph
3477 .edges_directed(current_node, petgraph::Direction::Outgoing)
3478 .collect();
3479 if outgoing_edges.len() == 1 {
3480 let next_node = outgoing_edges[0].target();
3481 if !visited.contains(&next_node) {
3482 current_node = next_node;
3483 } else {
3484 break;
3485 }
3486 } else {
3487 break;
3488 }
3489 }
3490
3491 if current_seq.len() > min_length {
3492 let contig = current_seq.join("").as_bytes().to_vec();
3493 corrected_seqs.push(contig);
3494 }
3495 }
3496 }
3497 corrected_seqs
3498}
3499
3500fn navigate_backward(
3501 node: NodeIndex,
3502 steps: u8,
3503 graph: &petgraph::Graph<String, f64>,
3504) -> Option<NodeIndex> {
3505 let mut current_node = node;
3506 let mut steps_backward = 0;
3507
3508 while steps_backward < steps {
3509 let incoming_edges: Vec<_> = graph
3510 .edges_directed(current_node, petgraph::Direction::Incoming)
3511 .collect();
3512 if incoming_edges.is_empty() {
3513 return None;
3514 }
3515 current_node = incoming_edges[0].source();
3516 steps_backward += 1;
3517 }
3518
3519 if steps_backward == steps {
3520 Some(current_node)
3521 } else {
3522 None
3523 }
3524}
3525
3526fn navigate_forward(
3527 node: NodeIndex,
3528 steps: u8,
3529 graph: &petgraph::Graph<String, f64>,
3530) -> Option<NodeIndex> {
3531 let mut current_node = node;
3532 let mut steps_forward = 0;
3533
3534 while steps_forward < steps {
3535 let outgoing_edges: Vec<_> = graph
3536 .edges_directed(current_node, petgraph::Direction::Outgoing)
3537 .collect();
3538 if outgoing_edges.is_empty() {
3539 return None;
3540 }
3541 current_node = outgoing_edges[0].source();
3542 steps_forward += 1;
3543 }
3544
3545 if steps_forward == steps {
3546 Some(current_node)
3547 } else {
3548 None
3549 }
3550}
3551
3552#[must_use]
3571pub fn find_superbubble(
3572 graph: &DiGraph<String, f32>,
3573 s: NodeIndex,
3574 direction: petgraph::Direction,
3575) -> Option<(NodeIndex, NodeIndex, Vec<NodeIndex>)> {
3576 let mut seen = HashSet::new();
3577 let mut visited = HashSet::new();
3578 let mut nodes_inside = Vec::new();
3579
3580 seen.insert(s.index());
3582
3583 let mut stack = vec![(s, direction)];
3584 while !stack.is_empty() {
3585 let (v, v_direction) = stack.pop().unwrap();
3586 visited.insert(v.index());
3587
3588 nodes_inside.push(v);
3589
3590 seen.remove(&v.index());
3592
3593 let children = match v_direction {
3594 petgraph::Direction::Incoming => graph
3595 .neighbors_directed(v, petgraph::Direction::Incoming)
3596 .collect::<Vec<_>>(),
3597 petgraph::Direction::Outgoing => graph
3598 .neighbors_directed(v, petgraph::Direction::Outgoing)
3599 .collect::<Vec<_>>(),
3600 };
3601
3602 if children.is_empty() {
3603 break;
3604 }
3605
3606 for u in children {
3607 let (u_child_direction, u_parents) = if v_direction == petgraph::Direction::Outgoing {
3608 (
3609 petgraph::Direction::Outgoing,
3610 graph
3611 .neighbors_directed(u, petgraph::Direction::Incoming)
3612 .collect::<Vec<_>>(),
3613 )
3614 } else {
3615 (
3616 petgraph::Direction::Incoming,
3617 graph
3618 .neighbors_directed(u, petgraph::Direction::Outgoing)
3619 .collect::<Vec<_>>(),
3620 )
3621 };
3622
3623 if u.index() == s.index() {
3624 stack.clear();
3625 break;
3626 }
3627
3628 seen.insert(u.index());
3629
3630 if u_parents.iter().all(|&n| visited.contains(&n.index())) {
3631 stack.push((u, u_child_direction));
3632 }
3633 }
3634
3635 if stack.len() == 1 && seen.len() == 1 {
3636 let (t, _) = stack.pop().unwrap();
3637 nodes_inside.push(t);
3638
3639 if nodes_inside.len() == 2 {
3640 break;
3641 }
3642
3643 nodes_inside.retain(|&x| x != s && x != t);
3644 return Some((s, t, nodes_inside));
3645 }
3646 }
3647
3648 None
3649}
3650
3651#[must_use]
3652pub fn find_all_superbubbles_old(
3653 graph: &petgraph::Graph<String, f32>,
3654) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec<NodeIndex>> {
3655 let mut bubbles = LinkedHashMap::new();
3656
3657 let mut visited: HashSet<NodeIndex> = HashSet::new();
3658 for n in graph.node_indices() {
3659 for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] {
3661 let bubble = find_superbubble(&graph, n, d);
3662
3663 if let Some(bubble) = bubble {
3664 visited.extend(bubble.2.iter().cloned());
3665
3666 let key_fwd = (bubble.0, bubble.1);
3667 let key_rev = (bubble.1, bubble.0);
3668
3669 if !bubbles.contains_key(&key_fwd) && !bubbles.contains_key(&key_rev) {
3670 bubbles.insert((bubble.0, bubble.1), bubble.2);
3671 }
3672 }
3673 }
3674 }
3676
3677 bubbles
3678}
3679
3680pub fn find_all_superbubbles(
3681 graph: &petgraph::Graph<String, f32>,
3682) -> LinkedHashMap<(NodeIndex, NodeIndex), Vec<NodeIndex>> {
3683 let mut bubbles = LinkedHashMap::new();
3684 let mut processed_pairs = HashSet::new();
3685
3686 for n in graph.node_indices() {
3687 for d in [petgraph::Direction::Outgoing, petgraph::Direction::Incoming] {
3688 if let Some((start, end, interior)) = find_superbubble(graph, n, d) {
3689 let pair_forward = (start.index(), end.index());
3691 let pair_reverse = (end.index(), start.index());
3692
3693 if !processed_pairs.contains(&pair_forward)
3695 && !processed_pairs.contains(&pair_reverse)
3696 {
3697 bubbles.insert((start, end), interior);
3698 processed_pairs.insert(pair_forward);
3699 processed_pairs.insert(pair_reverse);
3700 }
3701 }
3702 }
3703 }
3704
3705 bubbles
3706}
3707
3708#[cfg(test)]
3709mod tests {
3710 use super::*;
3711 use crate::edges::Edges;
3712
3713 use petgraph::data::DataMap;
3714 use rand::{Rng, SeedableRng};
3715
3716 use std::collections::BTreeMap;
3717 use std::env;
3718 use std::fs::File;
3719 use std::io::{BufRead, BufReader, Write};
3720 use std::process::{Command, Stdio};
3721
3722 use flate2::read::GzDecoder;
3723
3724 use proptest::prelude::*;
3725
3726 fn get_test_genome() -> Vec<u8> {
3728 "ACTGATTTCGATGCGATGCGATGCCACGGTGG".as_bytes().to_vec()
3729 }
3730
3731 fn generate_genome_with_tandem_repeats(
3744 flank_length: usize,
3745 repeat_length: usize,
3746 num_repeats: usize,
3747 seed: u64,
3748 ) -> Vec<u8> {
3749 let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(seed);
3750 let alphabet = b"ACGT";
3751
3752 let mut left_flank = Vec::with_capacity(flank_length);
3753 for _ in 0..flank_length {
3754 let idx = rng.gen_range(0..4);
3755 left_flank.push(alphabet[idx]);
3756 }
3757
3758 let mut repeat = Vec::with_capacity(repeat_length);
3759 for _ in 0..repeat_length {
3760 let idx = rng.gen_range(0..4);
3761 repeat.push(alphabet[idx]);
3762 }
3763
3764 let mut right_flank = Vec::with_capacity(flank_length);
3765 for _ in 0..flank_length {
3766 let idx = rng.gen_range(0..4);
3767 right_flank.push(alphabet[idx]);
3768 }
3769
3770 let mut genome = Vec::new();
3771 genome.extend_from_slice(&left_flank);
3772 for _ in 0..num_repeats {
3773 genome.extend_from_slice(&repeat);
3774 }
3775 genome.extend_from_slice(&right_flank);
3776
3777 genome
3778 }
3779
3780 fn assemble_with_mccortex(
3782 k: usize,
3783 input_genome: &Vec<u8>,
3784 build_links: bool,
3785 use_cache: bool,
3786 ) -> (BTreeMap<String, String>, Links) {
3787 let current_dir = env::current_dir().unwrap();
3788 let test_dir = current_dir.join("tests/test_data");
3789 let test_dir_str = test_dir.to_str().unwrap();
3790
3791 let copied_genome = input_genome.clone();
3792 let md5_hash = format!("{:x}", md5::compute(&copied_genome));
3793
3794 let genome_path = test_dir.join(format!("test.{}.k_{}.l_{}.fa", md5_hash, k, build_links));
3795 let links_path = test_dir.join(format!(
3796 "links.{}.k_{}.l_{}.ctp.gz",
3797 md5_hash, k, build_links
3798 ));
3799 let contigs_path =
3800 test_dir.join(format!("contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links));
3801
3802 if !use_cache || !contigs_path.exists() || !links_path.exists() {
3803 let mut genome_file = File::create(genome_path).expect("Unable to create file");
3804
3805 writeln!(genome_file, ">genome").expect("Unable to write to file");
3806 writeln!(genome_file, "{}", String::from_utf8(copied_genome).unwrap())
3807 .expect("Unable to write to file");
3808
3809 Command::new("docker")
3810 .args(&[
3811 "run",
3812 "-v",
3813 &format!("{}:/data", test_dir_str),
3814 "-it",
3815 "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3816 "mccortex31",
3817 "build",
3818 "-f",
3819 "-m",
3820 "1g",
3821 "-k",
3822 &format!("{}", k),
3823 "-s",
3824 "test",
3825 "-1",
3826 &format!("/data/test.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3827 &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3828 ])
3829 .stdout(Stdio::null())
3830 .stderr(Stdio::null())
3831 .status()
3832 .expect("failed to execute process");
3833
3834 if build_links {
3835 Command::new("docker")
3836 .args(&[
3837 "run",
3838 "-v",
3839 &format!("{}:/data", test_dir_str),
3840 "-it",
3841 "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3842 "mccortex31",
3843 "thread",
3844 "-f",
3845 "-m",
3846 "1g",
3847 "-w",
3848 "-o",
3849 &format!("/data/links.{}.k_{}.l_{}.ctp.gz", md5_hash, k, build_links),
3850 "-1",
3851 &format!("/data/test.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3852 &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3853 ])
3854 .stdout(Stdio::null())
3855 .stderr(Stdio::null())
3856 .status()
3857 .expect("failed to execute process");
3858
3859 Command::new("docker")
3860 .args(&[
3861 "run",
3862 "-v",
3863 &format!("{}:/data", test_dir_str),
3864 "-it",
3865 "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3866 "mccortex31",
3867 "contigs",
3868 "-f",
3869 "-m",
3870 "1g",
3871 "-p",
3872 &format!("/data/links.{}.k_{}.l_{}.ctp.gz", md5_hash, k, build_links),
3873 "-o",
3874 &format!("/data/contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3875 &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3876 ])
3877 .stdout(Stdio::null())
3878 .stderr(Stdio::null())
3879 .status()
3880 .expect("failed to execute process");
3881 } else {
3882 Command::new("docker")
3883 .args(&[
3884 "run",
3885 "-v",
3886 &format!("{}:/data", test_dir_str),
3887 "-it",
3888 "us.gcr.io/broad-dsp-lrma/lr-mccortex:1.0.0",
3889 "mccortex31",
3890 "contigs",
3891 "-f",
3892 "-m",
3893 "1g",
3894 "-o",
3895 &format!("/data/contigs.{}.k_{}.l_{}.fa", md5_hash, k, build_links),
3896 &format!("/data/test.{}.k_{}.l_{}.ctx", md5_hash, k, build_links),
3897 ])
3898 .stdout(Stdio::null())
3899 .stderr(Stdio::null())
3900 .status()
3901 .expect("failed to execute process");
3902 }
3903 }
3904
3905 let contig_map = parse_mccortex_fasta(contigs_path);
3906 let links = parse_mccortex_ctp_file(links_path);
3907
3908 (contig_map, links)
3909 }
3910
3911 fn parse_mccortex_fasta(contigs_path: PathBuf) -> BTreeMap<String, String> {
3913 let contigs = bio::io::fasta::Reader::from_file(contigs_path).unwrap();
3914 let contig_map: BTreeMap<_, _> = contigs
3915 .records()
3916 .into_iter()
3917 .filter_map(Result::ok)
3918 .map(|r| {
3919 let pieces = r
3920 .desc()
3921 .unwrap()
3922 .split_whitespace()
3923 .flat_map(|s| s.split("=").collect::<Vec<&str>>())
3924 .collect::<Vec<&str>>();
3925
3926 let seed = pieces.chunks(2).find(|pair| pair[0] == "seed").unwrap()[1].to_string();
3927
3928 let seq = String::from_utf8_lossy(r.seq()).to_string();
3929
3930 (seed, seq)
3931 })
3932 .collect();
3933
3934 contig_map
3935 }
3936
3937 fn parse_mccortex_ctp_file(links_path: PathBuf) -> Links {
3939 let mut links = Links::new();
3940
3941 let file = File::open(links_path).expect("Unable to open file");
3942 let decoder = GzDecoder::new(file);
3943 let reader = BufReader::new(decoder);
3944
3945 let lines: Vec<Vec<String>> = reader
3946 .lines()
3947 .map(|line| {
3948 line.unwrap()
3949 .split_whitespace()
3950 .map(|s| s.to_string())
3951 .collect()
3952 })
3953 .collect();
3954
3955 for i in 0..lines.len() - 1 {
3956 let pieces1 = &lines[i];
3957
3958 if pieces1.len() < 1 {
3959 continue;
3960 }
3961
3962 if let Some(first_char_1) = pieces1.get(0).unwrap().chars().next() {
3963 if "ACGT".contains(first_char_1) {
3964 let anchor_kmer = pieces1.get(0).unwrap().as_bytes();
3965 let cn_anchor_vec = crate::utils::canonicalize_kmer(anchor_kmer);
3966 let cn_anchor_kmer = cn_anchor_vec.as_bytes();
3967
3968 for j in i + 1..lines.len() {
3969 let pieces2 = &lines[j];
3970
3971 if pieces2.len() < 1 {
3972 continue;
3973 }
3974
3975 if let Some(first_char_2) = pieces2.get(0).unwrap().chars().next() {
3976 if "ACGT".contains(first_char_2) {
3977 break;
3978 } else if "FR".contains(first_char_2) {
3979 let mut link = Link::new(pieces2.get(0).unwrap() == "F");
3980
3981 let junctions = pieces2.get(3).unwrap();
3982 for junction in junctions.chars() {
3983 link.push_back(junction as u8);
3984 }
3985
3986 if !links.contains_key(cn_anchor_kmer) {
3988 links.insert(cn_anchor_kmer.to_owned(), HashMap::new());
3989 }
3990
3991 if !links.get(cn_anchor_kmer).unwrap().contains_key(&link) {
3992 links.get_mut(cn_anchor_kmer).unwrap().insert(link, 1);
3993 } else {
3994 let linkcov =
3995 *links.get_mut(cn_anchor_kmer).unwrap().get(&link).unwrap();
3996
3997 links
3998 .get_mut(cn_anchor_kmer)
3999 .unwrap()
4000 .insert(link, linkcov.saturating_add(1));
4001 }
4002 }
4003 }
4004 }
4005 }
4006 }
4007 }
4008
4009 links
4010 }
4011
4012 #[test]
4013 fn test_traverse_kmers() {
4014 let genome = get_test_genome();
4015 let fwd_seqs = vec![genome];
4016
4017 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4018 let graph = g.traverse_kmers(b"ATTTC");
4019
4020 assert_eq!(graph.node_count(), 21);
4024 assert_eq!(graph.edge_count(), 21);
4025
4026 let node_labels: Vec<_> = graph.node_weights().cloned().collect();
4028 assert!(node_labels.contains(&"ATTTC".to_string()));
4029 assert!(node_labels.contains(&"TTTCG".to_string()));
4030 assert!(node_labels.contains(&"TTCGA".to_string()));
4031 assert!(node_labels.contains(&"TCGAT".to_string()));
4032 assert!(node_labels.contains(&"CGATG".to_string()));
4033 assert!(node_labels.contains(&"GATGC".to_string()));
4034 assert!(node_labels.contains(&"ATGCC".to_string()));
4035 assert!(node_labels.contains(&"ATGCG".to_string()));
4036 assert!(node_labels.contains(&"TGCGA".to_string()));
4037 assert!(node_labels.contains(&"GCGAT".to_string()));
4038 assert!(node_labels.contains(&"TGCCA".to_string()));
4039 assert!(node_labels.contains(&"GCCAC".to_string()));
4040 assert!(node_labels.contains(&"CCACG".to_string()));
4041 assert!(node_labels.contains(&"CACGG".to_string()));
4042 assert!(node_labels.contains(&"ACGGT".to_string()));
4043 assert!(node_labels.contains(&"CGGTG".to_string()));
4044 assert!(node_labels.contains(&"GGTGG".to_string()));
4045 assert!(node_labels.contains(&"GATTT".to_string()));
4046 assert!(node_labels.contains(&"TGATT".to_string()));
4047 assert!(node_labels.contains(&"CTGAT".to_string()));
4048 assert!(node_labels.contains(&"ACTGA".to_string()));
4049
4050 let edges = graph.edge_indices().collect::<Vec<_>>();
4052 assert_eq!(edges.len(), 21);
4053
4054 let find_node = |label: &str| graph.node_indices().find(|&i| graph[i] == label).unwrap();
4056
4057 assert!(graph.contains_edge(find_node("ATTTC"), find_node("TTTCG")));
4059 assert!(graph.contains_edge(find_node("TTTCG"), find_node("TTCGA")));
4060 assert!(graph.contains_edge(find_node("GATGC"), find_node("ATGCC")));
4061 assert!(graph.contains_edge(find_node("GATGC"), find_node("ATGCG")));
4062 assert!(graph.contains_edge(find_node("CGATG"), find_node("GATGC")));
4063 assert!(graph.contains_edge(find_node("GCCAC"), find_node("CCACG")));
4064 assert!(graph.contains_edge(find_node("ACTGA"), find_node("CTGAT")));
4065 }
4066
4067 fn test_traverse_all_kmers() {
4069 let genome = get_test_genome();
4070 let fwd_seqs = vec![genome];
4071
4072 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4073 let graph = g.traverse_all_kmers();
4074
4075 let mut gfa_output = Vec::new();
4077 crate::utils::write_gfa(&mut gfa_output, &graph).unwrap();
4078
4079 let gfa_string = String::from_utf8(gfa_output).unwrap();
4081 println!("{}", gfa_string);
4082 }
4083
4084 fn test_traverse_all_contigs() {
4086 let genome = get_test_genome();
4087 let fwd_seqs = vec![genome];
4088
4089 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4090 let graph = g.traverse_all_contigs();
4091
4092 let mut gfa_output = Vec::new();
4094 crate::utils::write_gfa(&mut gfa_output, &graph).unwrap();
4095
4096 let gfa_string = String::from_utf8(gfa_output).unwrap();
4098 println!("{}", gfa_string);
4099 }
4100
4101 #[test]
4102 fn test_traverse_contigs() {
4103 let genome = get_test_genome();
4104 let fwd_seqs = vec![genome];
4105
4106 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4107 let graph = g.traverse_contigs(b"CCACG");
4108
4109 assert_eq!(graph.node_count(), 3);
4115 assert_eq!(graph.edge_count(), 4);
4116
4117 let node_labels: Vec<_> = graph.node_weights().cloned().collect();
4119 assert!(node_labels.contains(&"CGATGCCACGGTGG".to_string()));
4120 assert!(node_labels.contains(&"ACTGATTTCGAT".to_string()));
4121 assert!(node_labels.contains(&"CGATGCGAT".to_string()));
4122
4123 let edges = graph.edge_indices().collect::<Vec<_>>();
4125 assert_eq!(edges.len(), 4);
4126
4127 let find_node = |label: &str| graph.node_indices().find(|&i| graph[i] == label).unwrap();
4129
4130 assert!(graph.contains_edge(find_node("ACTGATTTCGAT"), find_node("CGATGCCACGGTGG")));
4132 assert!(graph.contains_edge(find_node("ACTGATTTCGAT"), find_node("CGATGCGAT")));
4133 assert!(graph.contains_edge(find_node("CGATGCGAT"), find_node("CGATGCCACGGTGG")));
4134 assert!(graph.contains_edge(find_node("CGATGCGAT"), find_node("CGATGCGAT")));
4135 }
4136
4137 #[test]
4138 fn test_prev_kmers() {
4139 let genome = get_test_genome();
4140 let fwd_seqs = vec![genome];
4141
4142 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4143
4144 let prev_kmers_1 = g.prev_kmers(b"TTTCG");
4145 assert_eq!(prev_kmers_1.len(), 1);
4146 assert_eq!(prev_kmers_1[0], b"ATTTC");
4147
4148 let prev_kmers_2: HashSet<Vec<u8>> = g.prev_kmers(b"CGATG").into_iter().collect();
4149 assert_eq!(prev_kmers_2.len(), 2);
4150 assert!(prev_kmers_2.contains(&b"TCGAT".to_vec()));
4151 assert!(prev_kmers_2.contains(&b"GCGAT".to_vec()));
4152
4153 let prev_kmers_3 = g.prev_kmers(b"ACTGA");
4154 assert_eq!(prev_kmers_3.len(), 0);
4155 }
4156
4157 #[test]
4158 fn test_next_kmers() {
4159 let genome = get_test_genome();
4160 let fwd_seqs = vec![genome];
4161
4162 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4163
4164 let next_kmers_1 = g.next_kmers(b"TTTCG");
4165 assert_eq!(next_kmers_1.len(), 1);
4166 assert_eq!(next_kmers_1[0], b"TTCGA");
4167
4168 let next_kmers_2: HashSet<Vec<u8>> = g.next_kmers(b"GATGC").into_iter().collect();
4169 assert_eq!(next_kmers_2.len(), 2);
4170 assert!(next_kmers_2.contains(&b"ATGCG".to_vec()));
4171 assert!(next_kmers_2.contains(&b"ATGCC".to_vec()));
4172
4173 let next_kmers_3 = g.next_kmers(b"GGTGG");
4174 assert_eq!(next_kmers_3.len(), 0);
4175 }
4176
4177 #[test]
4178 fn test_from_sequences() {
4179 let genome = get_test_genome();
4180 let fwd_seqs = vec![genome];
4181
4182 let g = LdBG::from_sequences(String::from("test"), 5, &fwd_seqs);
4183
4184 let mut exp_graph = KmerGraph::new();
4185 exp_graph.insert(
4186 b"AAATC".to_vec(),
4187 Record::new(1, Some(Edges::from_string("..g.A..."))),
4188 );
4189 exp_graph.insert(
4190 b"AATCA".to_vec(),
4191 Record::new(1, Some(Edges::from_string("a.....G."))),
4192 );
4193 exp_graph.insert(
4194 b"ACCGT".to_vec(),
4195 Record::new(1, Some(Edges::from_string(".c....G."))),
4196 );
4197 exp_graph.insert(
4198 b"ACTGA".to_vec(),
4199 Record::new(1, Some(Edges::from_string(".......T"))),
4200 );
4201 exp_graph.insert(
4202 b"ATCAG".to_vec(),
4203 Record::new(1, Some(Edges::from_string("a......T"))),
4204 );
4205 exp_graph.insert(
4206 b"ATCGA".to_vec(),
4207 Record::new(1, Some(Edges::from_string(".c..A..."))),
4208 );
4209 exp_graph.insert(
4210 b"ATCGC".to_vec(),
4211 Record::new(2, Some(Edges::from_string(".c..A..."))),
4212 );
4213 exp_graph.insert(
4214 b"ATGCC".to_vec(),
4215 Record::new(1, Some(Edges::from_string("..g.A..."))),
4216 );
4217 exp_graph.insert(
4218 b"ATGCG".to_vec(),
4219 Record::new(2, Some(Edges::from_string("..g.A..."))),
4220 );
4221 exp_graph.insert(
4222 b"ATTTC".to_vec(),
4223 Record::new(1, Some(Edges::from_string("..g...G."))),
4224 );
4225 exp_graph.insert(
4226 b"CACCG".to_vec(),
4227 Record::new(1, Some(Edges::from_string(".c.....T"))),
4228 );
4229 exp_graph.insert(
4230 b"CACGG".to_vec(),
4231 Record::new(1, Some(Edges::from_string(".c.....T"))),
4232 );
4233 exp_graph.insert(
4234 b"CATCG".to_vec(),
4235 Record::new(3, Some(Edges::from_string("..g.AC.."))),
4236 );
4237 exp_graph.insert(
4238 b"CCACC".to_vec(),
4239 Record::new(1, Some(Edges::from_string("......G."))),
4240 );
4241 exp_graph.insert(
4242 b"CCACG".to_vec(),
4243 Record::new(1, Some(Edges::from_string("..g...G."))),
4244 );
4245 exp_graph.insert(
4246 b"CGAAA".to_vec(),
4247 Record::new(1, Some(Edges::from_string("...t...T"))),
4248 );
4249 exp_graph.insert(
4250 b"GATGC".to_vec(),
4251 Record::new(3, Some(Edges::from_string(".c...CG."))),
4252 );
4253 exp_graph.insert(
4254 b"GCCAC".to_vec(),
4255 Record::new(1, Some(Edges::from_string("...t..G."))),
4256 );
4257 exp_graph.insert(
4258 b"TCGAA".to_vec(),
4259 Record::new(1, Some(Edges::from_string("a...A..."))),
4260 );
4261 exp_graph.insert(
4262 b"TCGCA".to_vec(),
4263 Record::new(2, Some(Edges::from_string("a......T"))),
4264 );
4265 exp_graph.insert(
4266 b"TGCCA".to_vec(),
4267 Record::new(1, Some(Edges::from_string("a....C.."))),
4268 );
4269
4270 let mut exp_links = Links::new();
4271 exp_links.insert(
4272 b"ATGCC".to_vec(),
4273 HashMap::from([(Link::from_junctions(false, b"CCA"), 1)]),
4274 );
4275 exp_links.insert(
4276 b"ATCGC".to_vec(),
4277 HashMap::from([
4278 (Link::from_junctions(false, b"GC"), 1),
4279 (Link::from_junctions(false, b"C"), 1),
4280 ]),
4281 );
4282 exp_links.insert(
4283 b"ATGCG".to_vec(),
4284 HashMap::from([
4285 (Link::from_junctions(false, b"CA"), 1),
4286 (Link::from_junctions(false, b"A"), 1),
4287 ]),
4288 );
4289 exp_links.insert(
4290 b"ATCGA".to_vec(),
4291 HashMap::from([(Link::from_junctions(false, b"GGC"), 1)]),
4292 );
4293
4294 for (kmer, record) in g.kmers {
4295 assert!(exp_graph.contains_key(kmer.as_bytes()));
4296
4297 let exp_record = exp_graph.get(kmer.as_bytes()).unwrap();
4298 assert!(record.coverage() == exp_record.coverage());
4299 assert!(record.incoming_edges() == exp_record.incoming_edges());
4300 assert!(record.outgoing_edges() == exp_record.outgoing_edges());
4301 }
4302
4303 for (kmer, link_map) in &g.links {
4304 assert!(exp_links.contains_key(kmer));
4305 assert!(link_map.len() == exp_links.get(kmer).unwrap().len());
4306 for (link, count) in link_map {
4307 assert!(exp_links.get(kmer).unwrap().contains_key(link));
4308 assert!(count == exp_links.get(kmer).unwrap().get(link).unwrap());
4309 }
4310 }
4311 }
4312
4313 #[test]
4314 fn test_assemble() {
4315 let fw_genome = get_test_genome();
4316 let rc_genome = fw_genome.reverse_complement();
4317
4318 let fwd_seqs = vec![fw_genome.clone()];
4319
4320 let g =
4321 LdBG::from_sequences(String::from("test"), 5, &fwd_seqs).build_links(&fwd_seqs, false);
4322
4323 assert!(fw_genome == g.assemble(b"ACTGA"));
4325 assert!(fw_genome == g.assemble(b"TTCGA"));
4326
4327 assert!(fw_genome == g.assemble(b"TGCCA"));
4328 assert!(fw_genome == g.assemble(b"GGTGG"));
4329
4330 assert!(rc_genome == g.assemble(b"ACTGA".to_vec().reverse_complement().as_bytes()));
4332 assert!(rc_genome == g.assemble(b"TTCGA".to_vec().reverse_complement().as_bytes()));
4333 assert!(rc_genome == g.assemble(b"TGCCA".to_vec().reverse_complement().as_bytes()));
4334 assert!(rc_genome == g.assemble(b"GGTGG".to_vec().reverse_complement().as_bytes()));
4335 }
4336
4337 #[test]
4338 fn test_assemble_until() {
4339 let fw_genome = get_test_genome();
4340 let fwd_seqs = vec![fw_genome.clone()];
4341
4342 let g =
4343 LdBG::from_sequences(String::from("test"), 5, &fwd_seqs).build_links(&fwd_seqs, false);
4344
4345 let mut contig1 = b"ACTGA".to_vec();
4346 let _ = g.assemble_forward_until(
4347 &mut contig1,
4348 b"ACTGA",
4349 &HashSet::from([b"TTCGA".to_vec()]),
4350 100,
4351 );
4352 assert_eq!(contig1, b"ACTGATTTCGA".to_vec());
4353
4354 let mut contig2 = b"GGTGG".to_vec();
4355 let _ = g.assemble_backward_until(
4356 &mut contig2,
4357 b"GGTGG",
4358 &HashSet::from([b"TGCCA".to_vec()]),
4359 100,
4360 );
4361 assert_eq!(contig2, b"TGCCACGGTGG".to_vec());
4362 }
4363
4364 #[test]
4365 fn test_correct_seq() {
4366 let genome = get_test_genome();
4367 let g = LdBG::from_sequence("test".to_string(), 5, &genome);
4368
4369 let mut uncorrected_seq_map = BTreeMap::new();
4370
4371 uncorrected_seq_map.insert(
4373 b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec(),
4374 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4375 );
4376 uncorrected_seq_map.insert(
4377 b"ACTGAATTCGATGCGATGCGATGCCACGGTGG".to_vec(),
4378 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4379 );
4380 uncorrected_seq_map.insert(
4381 b"ACTGATATCGATGCGATGCGATGCCACGGTGG".to_vec(),
4382 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4383 );
4384 uncorrected_seq_map.insert(
4385 b"ACTGATTTCGATGCGATGCGATGCCATGGTGG".to_vec(),
4386 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4387 );
4388 uncorrected_seq_map.insert(
4389 b"ACTGATTTCGATGCGATGCGATGCCATCGGTGG".to_vec(),
4390 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4391 );
4392 uncorrected_seq_map.insert(
4393 b"ACTGATTTCGATGCGATGCGATGCCGGTGG".to_vec(),
4394 vec![b"ACTGATTTCGATGCGATGCGATGCCACGGTGG".to_vec()],
4395 );
4396
4397 let g1 = g.traverse_all_kmers();
4398
4399 for (uncorrected_seq, expected_seqs) in uncorrected_seq_map.iter() {
4400 let corrected_seqs = g.correct_seq(&g1, &uncorrected_seq);
4401
4402 assert_eq!(corrected_seqs.len(), expected_seqs.len());
4403 }
4408 }
4409
4410 proptest! {
4411 #[test]
4412 fn test_assemble_random_genomes(
4413 repeat_length in (3..18usize).prop_filter("Increment by 3", |v| v % 3 == 0),
4414 num_repeats in (2..=3usize),
4415 k in (11..=17usize).prop_filter("Must be odd", |v| v % 2 == 1)
4416 ) {
4417 let random_genome = generate_genome_with_tandem_repeats(500, repeat_length, num_repeats, 0);
4418
4419 let g = LdBG::from_sequences(String::from("test"), k, &vec!(random_genome.clone()))
4420 .build_links(&vec!(random_genome.clone()), false);
4421
4422 let hd_links = g.links.clone();
4423
4424 let (mc_contigs, mc_links) = assemble_with_mccortex(k, &random_genome, true, true);
4425 let mc_links: BTreeMap<Vec<u8>, BTreeMap<Link, u16>> = mc_links.into_iter()
4426 .map(|(k, v)| (k, v.into_iter().collect()))
4427 .collect();
4428
4429 let all_kmers: HashSet<_> = mc_links.keys().chain(hd_links.keys()).collect();
4430
4431 for kmer in all_kmers {
4432 let mc_links_map = mc_links.get(kmer);
4433 let hd_links_map = hd_links.get(kmer);
4434
4435 match (mc_links_map, hd_links_map) {
4436 (Some(mc_map), Some(hd_map)) => {
4437 assert_eq!(mc_map.len(), hd_map.len(), "Maps have different lengths for kmer: {}", String::from_utf8_lossy(kmer));
4438 for mc_link in mc_map.keys() {
4439 assert!(hd_map.contains_key(mc_link), "Link {} not in hd map for kmer: {}", mc_link, String::from_utf8_lossy(kmer));
4440 }
4441 }
4442 (Some(mc_map), None) => {
4443 for (link, count) in mc_map {
4444 println!("mc only: {} {} {}", String::from_utf8_lossy(kmer), link, count);
4445 }
4446 panic!("There are links unique to reference implementation.");
4447 }
4448 (None, Some(hd_map)) => {
4449 for (link, count) in hd_map {
4450 println!("hd only: {} {} {}", String::from_utf8_lossy(kmer), link, count);
4451 }
4452 panic!("There are links unique to skydive implementation.");
4453 }
4454 (None, None) => {}
4455 }
4456 }
4457
4458 for (seed, mc_contig) in mc_contigs {
4459 let hd_contig = String::from_utf8(g.assemble(seed.as_bytes())).expect("Invalid UTF-8 sequence");
4460
4461 assert_eq!(mc_contig, hd_contig);
4462 }
4463 }
4464 }
4465
4466 fn create_bubblegun_graph() -> petgraph::Graph<String, f32> {
4470 let mut graph = petgraph::Graph::<String, f32>::new();
4472
4473 let mut node_indices = std::collections::HashMap::new();
4475
4476 let gfa_nodes = vec![
4478 ("1", "CCCAACAAGTG"),
4479 ("7", "AACAAGTGTACTCATTG"),
4480 ("25", "ACTCATTGACG"),
4481 ("31", "CATTGACGTAAATTTGGTGCGGGCCTCAAGGTGTCCA"),
4482 ("89", "GGTGTCCATTGGGGTC"),
4483 ("105", "TTGGGGTCAGCACAAAT"),
4484 ("123", "GCACAAATTGCCA"),
4485 ("163", "CATTGACGAGGCACCC"),
4486 ("179", "AGGCACCCGTGCATTAG"),
4487 ("197", "TGCATTAGGCAGGGTGT"),
4488 ("215", "CAGGGTGTCCA"),
4489 ("237", "TTGGGGTCTGCACAAAT"),
4490 ("271", "AACAAGTGAACTCATTG"),
4491 ("311", "AGGCACCCCTGCATTAG"),
4492 ("427", "CATTGACGCAACCGGCATTGAATACACAGGGTGT"),
4493 ];
4494
4495 for (id, seq) in gfa_nodes {
4496 let node_index = graph.add_node(seq.to_string());
4497 node_indices.insert(id.to_string(), node_index);
4498 }
4499
4500 let gfa_edges = vec![
4502 ("1", "7"),
4503 ("1", "271"),
4504 ("7", "25"),
4505 ("25", "31"),
4506 ("25", "427"),
4507 ("25", "163"),
4508 ("31", "89"),
4509 ("89", "237"),
4510 ("89", "105"),
4511 ("105", "123"),
4512 ("163", "179"),
4513 ("163", "311"),
4514 ("179", "197"),
4515 ("197", "215"),
4516 ("215", "89"),
4517 ("237", "123"),
4518 ("271", "25"),
4519 ("311", "197"),
4520 ("427", "215"),
4521 ];
4522
4523 for (from, to) in gfa_edges {
4524 if let (Some(&from_index), Some(&to_index)) =
4525 (node_indices.get(from), node_indices.get(to))
4526 {
4527 graph.add_edge(from_index, to_index, 1.0);
4528 }
4529 }
4530 graph
4531 }
4532
4533 #[test]
4534 fn test_find_all_superbubbles() {
4535 let graph = create_bubblegun_graph();
4536
4537 let expected_bubbles = HashMap::from([
4538 ((b"CCCAACAAGTG".to_vec(), b"ACTCATTGACG".to_vec()), 2usize),
4539 (
4541 (b"ACTCATTGACG".to_vec(), b"GGTGTCCATTGGGGTC".to_vec()),
4542 7usize,
4543 ),
4544 (
4546 (b"GGTGTCCATTGGGGTC".to_vec(), b"GCACAAATTGCCA".to_vec()),
4547 2usize,
4548 ),
4549 ]);
4551
4552 let bubbles = find_all_superbubbles(&graph);
4553 assert_eq!(bubbles.len(), expected_bubbles.len());
4554
4555 for ((in_node, out_node), interior) in bubbles {
4556 let in_seq = graph.node_weight(in_node).unwrap().as_bytes().to_vec();
4557 let out_seq = graph.node_weight(out_node).unwrap().as_bytes().to_vec();
4558
4559 let key_fwd = (in_seq.clone(), out_seq.clone());
4560 let key_rev = (out_seq.clone(), in_seq.clone());
4561
4562 assert!(
4563 expected_bubbles.contains_key(&key_fwd) || expected_bubbles.contains_key(&key_rev)
4564 );
4565
4566 if expected_bubbles.contains_key(&key_fwd) {
4567 assert_eq!(expected_bubbles.get(&key_fwd).unwrap(), &interior.len());
4568 } else {
4569 assert_eq!(expected_bubbles.get(&key_rev).unwrap(), &interior.len());
4570 }
4571 }
4572 }
4573
4574 #[test]
4575 fn test_find_all_superbubbles_2() {
4576 let graph = crate::utils::read_gfa("/Users/kiran/repositories/hidive/test.gfa").unwrap();
4577
4578 let bubbles = find_all_superbubbles(&graph);
4579
4580 let start = b"GAGGGAACAGCGACTTC".to_vec();
4581 let end = b"TGGACACAGGCACCTGG".to_vec();
4582 for ((in_node, out_node), interior) in bubbles {
4583 let in_seq = graph.node_weight(in_node).unwrap().as_bytes().to_vec();
4584 let out_seq = graph.node_weight(out_node).unwrap().as_bytes().to_vec();
4585
4586 if in_seq == start && out_seq == end {
4587 println!("Found bubble: {}", interior.len());
4588
4589 for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
4590 &graph,
4591 in_node,
4592 out_node,
4593 0,
4594 Some(interior.len() + 10),
4595 ) {
4596 println!("out path: {}", path.len());
4597 }
4598
4599 for path in petgraph::algo::all_simple_paths::<Vec<_>, _>(
4600 &graph,
4601 out_node,
4602 in_node,
4603 0,
4604 Some(interior.len() + 10),
4605 ) {
4606 println!("in path: {}", path.len());
4607
4608 for node in path {
4609 println!(" - {}", graph.node_weight(node).unwrap());
4610 }
4611 }
4612 }
4613 }
4614 }
4615}