diff --git a/src/obidebruinj/src/debruijn.rs b/src/obidebruinj/src/debruijn.rs index b634b68..6c6ec20 100644 --- a/src/obidebruinj/src/debruijn.rs +++ b/src/obidebruinj/src/debruijn.rs @@ -3,8 +3,9 @@ use hashbrown::HashMap; use obikseq::k; use obikseq::{CanonicalKmer, Kmer, Sequence}; use rayon::prelude::*; -use std::arch::naked_asm; use std::fmt; +use std::mem::needs_drop; +use std::os::unix::raw::gid_t; use std::sync::atomic::{AtomicU8, Ordering}; use xxhash_rust::xxh3::Xxh3Builder; @@ -175,6 +176,156 @@ impl fmt::Display for Node { } } +pub struct WalkState { + kmer: CanonicalKmer, + node: Node, + direct: bool, +} + +impl WalkState { + pub fn new(kmer: CanonicalKmer, node: Node, direct: bool) -> Self { + debug_assert!(!node.is_visited(), "Cannot walk over a visited node"); + Self { kmer, node, direct } + } + + pub fn leavable(&self, graph: &GraphDeBruijn) -> bool { + WalkState { + kmer: self.kmer, + node: self.node, + direct: !self.direct, + } + .reachable(graph) + } + + pub fn reachable(&self, graph: &GraphDeBruijn) -> bool { + if self.direct { + if self.node.can_extend_left() { + let next = self.kmer.into_kmer().push_left(self.node.left_nuc()); + let cnext = next.canonical(); + let dnext = next.raw() == cnext.raw(); + let next_node = Node( + graph + .nodes + .get(&cnext) + .unwrap() + .load(std::sync::atomic::Ordering::Relaxed), + ); + !next_node.is_visited() + && if dnext { + next_node.can_extend_right() + } else { + next_node.can_extend_left() + } + } else { + false + } + } else { + if self.node.can_extend_right() { + let next = self.kmer.into_kmer().push_right(self.node.right_nuc()); + let cnext = next.canonical(); + let dnext = next.raw() == cnext.raw(); + let next_node = Node( + graph + .nodes + .get(&cnext) + .unwrap() + .load(std::sync::atomic::Ordering::Relaxed), + ); + !next_node.is_visited() + && if dnext { + next_node.can_extend_left() + } else { + next_node.can_extend_right() + } + } else { + false + } + } + } + + pub fn walk(&self, graph: &GraphDeBruijn) -> Option { + if self.direct { + if self.node.can_extend_right() { + let next = self.kmer.into_kmer().push_right(self.node.right_nuc()); + let cnext = next.canonical(); + let dnext = next.raw() == cnext.raw(); + let next_node = Node( + graph + .nodes + .get(&cnext) + .unwrap() + .load(std::sync::atomic::Ordering::Relaxed), + ); + if next_node.is_visited() { + None + } else { + if dnext { + if next_node.can_extend_left() { + Some(WalkState { + kmer: cnext, + node: next_node, + direct: dnext, + }) + } else { + None + } + } else { + if next_node.can_extend_right() { + Some(WalkState { + kmer: cnext, + node: next_node, + direct: dnext, + }) + } else { + None + } + } + } else { + None + } + } + } else { + if self.node.can_extend_left() { + let next = self.kmer.into_kmer().push_left(self.node.left_nuc()); + let cnext = next.canonical(); + let dnext = next.raw() == cnext.raw(); + let next_node = Node( + graph + .nodes + .get(&cnext) + .unwrap() + .load(std::sync::atomic::Ordering::Relaxed), + ); + if next_node.is_visited() { + None + } else { + if dnext { + if next_node.can_extend_right() { + Some(WalkState { + kmer: cnext, + node: next_node, + direct: dnext, + }) + } else { + None + } + } else { + if next_node.can_extend_left() { + Some(WalkState { + kmer: cnext, + node: next_node, + direct: dnext, + }) + } else { + None + } + } + } else { + None + } + } +} + // ── GraphDeBruijn ───────────────────────────────────────────────────────────── pub struct GraphDeBruijn { @@ -260,6 +411,52 @@ impl GraphDeBruijn { } } + pub fn start_walk(&self, kmer: CanonicalKmer) -> WalkState { + let node = Node( + self.nodes + .get(&kmer) + .expect("Cannot start a walk from not indexed kmer") + .load(Ordering::Relaxed), + ); + debug_assert!( + !node.is_visited(), + "Cannot start a walk from a visited node" + ); + WalkState::new(kmer, node, true) + } + + pub fn try_start_walk(&self, kmer: CanonicalKmer) -> Option { + let node = Node(self.nodes.get(&kmer)?.load(Ordering::Relaxed)); + if node.is_visited() { + return None; + } + Some(WalkState::new(kmer, node, true)) + } + + pub fn walk(&self, step: WalkState) -> Option { + if !step.leavable(self) { + return None; + } + let node = step.node; + let kmer = step.kmer.into_kmer(); + let n_kmer = if step.direct { + kmer.push_right(node.right_nuc()) + } else { + kmer.push_left(node.left_nuc()) + }; + let n_ckmer = n_kmer.canonical(); + let n_direct = n_ckmer.raw() == n_kmer.raw(); + let n_node = if let Some(node_val) = self.nodes.get(&n_ckmer) { + Node(node_val.load(Ordering::Relaxed)) + } else { + unreachable!() + }; + if n_node.is_visited() { + return None; + } + Some(WalkState::new(n_ckmer, n_node, n_direct)) + } + fn next_unitig_kmer(&self, kmer: Kmer) -> Option { let canonical = kmer.canonical(); let node = Node(self.nodes.get(&canonical)?.load(Ordering::Relaxed)); @@ -454,17 +651,12 @@ impl GraphDeBruijn { } fn is_start(&self, query: CanonicalKmer, node: Node) -> bool { - if !node.can_extend_left() { - return true; + !WalkState { + kmer: query, + node, + direct: true, } - let pred = query.into_kmer().push_left(node.left_nuc()).canonical(); - self.nodes - .get(&pred) - .map(|a| { - let pred_node = Node(a.load(Ordering::Acquire)); - !pred_node.can_extend_right() - }) - .unwrap_or(false) + .reachable(self) } pub fn try_for_each_unitig(&self, f: F) -> Result<(), E> diff --git a/src/obidebruinj/src/tests/debruijn.rs b/src/obidebruinj/src/tests/debruijn.rs index 24312b0..91c41a6 100644 --- a/src/obidebruinj/src/tests/debruijn.rs +++ b/src/obidebruinj/src/tests/debruijn.rs @@ -1,5 +1,5 @@ use super::*; -use obikseq::{k, set_k}; +use obikseq::{k, set_k, unitig::Unitig}; // Build a graph from an ASCII sequence, inserting all canonical k-mers. fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn { @@ -22,6 +22,16 @@ fn canonical_kmers(seq: &[u8]) -> Vec { v } +fn collect_unitigs(g: &GraphDeBruijn) -> Vec { + let mut unitigs = Vec::new(); + g.try_for_each_unitig(|nuc_iter| -> Result<(), std::convert::Infallible> { + unitigs.push(nuc_iter.collect()); + Ok(()) + }) + .unwrap(); + unitigs +} + // ── push / canonicalisation ─────────────────────────────────────────────── #[test] @@ -76,7 +86,7 @@ fn degrees_linear_chain_extensions() { let seq = b"AAAAGGGG"; let g = graph_from_ascii(seq); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig"); // seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len() assert_eq!( @@ -123,7 +133,7 @@ fn unitig_roundtrip_linear() { } println!("Les unitig:"); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); for unitig in &unitigs { println!("{}", String::from_utf8_lossy(&unitig.to_ascii())); } @@ -149,7 +159,7 @@ fn unitig_roundtrip_longer_sequence() { let seq = b"ACGTGGCTATCGAC"; let g = graph_from_ascii(seq); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); let mut got = kmers_from_unitigs(&unitigs); let mut expected = canonical_kmers(seq); got.sort_unstable(); @@ -166,7 +176,7 @@ fn unitig_isolated_node() { let mut g = GraphDeBruijn::new(); g.push(kmer.canonical()); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); assert_eq!(unitigs.len(), 1); assert_eq!(unitigs[0].seql(), k); } @@ -191,7 +201,7 @@ fn unitig_two_truly_distinct_isolated_nodes() { g.push(Kmer::from_ascii(b"AAAAC").unwrap().canonical()); g.push(Kmer::from_ascii(b"GGGGT").unwrap().canonical()); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); // Each isolated node → one unitig of length k assert_eq!(unitigs.len(), 2); assert!(unitigs.iter().all(|u| u.seql() == k)); @@ -206,7 +216,7 @@ fn no_kmer_lost_or_duplicated() { let seq = b"ACGTACGTACGTTTTTACGTACGT"; let g = graph_from_ascii(seq); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); let got = kmers_from_unitigs(&unitigs); let expected = canonical_kmers(seq); assert_eq!( @@ -231,7 +241,7 @@ fn cycle_kmers_not_lost() { let seq = b"ACGTACGT"; let g = graph_from_ascii(seq); g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); let got = kmers_from_unitigs(&unitigs); let expected = canonical_kmers(seq); assert_eq!(got.len(), expected.len(), "cycle k-mers lost"); @@ -274,7 +284,7 @@ fn branching_graph_no_kmer_lost_or_duplicated() { insert(b"TTCGTGGCTA"); // J-I-F (different J prefix) g.compute_degrees_and_mark_starts(); - let unitigs: Vec = g.iter_unitig().collect(); + let unitigs: Vec = collect_unitigs(&g); // Collect all k-mers from unitigs. let got = kmers_from_unitigs(&unitigs);