From 35840b9d7362a39357dfc796070acc518e41bbd7 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 1 May 2026 13:52:13 +0200 Subject: [PATCH] refactor: simplify unitig iteration logic and API Refactors the `start_iter` function to separate chain/cycle detection into two passes, consolidates visited marking logic internally in `next_unitig_kmer`, and streamlines the public API by removing redundant methods, updating iteration parameters to `(start_first_next)`, and eliminating external state like `at_start`. --- src/obidebruinj/src/lib.rs | 161 +++++++++++++------------------------ 1 file changed, 57 insertions(+), 104 deletions(-) diff --git a/src/obidebruinj/src/lib.rs b/src/obidebruinj/src/lib.rs index 3286223..a4300cb 100644 --- a/src/obidebruinj/src/lib.rs +++ b/src/obidebruinj/src/lib.rs @@ -128,125 +128,73 @@ impl GraphDeBruijn { /// - `!can_extend_left` → yield canonical form /// - `!can_extend_right` → yield reverse complement /// 2. Nodes still unvisited → part of a cycle; yield canonical form. - fn start_iter(&self) -> impl Iterator + '_ { + // Yields (start, first_next) arcs: + // pass 1 — one arc per right-neighbour of every !can_extend_left node + // pass 2 — one arc per unvisited interior node (cycles); marks it visited + // Junction nodes (pass 1) are never marked visited. + fn start_iter(&self) -> impl Iterator)> + '_ { let k = self.k; - let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { - let node = cell.get(); - if node.is_visited() { - return None; + let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| { + let node = _cell.get(); + if node.can_extend_left() { + return vec![]; } - let start: Kmer = if !node.can_extend_left() { - kmer.into_kmer() - } else if !node.can_extend_right() { - kmer.revcomp(k) + let start = kmer.into_kmer(); + if node.can_extend_right() { + let next_c = start.push_right(node.right_nuc(), k).canonical(k); + vec![(start, Some(oriented_next(start, next_c, k)))] } else { - return None; - }; - let mut updated = node; - updated.set_visited(); - cell.set(updated); - Some(start) + let rights: Vec<_> = kmer + .right_canonical_neighbors(k) + .into_iter() + .filter(|n| self.nodes.contains_key(n)) + .map(|n| (start, Some(oriented_next(start, n, k)))) + .collect(); + if rights.is_empty() { vec![(start, None)] } else { rights } + } }); - // Cycle nodes: unvisited after chain traversal, both extensions present. - // Yield in canonical orientation (forward) so next_kmer follows right. let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { let node = cell.get(); - if node.is_visited() { + if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() { return None; } let mut updated = node; updated.set_visited(); cell.set(updated); - Some(kmer.into_kmer()) + let start = kmer.into_kmer(); + let next_c = start.push_right(node.right_nuc(), k).canonical(k); + Some((start, Some(oriented_next(start, next_c, k)))) }); chain_starts.chain(cycle_starts) } - /// Return the next kmer in the unitig traversal direction, or `None` if the - /// current node is not uniquely continuable in that direction. - /// - /// Direction is inferred from whether `kmer` is canonical: - /// - `kmer == kmer.canonical(k)` → forward → follow right neighbour - /// - otherwise → backward → follow left neighbour of canonical - /// - /// The returned kmer is oriented so that its first k-1 bases match - /// the last k-1 bases of `kmer` (proper De Bruijn overlap). - pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option { + // Direction from kmer orientation; returns None if called on a non-interior node. + fn next_unitig_kmer(&self, kmer: Kmer) -> Option { let canonical = kmer.canonical(self.k); let node = self.nodes.get(&canonical)?.get(); - - if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) { + if !node.can_extend_left() || !node.can_extend_right() { return None; } - - let next: CanonicalKmer = if kmer.raw() == canonical.raw() { - if !node.can_extend_right() { - return None; - } - // push_right gives the raw right extension that properly extends kmer - canonical - .into_kmer() - .push_right(node.right_nuc(), self.k) - .canonical(self.k) + let next_c = if kmer.raw() == canonical.raw() { + canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k) } else { - if !node.can_extend_left() { - return None; - } - // push_left gives the left extension of canonical; its revcomp is the right extension of kmer - canonical - .into_kmer() - .push_left(node.left_nuc(), self.k) - .canonical(self.k) + canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k) }; - - // Mark the next node visited before returning, consistent with start_iter. - // Returns None if the node was already visited (cycle guard). - let cell = self.nodes.get(&next)?; - let node = cell.get(); - if node.is_visited() { - return None; - } - - let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) { - next.into_kmer() - } else { - next.revcomp(self.k) - }; - - let mut updated = node; - updated.set_visited(); - cell.set(updated); - - Some(oriented) + Some(oriented_next(kmer, next_c, self.k)) } - /// Iterate over the kmers of a single unitig starting at `start`. - /// - /// `start` must already be marked visited (as `start_iter` does). - /// Each subsequent kmer is marked visited as it is yielded. - /// Stops when the chain ends or the next node was already visited. - /// - /// To get "la base à ajouter" rather than full kmers: - /// - first item → `kmer.nucleotide(0..k)` (k bases, the seed) - /// - next items → `kmer.nucleotide(k-1)` (1 new base each) - pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> { - UnitigIter::new(self, start) + fn iter_unitig_kmers(&self, first_next: Option) -> UnitigIter<'_> { + UnitigIter { graph: self, current: first_next } } - /// Iterate over all unitigs in the graph. - /// - /// Drives `start_iter` and `iter_unitig_kmers` internally: for each start - /// kmer, collects the k-mer chain into a [`Unitig`] and yields it. pub fn iter_unitig(&self) -> impl Iterator + '_ { let k = self.k; - self.start_iter().map(move |start| { - // start is the first kmer — we already have it + self.start_iter().map(move |(start, first_next)| { let mut nucs: Vec = (0..k).map(|i| start.nucleotide(i)).collect(); - // each subsequent kmer contributes only its last (new) nucleotide - for kmer in self.iter_unitig_kmers(start).skip(1) { + for kmer in self.iter_unitig_kmers(first_next) { nucs.push(kmer.nucleotide(k - 1)); } Unitig::from_nucleotides(&nucs) @@ -275,36 +223,41 @@ impl GraphDeBruijn { // ── UnitigIter ──────────────────────────────────────────────────────────────── -pub struct UnitigIter<'a> { +struct UnitigIter<'a> { graph: &'a GraphDeBruijn, current: Option, - at_start: bool, } -impl<'a> UnitigIter<'a> { - fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self { - Self { - graph, - current: Some(start), - at_start: true, - } - } -} - -impl<'a> Iterator for UnitigIter<'a> { +impl Iterator for UnitigIter<'_> { type Item = Kmer; fn next(&mut self) -> Option { let current = self.current?; - // next_unitig_kmer handles visited marking and cycle detection - self.current = self.graph.next_unitig_kmer(current, self.at_start); - self.at_start = false; + let canonical = current.canonical(self.graph.k); + let cell = self.graph.nodes.get(&canonical)?; + let node = cell.get(); + // Cycle guard: stop if already visited (cycle start marked by start_iter pass 2). + if node.is_visited() { + self.current = None; + return None; + } + // Only interior nodes get marked visited. + if node.can_extend_left() && node.can_extend_right() { + let mut updated = node; + updated.set_visited(); + cell.set(updated); + } + self.current = self.graph.next_unitig_kmer(current); Some(current) } } // ── helpers ─────────────────────────────────────────────────────────────────── +fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer { + if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) } +} + /// Returns `Some(i)` if exactly one of the four canonical neighbours exists in /// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for /// zero or ≥2 existing neighbours.