refactor: simplify unitig iteration logic and API

Refactors the `start_iter` function to separate chain/cycle detection into two passes, consolidates visited marking logic internally in `next_unitig_kmer`, and streamlines the public API by removing redundant methods, updating iteration parameters to `(start_first_next)`, and eliminating external state like `at_start`.
This commit is contained in:
Eric Coissac
2026-05-01 13:52:13 +02:00
parent defeeb9460
commit 35840b9d73
+57 -104
View File
@@ -128,125 +128,73 @@ impl GraphDeBruijn {
/// - `!can_extend_left` → yield canonical form /// - `!can_extend_left` → yield canonical form
/// - `!can_extend_right` → yield reverse complement /// - `!can_extend_right` → yield reverse complement
/// 2. Nodes still unvisited → part of a cycle; yield canonical form. /// 2. Nodes still unvisited → part of a cycle; yield canonical form.
fn start_iter(&self) -> impl Iterator<Item = Kmer> + '_ { // Yields (start, first_next) arcs:
// pass 1 — one arc per right-neighbour of every !can_extend_left node
// pass 2 — one arc per unvisited interior node (cycles); marks it visited
// Junction nodes (pass 1) are never marked visited.
fn start_iter(&self) -> impl Iterator<Item = (Kmer, Option<Kmer>)> + '_ {
let k = self.k; let k = self.k;
let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| {
let node = cell.get(); let node = _cell.get();
if node.is_visited() { if node.can_extend_left() {
return None; return vec![];
} }
let start: Kmer = if !node.can_extend_left() { let start = kmer.into_kmer();
kmer.into_kmer() if node.can_extend_right() {
} else if !node.can_extend_right() { let next_c = start.push_right(node.right_nuc(), k).canonical(k);
kmer.revcomp(k) vec![(start, Some(oriented_next(start, next_c, k)))]
} else { } else {
return None; let rights: Vec<_> = kmer
}; .right_canonical_neighbors(k)
let mut updated = node; .into_iter()
updated.set_visited(); .filter(|n| self.nodes.contains_key(n))
cell.set(updated); .map(|n| (start, Some(oriented_next(start, n, k))))
Some(start) .collect();
if rights.is_empty() { vec![(start, None)] } else { rights }
}
}); });
// Cycle nodes: unvisited after chain traversal, both extensions present.
// Yield in canonical orientation (forward) so next_kmer follows right.
let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
let node = cell.get(); let node = cell.get();
if node.is_visited() { if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() {
return None; return None;
} }
let mut updated = node; let mut updated = node;
updated.set_visited(); updated.set_visited();
cell.set(updated); cell.set(updated);
Some(kmer.into_kmer()) let start = kmer.into_kmer();
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
Some((start, Some(oriented_next(start, next_c, k))))
}); });
chain_starts.chain(cycle_starts) chain_starts.chain(cycle_starts)
} }
/// Return the next kmer in the unitig traversal direction, or `None` if the // Direction from kmer orientation; returns None if called on a non-interior node.
/// current node is not uniquely continuable in that direction. fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
///
/// Direction is inferred from whether `kmer` is canonical:
/// - `kmer == kmer.canonical(k)` → forward → follow right neighbour
/// - otherwise → backward → follow left neighbour of canonical
///
/// The returned kmer is oriented so that its first k-1 bases match
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
let canonical = kmer.canonical(self.k); let canonical = kmer.canonical(self.k);
let node = self.nodes.get(&canonical)?.get(); let node = self.nodes.get(&canonical)?.get();
if !node.can_extend_left() || !node.can_extend_right() {
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
return None; return None;
} }
let next_c = if kmer.raw() == canonical.raw() {
let next: CanonicalKmer = if kmer.raw() == canonical.raw() { canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k)
if !node.can_extend_right() {
return None;
}
// push_right gives the raw right extension that properly extends kmer
canonical
.into_kmer()
.push_right(node.right_nuc(), self.k)
.canonical(self.k)
} else { } else {
if !node.can_extend_left() { canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k)
return None;
}
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
canonical
.into_kmer()
.push_left(node.left_nuc(), self.k)
.canonical(self.k)
}; };
Some(oriented_next(kmer, next_c, self.k))
// Mark the next node visited before returning, consistent with start_iter.
// Returns None if the node was already visited (cycle guard).
let cell = self.nodes.get(&next)?;
let node = cell.get();
if node.is_visited() {
return None;
} }
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) { fn iter_unitig_kmers(&self, first_next: Option<Kmer>) -> UnitigIter<'_> {
next.into_kmer() UnitigIter { graph: self, current: first_next }
} else {
next.revcomp(self.k)
};
let mut updated = node;
updated.set_visited();
cell.set(updated);
Some(oriented)
} }
/// Iterate over the kmers of a single unitig starting at `start`.
///
/// `start` must already be marked visited (as `start_iter` does).
/// Each subsequent kmer is marked visited as it is yielded.
/// Stops when the chain ends or the next node was already visited.
///
/// To get "la base à ajouter" rather than full kmers:
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
UnitigIter::new(self, start)
}
/// Iterate over all unitigs in the graph.
///
/// Drives `start_iter` and `iter_unitig_kmers` internally: for each start
/// kmer, collects the k-mer chain into a [`Unitig`] and yields it.
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ { pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
let k = self.k; let k = self.k;
self.start_iter().map(move |start| { self.start_iter().map(move |(start, first_next)| {
// start is the first kmer — we already have it
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect(); let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
// each subsequent kmer contributes only its last (new) nucleotide for kmer in self.iter_unitig_kmers(first_next) {
for kmer in self.iter_unitig_kmers(start).skip(1) {
nucs.push(kmer.nucleotide(k - 1)); nucs.push(kmer.nucleotide(k - 1));
} }
Unitig::from_nucleotides(&nucs) Unitig::from_nucleotides(&nucs)
@@ -275,36 +223,41 @@ impl GraphDeBruijn {
// ── UnitigIter ──────────────────────────────────────────────────────────────── // ── UnitigIter ────────────────────────────────────────────────────────────────
pub struct UnitigIter<'a> { struct UnitigIter<'a> {
graph: &'a GraphDeBruijn, graph: &'a GraphDeBruijn,
current: Option<Kmer>, current: Option<Kmer>,
at_start: bool,
} }
impl<'a> UnitigIter<'a> { impl Iterator for UnitigIter<'_> {
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
Self {
graph,
current: Some(start),
at_start: true,
}
}
}
impl<'a> Iterator for UnitigIter<'a> {
type Item = Kmer; type Item = Kmer;
fn next(&mut self) -> Option<Kmer> { fn next(&mut self) -> Option<Kmer> {
let current = self.current?; let current = self.current?;
// next_unitig_kmer handles visited marking and cycle detection let canonical = current.canonical(self.graph.k);
self.current = self.graph.next_unitig_kmer(current, self.at_start); let cell = self.graph.nodes.get(&canonical)?;
self.at_start = false; let node = cell.get();
// Cycle guard: stop if already visited (cycle start marked by start_iter pass 2).
if node.is_visited() {
self.current = None;
return None;
}
// Only interior nodes get marked visited.
if node.can_extend_left() && node.can_extend_right() {
let mut updated = node;
updated.set_visited();
cell.set(updated);
}
self.current = self.graph.next_unitig_kmer(current);
Some(current) Some(current)
} }
} }
// ── helpers ─────────────────────────────────────────────────────────────────── // ── helpers ───────────────────────────────────────────────────────────────────
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) }
}
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in /// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for /// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
/// zero or ≥2 existing neighbours. /// zero or ≥2 existing neighbours.