refactor: simplify unitig iteration logic and API

Refactors the `start_iter` function to separate chain/cycle detection into two passes, consolidates visited marking logic internally in `next_unitig_kmer`, and streamlines the public API by removing redundant methods, updating iteration parameters to `(start_first_next)`, and eliminating external state like `at_start`.
This commit is contained in:
Eric Coissac
2026-05-01 13:52:13 +02:00
parent defeeb9460
commit 35840b9d73
+57 -104
View File
@@ -128,125 +128,73 @@ impl GraphDeBruijn {
/// - `!can_extend_left` → yield canonical form
/// - `!can_extend_right` → yield reverse complement
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
fn start_iter(&self) -> impl Iterator<Item = Kmer> + '_ {
// Yields (start, first_next) arcs:
// pass 1 — one arc per right-neighbour of every !can_extend_left node
// pass 2 — one arc per unvisited interior node (cycles); marks it visited
// Junction nodes (pass 1) are never marked visited.
fn start_iter(&self) -> impl Iterator<Item = (Kmer, Option<Kmer>)> + '_ {
let k = self.k;
let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
let node = cell.get();
if node.is_visited() {
return None;
let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| {
let node = _cell.get();
if node.can_extend_left() {
return vec![];
}
let start: Kmer = if !node.can_extend_left() {
kmer.into_kmer()
} else if !node.can_extend_right() {
kmer.revcomp(k)
let start = kmer.into_kmer();
if node.can_extend_right() {
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
vec![(start, Some(oriented_next(start, next_c, k)))]
} else {
return None;
};
let mut updated = node;
updated.set_visited();
cell.set(updated);
Some(start)
let rights: Vec<_> = kmer
.right_canonical_neighbors(k)
.into_iter()
.filter(|n| self.nodes.contains_key(n))
.map(|n| (start, Some(oriented_next(start, n, k))))
.collect();
if rights.is_empty() { vec![(start, None)] } else { rights }
}
});
// Cycle nodes: unvisited after chain traversal, both extensions present.
// Yield in canonical orientation (forward) so next_kmer follows right.
let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
let node = cell.get();
if node.is_visited() {
if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() {
return None;
}
let mut updated = node;
updated.set_visited();
cell.set(updated);
Some(kmer.into_kmer())
let start = kmer.into_kmer();
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
Some((start, Some(oriented_next(start, next_c, k))))
});
chain_starts.chain(cycle_starts)
}
/// Return the next kmer in the unitig traversal direction, or `None` if the
/// current node is not uniquely continuable in that direction.
///
/// Direction is inferred from whether `kmer` is canonical:
/// - `kmer == kmer.canonical(k)` → forward → follow right neighbour
/// - otherwise → backward → follow left neighbour of canonical
///
/// The returned kmer is oriented so that its first k-1 bases match
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
// Direction from kmer orientation; returns None if called on a non-interior node.
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
let canonical = kmer.canonical(self.k);
let node = self.nodes.get(&canonical)?.get();
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
if !node.can_extend_left() || !node.can_extend_right() {
return None;
}
let next: CanonicalKmer = if kmer.raw() == canonical.raw() {
if !node.can_extend_right() {
return None;
}
// push_right gives the raw right extension that properly extends kmer
canonical
.into_kmer()
.push_right(node.right_nuc(), self.k)
.canonical(self.k)
let next_c = if kmer.raw() == canonical.raw() {
canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k)
} else {
if !node.can_extend_left() {
return None;
}
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
canonical
.into_kmer()
.push_left(node.left_nuc(), self.k)
.canonical(self.k)
canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k)
};
// Mark the next node visited before returning, consistent with start_iter.
// Returns None if the node was already visited (cycle guard).
let cell = self.nodes.get(&next)?;
let node = cell.get();
if node.is_visited() {
return None;
}
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) {
next.into_kmer()
} else {
next.revcomp(self.k)
};
let mut updated = node;
updated.set_visited();
cell.set(updated);
Some(oriented)
Some(oriented_next(kmer, next_c, self.k))
}
/// Iterate over the kmers of a single unitig starting at `start`.
///
/// `start` must already be marked visited (as `start_iter` does).
/// Each subsequent kmer is marked visited as it is yielded.
/// Stops when the chain ends or the next node was already visited.
///
/// To get "la base à ajouter" rather than full kmers:
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
UnitigIter::new(self, start)
fn iter_unitig_kmers(&self, first_next: Option<Kmer>) -> UnitigIter<'_> {
UnitigIter { graph: self, current: first_next }
}
/// Iterate over all unitigs in the graph.
///
/// Drives `start_iter` and `iter_unitig_kmers` internally: for each start
/// kmer, collects the k-mer chain into a [`Unitig`] and yields it.
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
let k = self.k;
self.start_iter().map(move |start| {
// start is the first kmer — we already have it
self.start_iter().map(move |(start, first_next)| {
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
// each subsequent kmer contributes only its last (new) nucleotide
for kmer in self.iter_unitig_kmers(start).skip(1) {
for kmer in self.iter_unitig_kmers(first_next) {
nucs.push(kmer.nucleotide(k - 1));
}
Unitig::from_nucleotides(&nucs)
@@ -275,36 +223,41 @@ impl GraphDeBruijn {
// ── UnitigIter ────────────────────────────────────────────────────────────────
pub struct UnitigIter<'a> {
struct UnitigIter<'a> {
graph: &'a GraphDeBruijn,
current: Option<Kmer>,
at_start: bool,
}
impl<'a> UnitigIter<'a> {
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
Self {
graph,
current: Some(start),
at_start: true,
}
}
}
impl<'a> Iterator for UnitigIter<'a> {
impl Iterator for UnitigIter<'_> {
type Item = Kmer;
fn next(&mut self) -> Option<Kmer> {
let current = self.current?;
// next_unitig_kmer handles visited marking and cycle detection
self.current = self.graph.next_unitig_kmer(current, self.at_start);
self.at_start = false;
let canonical = current.canonical(self.graph.k);
let cell = self.graph.nodes.get(&canonical)?;
let node = cell.get();
// Cycle guard: stop if already visited (cycle start marked by start_iter pass 2).
if node.is_visited() {
self.current = None;
return None;
}
// Only interior nodes get marked visited.
if node.can_extend_left() && node.can_extend_right() {
let mut updated = node;
updated.set_visited();
cell.set(updated);
}
self.current = self.graph.next_unitig_kmer(current);
Some(current)
}
}
// ── helpers ───────────────────────────────────────────────────────────────────
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) }
}
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
/// zero or ≥2 existing neighbours.