refactor: simplify unitig iteration logic and API
Refactors the `start_iter` function to separate chain/cycle detection into two passes, consolidates visited marking logic internally in `next_unitig_kmer`, and streamlines the public API by removing redundant methods, updating iteration parameters to `(start_first_next)`, and eliminating external state like `at_start`.
This commit is contained in:
+57
-104
@@ -128,125 +128,73 @@ impl GraphDeBruijn {
|
||||
/// - `!can_extend_left` → yield canonical form
|
||||
/// - `!can_extend_right` → yield reverse complement
|
||||
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
||||
fn start_iter(&self) -> impl Iterator<Item = Kmer> + '_ {
|
||||
// Yields (start, first_next) arcs:
|
||||
// pass 1 — one arc per right-neighbour of every !can_extend_left node
|
||||
// pass 2 — one arc per unvisited interior node (cycles); marks it visited
|
||||
// Junction nodes (pass 1) are never marked visited.
|
||||
fn start_iter(&self) -> impl Iterator<Item = (Kmer, Option<Kmer>)> + '_ {
|
||||
let k = self.k;
|
||||
|
||||
let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
|
||||
let node = cell.get();
|
||||
if node.is_visited() {
|
||||
return None;
|
||||
let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| {
|
||||
let node = _cell.get();
|
||||
if node.can_extend_left() {
|
||||
return vec![];
|
||||
}
|
||||
let start: Kmer = if !node.can_extend_left() {
|
||||
kmer.into_kmer()
|
||||
} else if !node.can_extend_right() {
|
||||
kmer.revcomp(k)
|
||||
let start = kmer.into_kmer();
|
||||
if node.can_extend_right() {
|
||||
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
|
||||
vec![(start, Some(oriented_next(start, next_c, k)))]
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(start)
|
||||
let rights: Vec<_> = kmer
|
||||
.right_canonical_neighbors(k)
|
||||
.into_iter()
|
||||
.filter(|n| self.nodes.contains_key(n))
|
||||
.map(|n| (start, Some(oriented_next(start, n, k))))
|
||||
.collect();
|
||||
if rights.is_empty() { vec![(start, None)] } else { rights }
|
||||
}
|
||||
});
|
||||
|
||||
// Cycle nodes: unvisited after chain traversal, both extensions present.
|
||||
// Yield in canonical orientation (forward) so next_kmer follows right.
|
||||
let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
|
||||
let node = cell.get();
|
||||
if node.is_visited() {
|
||||
if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(kmer.into_kmer())
|
||||
let start = kmer.into_kmer();
|
||||
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
|
||||
Some((start, Some(oriented_next(start, next_c, k))))
|
||||
});
|
||||
|
||||
chain_starts.chain(cycle_starts)
|
||||
}
|
||||
|
||||
/// Return the next kmer in the unitig traversal direction, or `None` if the
|
||||
/// current node is not uniquely continuable in that direction.
|
||||
///
|
||||
/// Direction is inferred from whether `kmer` is canonical:
|
||||
/// - `kmer == kmer.canonical(k)` → forward → follow right neighbour
|
||||
/// - otherwise → backward → follow left neighbour of canonical
|
||||
///
|
||||
/// The returned kmer is oriented so that its first k-1 bases match
|
||||
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
|
||||
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
|
||||
// Direction from kmer orientation; returns None if called on a non-interior node.
|
||||
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical(self.k);
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
|
||||
if !node.can_extend_left() || !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next: CanonicalKmer = if kmer.raw() == canonical.raw() {
|
||||
if !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
// push_right gives the raw right extension that properly extends kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
let next_c = if kmer.raw() == canonical.raw() {
|
||||
canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k)
|
||||
} else {
|
||||
if !node.can_extend_left() {
|
||||
return None;
|
||||
}
|
||||
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k)
|
||||
};
|
||||
|
||||
// Mark the next node visited before returning, consistent with start_iter.
|
||||
// Returns None if the node was already visited (cycle guard).
|
||||
let cell = self.nodes.get(&next)?;
|
||||
let node = cell.get();
|
||||
if node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) {
|
||||
next.into_kmer()
|
||||
} else {
|
||||
next.revcomp(self.k)
|
||||
};
|
||||
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
|
||||
Some(oriented)
|
||||
Some(oriented_next(kmer, next_c, self.k))
|
||||
}
|
||||
|
||||
/// Iterate over the kmers of a single unitig starting at `start`.
|
||||
///
|
||||
/// `start` must already be marked visited (as `start_iter` does).
|
||||
/// Each subsequent kmer is marked visited as it is yielded.
|
||||
/// Stops when the chain ends or the next node was already visited.
|
||||
///
|
||||
/// To get "la base à ajouter" rather than full kmers:
|
||||
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
|
||||
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
|
||||
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter::new(self, start)
|
||||
fn iter_unitig_kmers(&self, first_next: Option<Kmer>) -> UnitigIter<'_> {
|
||||
UnitigIter { graph: self, current: first_next }
|
||||
}
|
||||
|
||||
/// Iterate over all unitigs in the graph.
|
||||
///
|
||||
/// Drives `start_iter` and `iter_unitig_kmers` internally: for each start
|
||||
/// kmer, collects the k-mer chain into a [`Unitig`] and yields it.
|
||||
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = self.k;
|
||||
self.start_iter().map(move |start| {
|
||||
// start is the first kmer — we already have it
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
// each subsequent kmer contributes only its last (new) nucleotide
|
||||
for kmer in self.iter_unitig_kmers(start).skip(1) {
|
||||
for kmer in self.iter_unitig_kmers(first_next) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
@@ -275,36 +223,41 @@ impl GraphDeBruijn {
|
||||
|
||||
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct UnitigIter<'a> {
|
||||
struct UnitigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
at_start: bool,
|
||||
}
|
||||
|
||||
impl<'a> UnitigIter<'a> {
|
||||
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
current: Some(start),
|
||||
at_start: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnitigIter<'a> {
|
||||
impl Iterator for UnitigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
// next_unitig_kmer handles visited marking and cycle detection
|
||||
self.current = self.graph.next_unitig_kmer(current, self.at_start);
|
||||
self.at_start = false;
|
||||
let canonical = current.canonical(self.graph.k);
|
||||
let cell = self.graph.nodes.get(&canonical)?;
|
||||
let node = cell.get();
|
||||
// Cycle guard: stop if already visited (cycle start marked by start_iter pass 2).
|
||||
if node.is_visited() {
|
||||
self.current = None;
|
||||
return None;
|
||||
}
|
||||
// Only interior nodes get marked visited.
|
||||
if node.can_extend_left() && node.can_extend_right() {
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
}
|
||||
self.current = self.graph.next_unitig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
|
||||
if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) }
|
||||
}
|
||||
|
||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||
/// zero or ≥2 existing neighbours.
|
||||
|
||||
Reference in New Issue
Block a user