refactor: rename compute_degrees and mark start nodes

Renames `compute_degrees` to `compute_degrees_and_mark_starts` across the De Bruijn graph and partitioner layers to consolidate degree calculation and start-node flagging. Introduces safe neighbor iteration methods and a debug validation block to verify graph consistency. Refactors unitig extraction to use sequential execution with a `Mutex` for safe error propagation. Fixes malformed and duplicated method calls, adds auto-generation of missing `meta.json` files, and ensures persistent matrix builders are explicitly closed to finalize metadata.
This commit is contained in:
Eric Coissac
2026-06-05 19:32:30 +02:00
parent 27088ab810
commit 5c2f48535f
6 changed files with 252 additions and 384 deletions
+38 -259
View File
@@ -215,7 +215,7 @@ impl GraphDeBruijn {
/// In production builds, runs in parallel across all nodes (each entry is
/// written by exactly one thread). In test builds, runs sequentially to
/// avoid propagating thread-local k/m values to rayon worker threads.
pub fn compute_degrees(&self) {
pub fn compute_degrees_and_mark_starts(&self) {
// Pass 1: count right/left neighbors for each node
self.for_each_node(|kmer, atomic| {
@@ -225,8 +225,8 @@ impl GraphDeBruijn {
atomic.store(old.0, Ordering::Relaxed);
return;
}
let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(), &self.nodes);
let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(), &self.nodes);
let (rc, rn) = count_neighbors(&kmer.right_canonical_neighbors(), &self.nodes);
let (lc, ln) = count_neighbors(&kmer.left_canonical_neighbors(), &self.nodes);
let mut node = Node(0); // reset all bits (visited=0, start=0)
node.set_right(rc, rn);
node.set_left(lc, ln);
@@ -248,32 +248,6 @@ impl GraphDeBruijn {
});
}
/// Iterates over the right neighbors of `kmer`.
pub fn iter_right_neighbors(
&self,
kmer: CanonicalKmer,
) -> impl Iterator<Item = CanonicalKmer> + '_ {
kmer.right_canonical_neighbors()
.into_iter()
.filter_map(|kmer| {
self.nodes.get(&kmer)?;
Some(kmer)
})
}
/// Iterates over the left neighbors of `kmer`.
pub fn iter_left_neighbors(
&self,
kmer: CanonicalKmer,
) -> impl Iterator<Item = CanonicalKmer> + '_ {
kmer.left_canonical_neighbors()
.into_iter()
.filter_map(|kmer| {
self.nodes.get(&kmer)?;
Some(kmer)
})
}
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
self.nodes
.get(kmer)
@@ -394,217 +368,7 @@ impl GraphDeBruijn {
if n == 0 {
break;
}
self.compute_degrees();
}
// #[cfg(debug_assertions)]
{
let mut residual = GraphDeBruijn {
nodes: FastHashMap::with_hasher(Xxh3Builder::new()),
};
for (&kmer, atomic) in &self.nodes {
if !Node(atomic.load(Ordering::Relaxed)).is_visited() {
residual.nodes.insert(kmer, AtomicU8::new(0));
}
}
let mut sample = 0;
for (&kmer, _) in residual.nodes.iter().take(1000) {
let left = kmer.left_canonical_neighbors();
let real_left = left
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
let right = kmer.right_canonical_neighbors();
let real_right = right
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
if real_left != 1 || real_right == 1 {
// normal
} else {
sample += 1;
if sample <= 10 {
eprintln!("Kmer {:?}: left={} right={}", kmer, real_left, real_right);
}
}
}
residual.compute_degrees();
let mut n_starts = 0usize;
let mut n_no_right = 0usize;
let mut n_no_left = 0usize;
for (_, a) in &residual.nodes {
let node = Node(a.load(Ordering::Relaxed));
if node.is_start() {
n_starts += 1;
}
if !node.can_extend_right() {
n_no_right += 1;
}
if !node.can_extend_left() {
n_no_left += 1;
}
}
eprintln!(
"[for_each_unitig] residual after cascade: {} nodes | starts={} no_right={} no_left={}",
residual.nodes.len(),
n_starts,
n_no_right,
n_no_left,
);
let mut left_mismatch = 0;
let mut right_mismatch = 0;
let mut total_left = 0;
let mut total_right = 0;
let n_residual = residual.nodes.len();
for (&kmer, atomic) in &residual.nodes {
let node = Node(atomic.load(Ordering::Relaxed));
let left_neighbors = kmer.left_canonical_neighbors();
let actual_left = left_neighbors
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
total_left += actual_left;
if (actual_left == 1) != node.can_extend_left() {
left_mismatch += 1;
}
let right_neighbors = kmer.right_canonical_neighbors();
let actual_right = right_neighbors
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
total_right += actual_right;
if (actual_right == 1) != node.can_extend_right() {
right_mismatch += 1;
}
}
eprintln!(
"[consistency] N={} total_left={} total_right={} left_mismatch={} right_mismatch={}",
n_residual, total_left, total_right, left_mismatch, right_mismatch
);
let mut real_starts = 0;
let mut sample = 0;
for (&kmer, atomic) in &residual.nodes {
let node = Node(atomic.load(Ordering::Relaxed));
// Calcul réel des voisins gauches (les 4 possibilités)
let left_neighbors = kmer.left_canonical_neighbors();
let real_left_count = left_neighbors
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
let right_neighbors = kmer.right_canonical_neighbors();
let real_right_count = right_neighbors
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
// Vérification de cohérence avec les flags
if (real_left_count == 1) != node.can_extend_left() {
eprintln!(
"Incoherence left for {:?}: real={}, flag={}",
kmer,
real_left_count,
node.can_extend_left()
);
}
if (real_right_count == 1) != node.can_extend_right() {
eprintln!(
"Incoherence right for {:?}: real={}, flag={}",
kmer,
real_right_count,
node.can_extend_right()
);
}
// Détermination si c'est un start selon la définition avec les vrais voisins
let is_start = if real_left_count != 1 {
true
} else {
// Trouver l'unique prédécesseur réel
let pred = left_neighbors
.iter()
.find(|&nb| residual.nodes.contains_key(nb))
.unwrap();
let pred_node = Node(residual.nodes.get(pred).unwrap().load(Ordering::Relaxed));
let pred_right_neighbors = pred.right_canonical_neighbors();
let pred_real_right_count = pred_right_neighbors
.iter()
.filter(|&nb| residual.nodes.contains_key(nb))
.count();
pred_real_right_count != 1
};
if is_start {
real_starts += 1;
if sample < 10 {
sample += 1;
eprintln!(
"Real start: {:?}, left_count={}, right_count={}",
kmer, real_left_count, real_right_count
);
}
}
}
eprintln!("[real starts] count = {}", real_starts);
let mut ok = 0;
let mut missing_pred = 0;
let mut pred_visited = 0;
let mut pred_no_right = 0;
let mut mismatch = 0;
for (&kmer, atomic) in &residual.nodes {
let node = Node(atomic.load(Ordering::Relaxed));
if !node.can_extend_right() {
// Le prédécesseur unique (car can_extend_left est vrai pour tous)
let pred = kmer.left_canonical_neighbors()[node.left_nuc() as usize];
if let Some(pred_atomic) = residual.nodes.get(&pred) {
let pred_node = Node(pred_atomic.load(Ordering::Relaxed));
if pred_node.can_extend_right() {
let succ =
pred.right_canonical_neighbors()[pred_node.right_nuc() as usize];
if succ == kmer {
ok += 1;
} else {
mismatch += 1;
eprintln!(
"Mismatch: pred {:?} right_nuc={} -> {:?} != {:?}",
pred,
pred_node.right_nuc(),
succ,
kmer
);
}
} else {
pred_no_right += 1;
eprintln!("Pred {:?} has !can_extend_right", pred);
}
} else {
// Prédécesseur absent du résidu : vérifier s'il est visité
if let Some(orig) = self.nodes.get(&pred) {
if Node(orig.load(Ordering::Relaxed)).is_visited() {
pred_visited += 1;
} else {
missing_pred += 1;
}
} else {
missing_pred += 1;
}
}
}
}
eprintln!(
"[diagnostic] nodes without right: ok={} missing_pred={} pred_visited={} pred_no_right={} mismatch={}",
ok, missing_pred, pred_visited, pred_no_right, mismatch
);
self.compute_degrees_and_mark_starts();
}
// Pass 2: cycles and tails — always sequential
@@ -624,6 +388,16 @@ impl GraphDeBruijn {
n2.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(start, Node(old), k));
}
// Fallback: if kmer was not reached by start's chain, claim it directly.
// Safe because unitig_nucleotides(start, ...) may have visited kmer in the
// meantime — in that case fetch_or returns IS_VISITED_MASK set and we skip.
if start != kmer {
let kmer_old = atomic.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
if kmer_old & IS_VISITED_MASK == 0 {
n2.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(kmer, Node(kmer_old), k));
}
}
}
eprintln!(
@@ -668,7 +442,10 @@ impl GraphDeBruijn {
return current;
}
// Stop if asymmetry: pred's right canonical neighbor is not current
let pred_right = pred.into_kmer().push_right(pred_node.right_nuc()).canonical();
let pred_right = pred
.into_kmer()
.push_right(pred_node.right_nuc())
.canonical();
if pred_right != current {
return current;
}
@@ -683,28 +460,29 @@ impl GraphDeBruijn {
let pred = query.into_kmer().push_left(node.left_nuc()).canonical();
self.nodes
.get(&pred)
.map(|a| !Node(a.load(Ordering::Acquire)).can_extend_right())
.map(|a| {
let pred_node = Node(a.load(Ordering::Acquire));
!pred_node.can_extend_right()
})
.unwrap_or(false)
}
pub fn try_for_each_unitig<E, F>(&self, mut f: F) -> Result<(), E>
pub fn try_for_each_unitig<E, F>(&self, f: F) -> Result<(), E>
where
F: FnMut(UnitigNucIter<'_>) -> Result<(), E>,
E: Send,
F: FnMut(UnitigNucIter<'_>) -> Result<(), E> + Send,
{
let k = k();
for (&kmer, atomic) in &self.nodes {
let node = Node(atomic.load(Ordering::Acquire));
if node.is_start() {
f(self.unitig_nucleotides(kmer, node, k))?;
let error = std::sync::Mutex::new(None::<E>);
let f = std::sync::Mutex::new(f);
self.for_each_unitig(|iter| {
if error.lock().unwrap().is_some() {
return;
}
}
for (&kmer, atomic) in &self.nodes {
let old = atomic.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
if old & IS_VISITED_MASK == 0 {
f(self.unitig_nucleotides(kmer, Node(old), k))?;
if let Err(e) = f.lock().unwrap()(iter) {
*error.lock().unwrap() = Some(e);
}
}
Ok(())
});
error.into_inner().unwrap().map_or(Ok(()), Err)
}
pub fn len(&self) -> usize {
@@ -772,8 +550,9 @@ fn try_claim(atomic: &AtomicU8) -> bool {
}
fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
if from.is_overlapping(to.into_kmer()) {
to.into_kmer()
let direct = to.into_kmer();
if from.is_overlapping(direct) {
direct
} else {
to.revcomp()
}
@@ -784,7 +563,7 @@ fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T).
/// Returns `None` for count = 0 or ≥2 existing neighbours.
fn count_neighbors(
neighbors: [CanonicalKmer; 4],
neighbors: &[CanonicalKmer; 4],
nodes: &FastHashMap<CanonicalKmer, AtomicU8>,
) -> (u8, Option<u8>) {
let mut count = 0u8;
+13 -9
View File
@@ -75,7 +75,7 @@ fn degrees_linear_chain_extensions() {
set_k(k);
let seq = b"AAAAGGGG";
let g = graph_from_ascii(seq);
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
@@ -112,10 +112,14 @@ fn unitig_roundtrip_linear() {
set_k(k);
let seq = b"ACCTGGCTA";
let g = graph_from_ascii(seq);
g.compute_degrees();
g.compute_degrees_and_mark_starts();
println!("Les kmers:");
for (kmer, v) in g.nodes.iter() {
println!("{}: {}", String::from_utf8_lossy(&kmer.to_ascii()), v.load(std::sync::atomic::Ordering::Relaxed));
println!(
"{}: {}",
String::from_utf8_lossy(&kmer.to_ascii()),
v.load(std::sync::atomic::Ordering::Relaxed)
);
}
println!("Les unitig:");
@@ -144,7 +148,7 @@ fn unitig_roundtrip_longer_sequence() {
set_k(k);
let seq = b"ACGTGGCTATCGAC";
let g = graph_from_ascii(seq);
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
let mut got = kmers_from_unitigs(&unitigs);
let mut expected = canonical_kmers(seq);
@@ -161,7 +165,7 @@ fn unitig_isolated_node() {
let kmer = Kmer::from_ascii(b"ACGTA").unwrap();
let mut g = GraphDeBruijn::new();
g.push(kmer.canonical());
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
assert_eq!(unitigs.len(), 1);
assert_eq!(unitigs[0].seql(), k);
@@ -186,7 +190,7 @@ fn unitig_two_truly_distinct_isolated_nodes() {
let mut g = GraphDeBruijn::new();
g.push(Kmer::from_ascii(b"AAAAC").unwrap().canonical());
g.push(Kmer::from_ascii(b"GGGGT").unwrap().canonical());
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
// Each isolated node → one unitig of length k
assert_eq!(unitigs.len(), 2);
@@ -201,7 +205,7 @@ fn no_kmer_lost_or_duplicated() {
set_k(k);
let seq = b"ACGTACGTACGTTTTTACGTACGT";
let g = graph_from_ascii(seq);
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
let got = kmers_from_unitigs(&unitigs);
let expected = canonical_kmers(seq);
@@ -226,7 +230,7 @@ fn cycle_kmers_not_lost() {
set_k(k);
let seq = b"ACGTACGT";
let g = graph_from_ascii(seq);
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
let got = kmers_from_unitigs(&unitigs);
let expected = canonical_kmers(seq);
@@ -269,7 +273,7 @@ fn branching_graph_no_kmer_lost_or_duplicated() {
insert(b"GTGGCTACCGT"); // H-M-N continuation
insert(b"TTCGTGGCTA"); // J-I-F (different J prefix)
g.compute_degrees();
g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
// Collect all k-mers from unitigs.