refactor: remove naked_asm and extract collect_unitigs helper

Remove the `std::arch::naked_asm` import as it is no longer required. Introduce a `collect_unitigs` helper to abstract nucleotide sequence extraction from `GraphDeBruijn`, and refactor the test suite to use it, eliminating repetitive collection code and standardizing graph iteration logic.
This commit is contained in:
Eric Coissac
2026-06-05 19:38:17 +02:00
parent 5c2f48535f
commit 952a21eef8
2 changed files with 222 additions and 20 deletions
+203 -11
View File
@@ -3,8 +3,9 @@ use hashbrown::HashMap;
use obikseq::k; use obikseq::k;
use obikseq::{CanonicalKmer, Kmer, Sequence}; use obikseq::{CanonicalKmer, Kmer, Sequence};
use rayon::prelude::*; use rayon::prelude::*;
use std::arch::naked_asm;
use std::fmt; use std::fmt;
use std::mem::needs_drop;
use std::os::unix::raw::gid_t;
use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::atomic::{AtomicU8, Ordering};
use xxhash_rust::xxh3::Xxh3Builder; use xxhash_rust::xxh3::Xxh3Builder;
@@ -175,6 +176,156 @@ impl fmt::Display for Node {
} }
} }
pub struct WalkState {
kmer: CanonicalKmer,
node: Node,
direct: bool,
}
impl WalkState {
pub fn new(kmer: CanonicalKmer, node: Node, direct: bool) -> Self {
debug_assert!(!node.is_visited(), "Cannot walk over a visited node");
Self { kmer, node, direct }
}
pub fn leavable(&self, graph: &GraphDeBruijn) -> bool {
WalkState {
kmer: self.kmer,
node: self.node,
direct: !self.direct,
}
.reachable(graph)
}
pub fn reachable(&self, graph: &GraphDeBruijn) -> bool {
if self.direct {
if self.node.can_extend_left() {
let next = self.kmer.into_kmer().push_left(self.node.left_nuc());
let cnext = next.canonical();
let dnext = next.raw() == cnext.raw();
let next_node = Node(
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
!next_node.is_visited()
&& if dnext {
next_node.can_extend_right()
} else {
next_node.can_extend_left()
}
} else {
false
}
} else {
if self.node.can_extend_right() {
let next = self.kmer.into_kmer().push_right(self.node.right_nuc());
let cnext = next.canonical();
let dnext = next.raw() == cnext.raw();
let next_node = Node(
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
!next_node.is_visited()
&& if dnext {
next_node.can_extend_left()
} else {
next_node.can_extend_right()
}
} else {
false
}
}
}
pub fn walk(&self, graph: &GraphDeBruijn) -> Option<WalkState> {
if self.direct {
if self.node.can_extend_right() {
let next = self.kmer.into_kmer().push_right(self.node.right_nuc());
let cnext = next.canonical();
let dnext = next.raw() == cnext.raw();
let next_node = Node(
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
if next_node.is_visited() {
None
} else {
if dnext {
if next_node.can_extend_left() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
}
} else {
if next_node.can_extend_right() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
}
}
} else {
None
}
}
} else {
if self.node.can_extend_left() {
let next = self.kmer.into_kmer().push_left(self.node.left_nuc());
let cnext = next.canonical();
let dnext = next.raw() == cnext.raw();
let next_node = Node(
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
if next_node.is_visited() {
None
} else {
if dnext {
if next_node.can_extend_right() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
}
} else {
if next_node.can_extend_left() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
}
}
} else {
None
}
}
}
// ── GraphDeBruijn ───────────────────────────────────────────────────────────── // ── GraphDeBruijn ─────────────────────────────────────────────────────────────
pub struct GraphDeBruijn { pub struct GraphDeBruijn {
@@ -260,6 +411,52 @@ impl GraphDeBruijn {
} }
} }
pub fn start_walk(&self, kmer: CanonicalKmer) -> WalkState {
let node = Node(
self.nodes
.get(&kmer)
.expect("Cannot start a walk from not indexed kmer")
.load(Ordering::Relaxed),
);
debug_assert!(
!node.is_visited(),
"Cannot start a walk from a visited node"
);
WalkState::new(kmer, node, true)
}
pub fn try_start_walk(&self, kmer: CanonicalKmer) -> Option<WalkState> {
let node = Node(self.nodes.get(&kmer)?.load(Ordering::Relaxed));
if node.is_visited() {
return None;
}
Some(WalkState::new(kmer, node, true))
}
pub fn walk(&self, step: WalkState) -> Option<WalkState> {
if !step.leavable(self) {
return None;
}
let node = step.node;
let kmer = step.kmer.into_kmer();
let n_kmer = if step.direct {
kmer.push_right(node.right_nuc())
} else {
kmer.push_left(node.left_nuc())
};
let n_ckmer = n_kmer.canonical();
let n_direct = n_ckmer.raw() == n_kmer.raw();
let n_node = if let Some(node_val) = self.nodes.get(&n_ckmer) {
Node(node_val.load(Ordering::Relaxed))
} else {
unreachable!()
};
if n_node.is_visited() {
return None;
}
Some(WalkState::new(n_ckmer, n_node, n_direct))
}
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> { fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
let canonical = kmer.canonical(); let canonical = kmer.canonical();
let node = Node(self.nodes.get(&canonical)?.load(Ordering::Relaxed)); let node = Node(self.nodes.get(&canonical)?.load(Ordering::Relaxed));
@@ -454,17 +651,12 @@ impl GraphDeBruijn {
} }
fn is_start(&self, query: CanonicalKmer, node: Node) -> bool { fn is_start(&self, query: CanonicalKmer, node: Node) -> bool {
if !node.can_extend_left() { !WalkState {
return true; kmer: query,
node,
direct: true,
} }
let pred = query.into_kmer().push_left(node.left_nuc()).canonical(); .reachable(self)
self.nodes
.get(&pred)
.map(|a| {
let pred_node = Node(a.load(Ordering::Acquire));
!pred_node.can_extend_right()
})
.unwrap_or(false)
} }
pub fn try_for_each_unitig<E, F>(&self, f: F) -> Result<(), E> pub fn try_for_each_unitig<E, F>(&self, f: F) -> Result<(), E>
+19 -9
View File
@@ -1,5 +1,5 @@
use super::*; use super::*;
use obikseq::{k, set_k}; use obikseq::{k, set_k, unitig::Unitig};
// Build a graph from an ASCII sequence, inserting all canonical k-mers. // Build a graph from an ASCII sequence, inserting all canonical k-mers.
fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn { fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn {
@@ -22,6 +22,16 @@ fn canonical_kmers(seq: &[u8]) -> Vec<CanonicalKmer> {
v v
} }
fn collect_unitigs(g: &GraphDeBruijn) -> Vec<Unitig> {
let mut unitigs = Vec::new();
g.try_for_each_unitig(|nuc_iter| -> Result<(), std::convert::Infallible> {
unitigs.push(nuc_iter.collect());
Ok(())
})
.unwrap();
unitigs
}
// ── push / canonicalisation ─────────────────────────────────────────────── // ── push / canonicalisation ───────────────────────────────────────────────
#[test] #[test]
@@ -76,7 +86,7 @@ fn degrees_linear_chain_extensions() {
let seq = b"AAAAGGGG"; let seq = b"AAAAGGGG";
let g = graph_from_ascii(seq); let g = graph_from_ascii(seq);
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig"); assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len() // seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
assert_eq!( assert_eq!(
@@ -123,7 +133,7 @@ fn unitig_roundtrip_linear() {
} }
println!("Les unitig:"); println!("Les unitig:");
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
for unitig in &unitigs { for unitig in &unitigs {
println!("{}", String::from_utf8_lossy(&unitig.to_ascii())); println!("{}", String::from_utf8_lossy(&unitig.to_ascii()));
} }
@@ -149,7 +159,7 @@ fn unitig_roundtrip_longer_sequence() {
let seq = b"ACGTGGCTATCGAC"; let seq = b"ACGTGGCTATCGAC";
let g = graph_from_ascii(seq); let g = graph_from_ascii(seq);
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
let mut got = kmers_from_unitigs(&unitigs); let mut got = kmers_from_unitigs(&unitigs);
let mut expected = canonical_kmers(seq); let mut expected = canonical_kmers(seq);
got.sort_unstable(); got.sort_unstable();
@@ -166,7 +176,7 @@ fn unitig_isolated_node() {
let mut g = GraphDeBruijn::new(); let mut g = GraphDeBruijn::new();
g.push(kmer.canonical()); g.push(kmer.canonical());
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
assert_eq!(unitigs.len(), 1); assert_eq!(unitigs.len(), 1);
assert_eq!(unitigs[0].seql(), k); assert_eq!(unitigs[0].seql(), k);
} }
@@ -191,7 +201,7 @@ fn unitig_two_truly_distinct_isolated_nodes() {
g.push(Kmer::from_ascii(b"AAAAC").unwrap().canonical()); g.push(Kmer::from_ascii(b"AAAAC").unwrap().canonical());
g.push(Kmer::from_ascii(b"GGGGT").unwrap().canonical()); g.push(Kmer::from_ascii(b"GGGGT").unwrap().canonical());
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
// Each isolated node → one unitig of length k // Each isolated node → one unitig of length k
assert_eq!(unitigs.len(), 2); assert_eq!(unitigs.len(), 2);
assert!(unitigs.iter().all(|u| u.seql() == k)); assert!(unitigs.iter().all(|u| u.seql() == k));
@@ -206,7 +216,7 @@ fn no_kmer_lost_or_duplicated() {
let seq = b"ACGTACGTACGTTTTTACGTACGT"; let seq = b"ACGTACGTACGTTTTTACGTACGT";
let g = graph_from_ascii(seq); let g = graph_from_ascii(seq);
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
let got = kmers_from_unitigs(&unitigs); let got = kmers_from_unitigs(&unitigs);
let expected = canonical_kmers(seq); let expected = canonical_kmers(seq);
assert_eq!( assert_eq!(
@@ -231,7 +241,7 @@ fn cycle_kmers_not_lost() {
let seq = b"ACGTACGT"; let seq = b"ACGTACGT";
let g = graph_from_ascii(seq); let g = graph_from_ascii(seq);
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
let got = kmers_from_unitigs(&unitigs); let got = kmers_from_unitigs(&unitigs);
let expected = canonical_kmers(seq); let expected = canonical_kmers(seq);
assert_eq!(got.len(), expected.len(), "cycle k-mers lost"); assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
@@ -274,7 +284,7 @@ fn branching_graph_no_kmer_lost_or_duplicated() {
insert(b"TTCGTGGCTA"); // J-I-F (different J prefix) insert(b"TTCGTGGCTA"); // J-I-F (different J prefix)
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
let unitigs: Vec<Unitig> = g.iter_unitig().collect(); let unitigs: Vec<Unitig> = collect_unitigs(&g);
// Collect all k-mers from unitigs. // Collect all k-mers from unitigs.
let got = kmers_from_unitigs(&unitigs); let got = kmers_from_unitigs(&unitigs);