Push pvqkqxlkkwry #17

Merged
coissac merged 6 commits from push-pvqkqxlkkwry into main 2026-06-06 04:44:11 +00:00
2 changed files with 82 additions and 282 deletions
Showing only changes of commit b39eee688a - Show all commits
+67 -254
View File
@@ -1,11 +1,9 @@
//use ahash::RandomState; //use ahash::RandomState;
use hashbrown::HashMap; use hashbrown::HashMap;
use obikseq::k; use obikseq::k;
use obikseq::{CanonicalKmer, Kmer, Sequence}; use obikseq::{CanonicalKmer, Sequence};
use rayon::prelude::*; use rayon::prelude::*;
use std::fmt; use std::fmt;
use std::mem::needs_drop;
use std::os::unix::raw::gid_t;
use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::atomic::{AtomicU8, Ordering};
use xxhash_rust::xxh3::Xxh3Builder; use xxhash_rust::xxh3::Xxh3Builder;
@@ -197,85 +195,35 @@ impl WalkState {
.leavable(graph) .leavable(graph)
} }
pub fn walk(&self, graph: &GraphDeBruijn) -> Option<WalkState> { pub fn walk(&self, graph: &GraphDeBruijn) -> Option<(WalkState, u8)> {
if self.direct { if self.direct {
if self.node.can_extend_right() { if !self.node.can_extend_right() {
let next = self.kmer.into_kmer().push_right(self.node.right_nuc()); return None;
}
let nuc = self.node.right_nuc();
let next = self.kmer.into_kmer().push_right(nuc);
let cnext = next.canonical(); let cnext = next.canonical();
let dnext = next.raw() == cnext.raw(); let dnext = next.raw() == cnext.raw();
let next_node = Node( let next_node = Node(graph.nodes.get(&cnext).unwrap().load(Ordering::Relaxed));
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
if next_node.is_visited() { if next_node.is_visited() {
None return None;
} else {
if dnext {
if next_node.can_extend_left() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
} }
let reachable = if dnext { next_node.can_extend_left() } else { next_node.can_extend_right() };
reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, nuc))
} else { } else {
if next_node.can_extend_right() { if !self.node.can_extend_left() {
Some(WalkState { return None;
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
} }
} let nuc = self.node.left_nuc();
} else { let next = self.kmer.into_kmer().push_left(nuc);
None
}
}
} else {
if self.node.can_extend_left() {
let next = self.kmer.into_kmer().push_left(self.node.left_nuc());
let cnext = next.canonical(); let cnext = next.canonical();
let dnext = next.raw() == cnext.raw(); let dnext = next.raw() != cnext.raw();
let next_node = Node( let next_node = Node(graph.nodes.get(&cnext).unwrap().load(Ordering::Relaxed));
graph
.nodes
.get(&cnext)
.unwrap()
.load(std::sync::atomic::Ordering::Relaxed),
);
if next_node.is_visited() { if next_node.is_visited() {
None return None;
} else {
if dnext {
if next_node.can_extend_right() {
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
} }
} else { let reachable = if dnext { next_node.can_extend_right() } else { next_node.can_extend_left() };
if next_node.can_extend_left() { reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, 3 - nuc))
Some(WalkState {
kmer: cnext,
node: next_node,
direct: dnext,
})
} else {
None
}
}
} else {
None
} }
} }
} }
@@ -347,7 +295,6 @@ impl GraphDeBruijn {
} }
if self.is_start(*kmer, node) { if self.is_start(*kmer, node) {
node.set_start(); node.set_start();
node.set_visited();
atomic.store(node.0, Ordering::Relaxed); atomic.store(node.0, Ordering::Relaxed);
} }
}); });
@@ -387,98 +334,15 @@ impl GraphDeBruijn {
Some(WalkState::new(kmer, node, true)) Some(WalkState::new(kmer, node, true))
} }
pub fn walk(&self, step: WalkState) -> Option<WalkState> { fn unitig_nucleotides(&self, kmer: CanonicalKmer, k: usize) -> Option<UnitigNucIter<'_>> {
if !step.leavable(self) { let old = self.nodes.get(&kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
return None; if old & IS_VISITED_MASK != 0 { return None; }
} let start = WalkState::new(kmer, Node(old), true);
let node = step.node; let next_step = start.walk(self).and_then(|(next_state, nuc)| {
let kmer = step.kmer.into_kmer(); let ext_old = self.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
let n_kmer = if step.direct { (ext_old & IS_VISITED_MASK == 0).then_some((next_state, nuc))
kmer.push_right(node.right_nuc()) });
} else { Some(UnitigNucIter { graph: self, start: kmer, pos: 0, k, next_step })
kmer.push_left(node.left_nuc())
};
let n_ckmer = n_kmer.canonical();
let n_direct = n_ckmer.raw() == n_kmer.raw();
let n_node = if let Some(node_val) = self.nodes.get(&n_ckmer) {
Node(node_val.load(Ordering::Relaxed))
} else {
unreachable!()
};
if n_node.is_visited() {
return None;
}
Some(WalkState::new(n_ckmer, n_node, n_direct))
}
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
let canonical = kmer.canonical();
let node = Node(self.nodes.get(&canonical)?.load(Ordering::Relaxed));
let direct = kmer.raw() == canonical.raw();
if (direct && !node.can_extend_right()) || (!direct && !node.can_extend_left()) {
return None;
}
let next_c: CanonicalKmer = if direct {
canonical
.into_kmer()
.push_right(node.right_nuc())
.canonical()
} else {
canonical.into_kmer().push_left(node.left_nuc()).canonical()
};
let atomic = self.nodes.get(&next_c)?;
let next_node = Node(atomic.load(Ordering::Relaxed));
if next_node.is_visited() {
return None;
}
let oriented = oriented_next(kmer, next_c);
let ndirect = oriented.raw() == next_c.raw();
if (ndirect && next_node.n_right_neighbours() > 1)
|| (!ndirect && next_node.n_left_neighbours() > 1)
{
return None;
}
if !try_claim(atomic) {
return None;
}
Some(oriented)
}
fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
UnitigIter {
graph: self,
current: Some(start),
}
}
fn unitig_nucleotides(&self, start: CanonicalKmer, node: Node, k: usize) -> UnitigNucIter<'_> {
let chain = if node.can_extend_right() {
let next_c = start.into_kmer().push_right(node.right_nuc()).canonical();
self.nodes.get(&next_c).and_then(|next_a| {
let old = next_a.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
if old & IS_VISITED_MASK == 0 {
let oriented = oriented_next(start.into_kmer(), next_c);
Some(self.iter_unitig_kmers(oriented))
} else {
None
}
})
} else {
None
};
UnitigNucIter {
start,
pos: 0,
k,
chain,
}
} }
pub fn for_each_unitig(&self, f: impl Fn(UnitigNucIter<'_>) + Sync) { pub fn for_each_unitig(&self, f: impl Fn(UnitigNucIter<'_>) + Sync) {
@@ -495,20 +359,22 @@ impl GraphDeBruijn {
self.nodes self.nodes
.par_iter() .par_iter()
.filter_map(|(&kmer, atomic)| { .filter_map(|(&kmer, atomic)| {
let node = Node(atomic.load(Ordering::Acquire)); Node(atomic.load(Ordering::Acquire)).is_start().then_some(kmer)
node.is_start().then_some((kmer, node))
}) })
.for_each(|(start, node)| { .for_each(|kmer| {
if let Some(iter) = self.unitig_nucleotides(kmer, k) {
n_new.fetch_add(1, Ordering::Relaxed); n_new.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(start, node, k)); f(iter);
}
}); });
#[cfg(any(test, feature = "test-utils"))] #[cfg(any(test, feature = "test-utils"))]
self.nodes.iter().for_each(|(&kmer, atomic)| { self.nodes.iter().for_each(|(&kmer, atomic)| {
let node = Node(atomic.load(Ordering::Acquire)); if Node(atomic.load(Ordering::Acquire)).is_start() {
if node.is_start() { if let Some(iter) = self.unitig_nucleotides(kmer, k) {
n_new.fetch_add(1, Ordering::Relaxed); n_new.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(kmer, node, k)); f(iter);
}
} }
}); });
@@ -528,25 +394,18 @@ impl GraphDeBruijn {
if node.is_visited() { if node.is_visited() {
continue; continue;
} }
let start = if !node.can_extend_right() && node.can_extend_left() { let chain_start = self.find_chain_start(kmer);
self.find_left_chain_start(kmer) if let Some(iter) = self.unitig_nucleotides(chain_start, k) {
} else {
kmer
};
let start_atomic = &self.nodes[&start];
let old = start_atomic.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
if old & IS_VISITED_MASK == 0 {
n2.fetch_add(1, Ordering::Relaxed); n2.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(start, Node(old), k)); f(iter);
} }
// Fallback: if kmer was not reached by start's chain, claim it directly. // Fallback: if kmer was not reached by start's chain, claim it directly.
// Safe because unitig_nucleotides(start, ...) may have visited kmer in the // Safe because unitig_nucleotides may have visited kmer in the
// meantime — in that case fetch_or returns IS_VISITED_MASK set and we skip. // meantime — in that case it returns None and we skip.
if start != kmer { if chain_start != kmer {
let kmer_old = atomic.fetch_or(IS_VISITED_MASK, Ordering::AcqRel); if let Some(iter) = self.unitig_nucleotides(kmer, k) {
if kmer_old & IS_VISITED_MASK == 0 {
n2.fetch_add(1, Ordering::Relaxed); n2.fetch_add(1, Ordering::Relaxed);
f(self.unitig_nucleotides(kmer, Node(kmer_old), k)); f(iter);
} }
} }
} }
@@ -570,37 +429,21 @@ impl GraphDeBruijn {
self.nodes.extend(other.nodes); self.nodes.extend(other.nodes);
} }
/// Returns `true` if `query` is a unitig start node: fn find_chain_start(&self, kmer: CanonicalKmer) -> CanonicalKmer {
/// - no unique left predecessor (`!can_extend_left`), or let node = Node(self.nodes[&kmer].load(Ordering::Acquire));
/// - unique left predecessor exists but cannot extend right let mut state = WalkState::new(kmer, node, false);
/// (i.e., no chain traversal from the left can reach `start`). let mut seen = std::collections::HashSet::new();
fn find_left_chain_start(&self, kmer: CanonicalKmer) -> CanonicalKmer { seen.insert((state.kmer.raw(), state.direct));
let mut current = kmer;
loop { loop {
let node = Node(self.nodes[&current].load(Ordering::Acquire)); match state.walk(self) {
if !node.can_extend_left() { None => return state.kmer,
return current; Some((next, _)) => {
if !seen.insert((next.kmer.raw(), next.direct)) {
return kmer;
} }
let pred = current.into_kmer().push_left(node.left_nuc()).canonical(); state = next;
let Some(pred_a) = self.nodes.get(&pred) else {
return current;
};
let pred_node = Node(pred_a.load(Ordering::Acquire));
if pred_node.is_visited() {
return current;
} }
if !pred_node.can_extend_right() {
return current;
} }
// Stop if asymmetry: pred's right canonical neighbor is not current
let pred_right = pred
.into_kmer()
.push_right(pred_node.right_nuc())
.canonical();
if pred_right != current {
return current;
}
current = pred;
} }
} }
@@ -640,30 +483,14 @@ impl GraphDeBruijn {
} }
} }
// ── UnitigIter ────────────────────────────────────────────────────────────────
struct UnitigIter<'a> {
graph: &'a GraphDeBruijn,
current: Option<Kmer>,
}
impl Iterator for UnitigIter<'_> {
type Item = Kmer;
fn next(&mut self) -> Option<Kmer> {
let current = self.current?;
self.current = self.graph.next_unitig_kmer(current);
Some(current)
}
}
// ── UnitigNucIter ───────────────────────────────────────────────────────────── // ── UnitigNucIter ─────────────────────────────────────────────────────────────
pub struct UnitigNucIter<'a> { pub struct UnitigNucIter<'a> {
graph: &'a GraphDeBruijn,
start: CanonicalKmer, start: CanonicalKmer,
pos: usize, pos: usize,
k: usize, k: usize,
chain: Option<UnitigIter<'a>>, next_step: Option<(WalkState, u8)>,
} }
impl Iterator for UnitigNucIter<'_> { impl Iterator for UnitigNucIter<'_> {
@@ -674,33 +501,19 @@ impl Iterator for UnitigNucIter<'_> {
let nuc = self.start.nucleotide(self.pos); let nuc = self.start.nucleotide(self.pos);
self.pos += 1; self.pos += 1;
Some(nuc) Some(nuc)
} else if let Some((state, nuc)) = self.next_step.take() {
self.next_step = state.walk(self.graph).and_then(|(next_state, next_nuc)| {
let old = self.graph.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
(old & IS_VISITED_MASK == 0).then_some((next_state, next_nuc))
});
Some(nuc)
} else { } else {
self.chain None
.as_mut()?
.next()
.map(|kmer| kmer.nucleotide(self.k - 1))
} }
} }
fn size_hint(&self) -> (usize, Option<usize>) { fn size_hint(&self) -> (usize, Option<usize>) {
(self.k - self.pos, None) (self.k - self.pos.min(self.k), None)
}
}
// ── helpers ───────────────────────────────────────────────────────────────────
/// Atomically set the visited bit. Returns `true` iff this call claimed the node.
#[inline]
fn try_claim(atomic: &AtomicU8) -> bool {
atomic.fetch_or(IS_VISITED_MASK, Ordering::AcqRel) & IS_VISITED_MASK == 0
}
fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
let direct = to.into_kmer();
if from.is_overlapping(direct) {
direct
} else {
to.revcomp()
} }
} }
+4 -17
View File
@@ -1,5 +1,5 @@
use super::*; use super::*;
use obikseq::{k, set_k, unitig::Unitig}; use obikseq::{k, set_k, unitig::Unitig, Kmer};
// Build a graph from an ASCII sequence, inserting all canonical k-mers. // Build a graph from an ASCII sequence, inserting all canonical k-mers.
fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn { fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn {
@@ -116,27 +116,14 @@ fn kmers_from_unitigs(unitigs: &[Unitig]) -> Vec<CanonicalKmer> {
#[test] #[test]
fn unitig_roundtrip_linear() { fn unitig_roundtrip_linear() {
// Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k. // AAAAAGGGC with k=5 → 5 distinct k-mers, all in direct canonical form,
// ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain. // forming a clean linear chain with no orientation flips.
let k = 5; let k = 5;
set_k(k); set_k(k);
let seq = b"ACCTGGCTA"; let seq = b"AAAAAGGGC";
let g = graph_from_ascii(seq); let g = graph_from_ascii(seq);
g.compute_degrees_and_mark_starts(); g.compute_degrees_and_mark_starts();
println!("Les kmers:");
for (kmer, v) in g.nodes.iter() {
println!(
"{}: {}",
String::from_utf8_lossy(&kmer.to_ascii()),
v.load(std::sync::atomic::Ordering::Relaxed)
);
}
println!("Les unitig:");
let unitigs: Vec<Unitig> = collect_unitigs(&g); let unitigs: Vec<Unitig> = collect_unitigs(&g);
for unitig in &unitigs {
println!("{}", String::from_utf8_lossy(&unitig.to_ascii()));
}
assert_eq!( assert_eq!(
unitigs.len(), unitigs.len(),
1, 1,