refactor: centralize k-mer config and introduce packed sequences
Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
This commit is contained in:
@@ -8,3 +8,7 @@ obikseq = { path = "../obikseq" }
|
||||
obifastwrite = { path = "../obifastwrite" }
|
||||
ahash = "0.8"
|
||||
hashbrown = "0.14"
|
||||
xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
|
||||
|
||||
[dev-dependencies]
|
||||
obikseq = { path = "../obikseq", features = ["test-utils"] }
|
||||
|
||||
@@ -0,0 +1,573 @@
|
||||
//use ahash::RandomState;
|
||||
use hashbrown::HashMap;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikseq::k;
|
||||
use obikseq::unitig::Unitig;
|
||||
use obikseq::{CanonicalKmer, Kmer, Sequence};
|
||||
use std::cell::Cell;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use xxhash_rust::xxh3::Xxh3Builder;
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
type FastHashMap<K, V> = HashMap<K, V, Xxh3Builder>;
|
||||
|
||||
// ── Node ──────────────────────────────────────────────────────────────────────
|
||||
//
|
||||
// bit layout (LSB first):
|
||||
// bit 0 : can_extend_right — exactly one right canonical neighbour exists
|
||||
// bit 1 : can_extend_left — exactly one left canonical neighbour exists
|
||||
// bit 2 : visited
|
||||
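// bits 3–4 : right payload — nucleotide index 0–3 (A/C/G/T) of the unique right neighbour when bit 0 = 1; otherwise the right-neighbour count stored as count.saturating_sub(1)
// bits 5–6 : left payload — nucleotide index 0–3 (A/C/G/T) of the unique left neighbour when bit 1 = 1; otherwise the left-neighbour count stored as count.saturating_sub(1)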
|
||||
// bit 7 : reserved (0)
|
||||
//
|
||||
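// "can_extend" = false covers both 0 neighbours and ≥2 neighbours. Unitig extension
// only needs "exactly one"; the payload bits keep the count so the two other cases
// can still be told apart (see n_right_neighbours / n_left_neighbours).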
|
||||
|
||||
/// Flag byte attached to every k-mer of the graph.
///
/// Bit layout (LSB first):
/// - bit 0    : exactly one right neighbour exists
/// - bit 1    : exactly one left neighbour exists
/// - bit 2    : visited
/// - bits 3–4 : right payload — nucleotide index (0=A, 1=C, 2=G, 3=T) when bit 0 is
///   set, otherwise `count.saturating_sub(1)` of the right neighbours
/// - bits 5–6 : left payload — same encoding, governed by bit 1
/// - bit 7    : unused
#[repr(transparent)]
#[derive(Debug, Clone, Copy, Default)]
pub struct Node(u8);

impl Node {
    /// Bit 0: exactly one right neighbour.
    const RIGHT_UNIQUE: u8 = 0b0000_0001;
    /// Bit 1: exactly one left neighbour.
    const LEFT_UNIQUE: u8 = 0b0000_0010;
    /// Bit 2: node already emitted by a traversal.
    const VISITED: u8 = 0b0000_0100;
    /// Position of the 2-bit right payload.
    const RIGHT_SHIFT: u32 = 3;
    /// Position of the 2-bit left payload.
    const LEFT_SHIFT: u32 = 5;
    /// Width mask of a payload once shifted down.
    const PAYLOAD: u8 = 0b11;

    /// Extracts the 2-bit payload located at `shift`.
    fn payload(self, shift: u32) -> u8 {
        (self.0 >> shift) & Self::PAYLOAD
    }

    /// Decodes a neighbour count: 1 when the "unique" flag is set, otherwise the
    /// payload `p` means 0 (p = 0) or `p + 1` (p = 1..=3).
    fn decode_count(unique: bool, payload: u8) -> u8 {
        if unique {
            1
        } else if payload == 0 {
            0
        } else {
            payload + 1
        }
    }

    /// Rewrites one side (flag bit + payload) from a neighbour count.
    fn store_side(&mut self, flag: u8, shift: u32, count: u8, nuc: Option<u8>) {
        // Wipe the previous flag and payload of this side only.
        self.0 &= !(flag | (Self::PAYLOAD << shift));
        if count == 1 {
            self.0 |= flag;
            match nuc {
                Some(n) => self.0 |= (n & Self::PAYLOAD) << shift,
                None => unreachable!("nuc must be Some when count is 1"),
            }
        } else {
            // 0 stays 0; 2, 3, 4 are stored as 1, 2, 3 (anything larger saturates at 3).
            self.0 |= count.saturating_sub(1).min(3) << shift;
        }
    }

    /// Returns `true` if the node can be extended to the right.
    ///
    /// A single right neighbour exists.
    pub fn can_extend_right(self) -> bool {
        self.0 & Self::RIGHT_UNIQUE != 0
    }

    /// Returns `true` if the node can be extended to the left.
    ///
    /// A single left neighbour exists.
    pub fn can_extend_left(self) -> bool {
        self.0 & Self::LEFT_UNIQUE != 0
    }

    /// Returns `true` if the node has been visited.
    pub fn is_visited(self) -> bool {
        self.0 & Self::VISITED != 0
    }

    /// Index of the unique right neighbour (0=A, 1=C, 2=G, 3=T).
    /// Only meaningful when `can_extend_right()` is true.
    pub fn right_nuc(self) -> u8 {
        self.payload(Self::RIGHT_SHIFT)
    }

    /// Index of the unique left neighbour (0=A, 1=C, 2=G, 3=T).
    /// Only meaningful when `can_extend_left()` is true.
    pub fn left_nuc(self) -> u8 {
        self.payload(Self::LEFT_SHIFT)
    }

    /// Marks the node as visited.
    ///
    /// # Panics
    ///
    /// Panics if the node is already marked visited.
    pub fn set_visited(&mut self) {
        if self.is_visited() {
            unreachable!("from: is_visited -> The node has already been visited")
        }
        self.0 |= Self::VISITED;
    }

    /// Number of left neighbours (0, 1, 2, 3 or 4).
    pub fn n_left_neighbours(self) -> u8 {
        Self::decode_count(self.can_extend_left(), self.payload(Self::LEFT_SHIFT))
    }

    /// Number of right neighbours (0, 1, 2, 3 or 4).
    pub fn n_right_neighbours(self) -> u8 {
        Self::decode_count(self.can_extend_right(), self.payload(Self::RIGHT_SHIFT))
    }

    /// Records the right-hand side of the node.
    ///
    /// `count == 1` → bit 0 is set and bits 3–4 receive the nucleotide index in `nuc`.
    /// Any other `count` → bit 0 is cleared and bits 3–4 receive
    /// `count.saturating_sub(1)`; `nuc` is ignored.
    ///
    /// # Panics
    ///
    /// Panics if `count == 1` and `nuc` is `None`.
    pub fn set_right(&mut self, count: u8, nuc: Option<u8>) {
        self.store_side(Self::RIGHT_UNIQUE, Self::RIGHT_SHIFT, count, nuc);
    }

    /// Records the left-hand side of the node.
    ///
    /// `count == 1` → bit 1 is set and bits 5–6 receive the nucleotide index in `nuc`.
    /// Any other `count` → bit 1 is cleared and bits 5–6 receive
    /// `count.saturating_sub(1)`; `nuc` is ignored.
    ///
    /// # Panics
    ///
    /// Panics if `count == 1` and `nuc` is `None`.
    pub fn set_left(&mut self, count: u8, nuc: Option<u8>) {
        self.store_side(Self::LEFT_UNIQUE, Self::LEFT_SHIFT, count, nuc);
    }
}
|
||||
|
||||
impl fmt::Display for Node {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
const NUC: [char; 4] = ['A', 'C', 'G', 'T'];
|
||||
let r = if self.can_extend_right() {
|
||||
format!("→{}", NUC[self.right_nuc() as usize])
|
||||
} else {
|
||||
format!("→{}", self.n_right_neighbours())
|
||||
};
|
||||
let l = if self.can_extend_left() {
|
||||
format!("←{}", NUC[self.left_nuc() as usize])
|
||||
} else {
|
||||
format!("←{}", self.n_left_neighbours())
|
||||
};
|
||||
let v = if self.is_visited() { "V" } else { "." };
|
||||
write!(f, "Node({r} {l} {v})")
|
||||
}
|
||||
}
|
||||
|
||||
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct GraphDeBruijn {
|
||||
nodes: FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
}
|
||||
|
||||
impl GraphDeBruijn {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
nodes: FastHashMap::with_hasher(Xxh3Builder::new()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
Self {
|
||||
nodes: FastHashMap::with_capacity_and_hasher(capacity, Xxh3Builder::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a canonical kmer into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: CanonicalKmer) {
|
||||
self.nodes
|
||||
.entry(kmer)
|
||||
.or_insert_with(|| Cell::new(Node::default()));
|
||||
}
|
||||
|
||||
/// For every node, find its unique right/left canonical neighbour (if any)
|
||||
/// and store the nucleotide index in the Node flags.
|
||||
///
|
||||
/// Single pass thanks to Cell interior mutability.
|
||||
pub fn compute_degrees(&self) {
|
||||
for (&kmer, cell) in &self.nodes {
|
||||
let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(), &self.nodes);
|
||||
let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(), &self.nodes);
|
||||
|
||||
let mut node = cell.get();
|
||||
node.set_right(rc, rn);
|
||||
node.set_left(lc, ln);
|
||||
cell.set(node);
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates over the right neighbors of `kmer`.
|
||||
pub fn iter_right_neighbors(
|
||||
&self,
|
||||
kmer: CanonicalKmer,
|
||||
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
kmer.right_canonical_neighbors()
|
||||
.into_iter()
|
||||
.filter_map(|kmer| {
|
||||
self.nodes.get(&kmer)?;
|
||||
Some(kmer)
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterates over the left neighbors of `kmer`.
|
||||
pub fn iter_left_neighbors(
|
||||
&self,
|
||||
kmer: CanonicalKmer,
|
||||
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
kmer.left_canonical_neighbors()
|
||||
.into_iter()
|
||||
.filter_map(|kmer| {
|
||||
self.nodes.get(&kmer)?;
|
||||
Some(kmer)
|
||||
})
|
||||
}
|
||||
|
||||
/// Reports whether `kmer` is marked visited.
///
/// Returns `None` when `kmer` is not a node of the graph.
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
    let cell = self.nodes.get(kmer)?;
    Some(cell.get().is_visited())
}
|
||||
|
||||
/// Marks `kmer` as visited; does nothing when `kmer` is not in the graph.
///
/// # Panics
///
/// Panics (through `Node::set_visited`) if the node is already marked visited.
pub fn set_visited(&self, kmer: CanonicalKmer) {
    let cell = match self.nodes.get(&kmer) {
        Some(cell) => cell,
        None => return,
    };
    let mut flags = cell.get();
    flags.set_visited();
    cell.set(flags);
}
|
||||
|
||||
/// Returns the single right neighbor of `kmer`, if it exists.
|
||||
pub fn the_single_right_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||
let node = self.nodes.get(&kmer)?.get();
|
||||
if !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
let next = kmer.into_kmer().push_right(node.right_nuc()).canonical();
|
||||
self.nodes.contains_key(&next).then_some(next)
|
||||
}
|
||||
|
||||
/// Returns the single left neighbor of `kmer`, if it exists.
|
||||
pub fn the_single_left_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||
let node = self.nodes.get(&kmer)?.get();
|
||||
if !node.can_extend_left() {
|
||||
return None;
|
||||
}
|
||||
let next = kmer.into_kmer().push_left(node.left_nuc()).canonical();
|
||||
self.nodes.contains_key(&next).then_some(next)
|
||||
}
|
||||
|
||||
/// Internal iterator over unitig-start nodes; drives `iter_unitig`.
|
||||
///
|
||||
/// MUST NOT be consumed standalone: the second pass finds cycle nodes only
|
||||
/// because `iter_unitig` lazily interleaves chain traversal between the two passes.
|
||||
///
|
||||
/// Two passes:
|
||||
/// 1. Chain ends / isolated nodes (at most one extension missing):
|
||||
/// - `!can_extend_left` → yield canonical form
|
||||
/// - `!can_extend_right` → yield reverse complement
|
||||
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
||||
fn start_iter(&self) -> impl Iterator<Item = (CanonicalKmer, Option<Kmer>)> + '_ {
|
||||
StartIter::new(self)
|
||||
}
|
||||
|
||||
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical();
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let direct = kmer.raw() == canonical.raw();
|
||||
|
||||
if (direct && !node.can_extend_right()) || (!direct && !node.can_extend_left()) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_c: CanonicalKmer = if direct {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc())
|
||||
.canonical()
|
||||
} else {
|
||||
canonical.into_kmer().push_left(node.left_nuc()).canonical()
|
||||
};
|
||||
|
||||
let cell = self.nodes.get(&next_c)?;
|
||||
let next_node = cell.get();
|
||||
if next_node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = oriented_next(kmer, next_c);
|
||||
let ndirect = oriented.raw() == next_c.raw();
|
||||
|
||||
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut updated = next_node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn next_longtig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical();
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let direct = kmer.raw() == canonical.raw();
|
||||
|
||||
if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_c: CanonicalKmer = if direct {
|
||||
if node.can_extend_right() {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc())
|
||||
.canonical()
|
||||
} else {
|
||||
self.iter_right_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
} else {
|
||||
if node.can_extend_left() {
|
||||
canonical.into_kmer().push_left(node.left_nuc()).canonical()
|
||||
} else {
|
||||
self.iter_left_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
};
|
||||
|
||||
let cell = self.nodes.get(&next_c)?;
|
||||
let next_node = cell.get();
|
||||
if next_node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = oriented_next(kmer, next_c);
|
||||
let ndirect = oriented.raw() == next_c.raw();
|
||||
|
||||
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut updated = next_node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> {
|
||||
LongtigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = k();
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
if let Some(next_c) = first_next {
|
||||
for kmer in self.iter_unitig_kmers(next_c) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn iter_longtig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = k();
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
if let Some(next_c) = first_next {
|
||||
for kmer in self.iter_longtig_kmers(next_c) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
})
|
||||
}
|
||||
|
||||
/// Write all unitigs to `out` in FASTA format.
|
||||
///
|
||||
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
||||
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
||||
pub fn write_fasta<W: io::Write>(&self, out: &mut W, unitig: bool) -> io::Result<()> {
|
||||
if unitig {
|
||||
for unitig in self.iter_unitig() {
|
||||
write_unitig(&unitig, k(), out)?;
|
||||
}
|
||||
} else {
|
||||
for unitig in self.iter_longtig() {
|
||||
write_unitig(&unitig, k(), out)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.nodes.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
// --- StartIter -----------------------------------------------------------------
|
||||
struct StartIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, Cell<Node>>,
|
||||
suspended: Vec<CanonicalKmer>,
|
||||
in_cycle_pass: bool,
|
||||
}
|
||||
|
||||
impl<'a> StartIter<'a> {
|
||||
fn new(graph: &'a GraphDeBruijn) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
nodes: graph.nodes.iter(),
|
||||
suspended: Vec::new(),
|
||||
in_cycle_pass: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for StartIter<'a> {
|
||||
type Item = (CanonicalKmer, Option<Kmer>);
|
||||
|
||||
fn next(&mut self) -> Option<(CanonicalKmer, Option<Kmer>)> {
|
||||
loop {
|
||||
let current = if let Some(k) = self.suspended.pop() {
|
||||
k
|
||||
} else {
|
||||
match self.nodes.next() {
|
||||
Some((&k, _)) => k,
|
||||
None => {
|
||||
if self.in_cycle_pass {
|
||||
return None;
|
||||
}
|
||||
self.in_cycle_pass = true;
|
||||
self.nodes = self.graph.nodes.iter();
|
||||
match self.nodes.next() {
|
||||
Some((&k, _)) => k,
|
||||
None => return None,
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let node = match self.graph.nodes.get(¤t) {
|
||||
Some(c) => c.get(),
|
||||
None => continue,
|
||||
};
|
||||
if node.is_visited() {
|
||||
continue;
|
||||
}
|
||||
if !self.in_cycle_pass && node.can_extend_left() {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.graph.set_visited(current);
|
||||
|
||||
if let Some(next) = self.graph.the_single_right_neighbor(current) {
|
||||
if self.graph.is_visited(&next).unwrap_or(true) {
|
||||
return Some((current, None));
|
||||
}
|
||||
self.graph.set_visited(next);
|
||||
let oriented = oriented_next(current.into_kmer(), next);
|
||||
return Some((current, Some(oriented)));
|
||||
}
|
||||
|
||||
let mut first_neighbor: Option<CanonicalKmer> = None;
|
||||
for neighbor in self.graph.iter_right_neighbors(current) {
|
||||
if self.graph.is_visited(&neighbor).unwrap_or(true) {
|
||||
continue;
|
||||
}
|
||||
if first_neighbor.is_none() {
|
||||
self.graph.set_visited(neighbor);
|
||||
first_neighbor = Some(neighbor);
|
||||
} else {
|
||||
self.suspended.push(neighbor);
|
||||
}
|
||||
}
|
||||
|
||||
let oriented = match first_neighbor {
|
||||
Some(neighbor) => Some(oriented_next(current.into_kmer(), neighbor)),
|
||||
None => None,
|
||||
};
|
||||
return Some((current, oriented));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||
|
||||
struct UnitigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
}
|
||||
|
||||
impl Iterator for UnitigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
self.current = self.graph.next_unitig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── LongtigIter ───────────────────────────────────────────────────────────────
|
||||
|
||||
struct LongtigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
}
|
||||
|
||||
impl Iterator for LongtigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
self.current = self.graph.next_longtig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Picks the orientation of `to` that follows `from`.
///
/// Returns the canonical form of `to` when `from.is_overlapping` accepts it, and
/// its reverse complement otherwise.
fn oriented_next(from: Kmer, to: CanonicalKmer) -> Kmer {
    let as_is = to.into_kmer();
    if from.is_overlapping(as_is) {
        return as_is;
    }
    to.revcomp()
}
|
||||
|
||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||
/// zero or ≥2 existing neighbours.
|
||||
fn count_neighbors(
|
||||
neighbors: [CanonicalKmer; 4],
|
||||
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
) -> (u8, Option<u8>) {
|
||||
let mut count = 0u8;
|
||||
let mut first = None;
|
||||
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||
if nodes.contains_key(neighbour) {
|
||||
count += 1;
|
||||
if first.is_none() {
|
||||
first = Some(i as u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
if count == 1 {
|
||||
(1, first)
|
||||
} else {
|
||||
(count, None)
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
#[cfg(test)]
|
||||
#[path = "tests/debruijn.rs"]
|
||||
mod tests;
|
||||
+2
-889
@@ -1,890 +1,3 @@
|
||||
use ahash::RandomState;
|
||||
use hashbrown::HashMap;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikseq::kmer::{self, CanonicalKmer, Kmer};
|
||||
use obikseq::unitig::Unitig;
|
||||
use std::cell::Cell;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
mod debruijn;
|
||||
|
||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
type FastHashMap<K, V> = HashMap<K, V, RandomState>;
|
||||
|
||||
// ── Node ──────────────────────────────────────────────────────────────────────
|
||||
//
|
||||
// bit layout (LSB first):
|
||||
// bit 0 : can_extend_right — exactly one right canonical neighbour exists
|
||||
// bit 1 : can_extend_left — exactly one left canonical neighbour exists
|
||||
// bit 2 : visited
|
||||
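// bits 3–4 : right payload — nucleotide index 0–3 (A/C/G/T) of the unique right neighbour when bit 0 = 1; otherwise the right-neighbour count stored as count.saturating_sub(1)
// bits 5–6 : left payload — nucleotide index 0–3 (A/C/G/T) of the unique left neighbour when bit 1 = 1; otherwise the left-neighbour count stored as count.saturating_sub(1)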
|
||||
// bit 7 : reserved (0)
|
||||
//
|
||||
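// "can_extend" = false covers both 0 neighbours and ≥2 neighbours. Unitig extension
// only needs "exactly one"; the payload bits keep the count so the two other cases
// can still be told apart (see n_right_neighbours / n_left_neighbours).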
|
||||
|
||||
/// Flag byte attached to every k-mer of the graph.
///
/// Bit layout (LSB first):
/// - bit 0    : exactly one right neighbour exists
/// - bit 1    : exactly one left neighbour exists
/// - bit 2    : visited
/// - bits 3–4 : right payload — nucleotide index (0=A, 1=C, 2=G, 3=T) when bit 0 is
///   set, otherwise `count.saturating_sub(1)` of the right neighbours
/// - bits 5–6 : left payload — same encoding, governed by bit 1
/// - bit 7    : unused
#[repr(transparent)]
#[derive(Debug, Clone, Copy, Default)]
pub struct Node(u8);

impl Node {
    /// Bit 0: exactly one right neighbour.
    const RIGHT_UNIQUE: u8 = 0b0000_0001;
    /// Bit 1: exactly one left neighbour.
    const LEFT_UNIQUE: u8 = 0b0000_0010;
    /// Bit 2: node already emitted by a traversal.
    const VISITED: u8 = 0b0000_0100;
    /// Position of the 2-bit right payload.
    const RIGHT_SHIFT: u32 = 3;
    /// Position of the 2-bit left payload.
    const LEFT_SHIFT: u32 = 5;
    /// Width mask of a payload once shifted down.
    const PAYLOAD: u8 = 0b11;

    /// Extracts the 2-bit payload located at `shift`.
    fn payload(self, shift: u32) -> u8 {
        (self.0 >> shift) & Self::PAYLOAD
    }

    /// Decodes a neighbour count: 1 when the "unique" flag is set, otherwise the
    /// payload `p` means 0 (p = 0) or `p + 1` (p = 1..=3).
    fn decode_count(unique: bool, payload: u8) -> u8 {
        if unique {
            1
        } else if payload == 0 {
            0
        } else {
            payload + 1
        }
    }

    /// Rewrites one side (flag bit + payload) from a neighbour count.
    fn store_side(&mut self, flag: u8, shift: u32, count: u8, nuc: Option<u8>) {
        // Wipe the previous flag and payload of this side only.
        self.0 &= !(flag | (Self::PAYLOAD << shift));
        if count == 1 {
            self.0 |= flag;
            match nuc {
                Some(n) => self.0 |= (n & Self::PAYLOAD) << shift,
                None => unreachable!("nuc must be Some when count is 1"),
            }
        } else {
            // 0 stays 0; 2, 3, 4 are stored as 1, 2, 3 (anything larger saturates at 3).
            self.0 |= count.saturating_sub(1).min(3) << shift;
        }
    }

    /// Returns `true` if the node can be extended to the right.
    ///
    /// A single right neighbour exists.
    pub fn can_extend_right(self) -> bool {
        self.0 & Self::RIGHT_UNIQUE != 0
    }

    /// Returns `true` if the node can be extended to the left.
    ///
    /// A single left neighbour exists.
    pub fn can_extend_left(self) -> bool {
        self.0 & Self::LEFT_UNIQUE != 0
    }

    /// Returns `true` if the node has been visited.
    pub fn is_visited(self) -> bool {
        self.0 & Self::VISITED != 0
    }

    /// Index of the unique right neighbour (0=A, 1=C, 2=G, 3=T).
    /// Only meaningful when `can_extend_right()` is true.
    pub fn right_nuc(self) -> u8 {
        self.payload(Self::RIGHT_SHIFT)
    }

    /// Index of the unique left neighbour (0=A, 1=C, 2=G, 3=T).
    /// Only meaningful when `can_extend_left()` is true.
    pub fn left_nuc(self) -> u8 {
        self.payload(Self::LEFT_SHIFT)
    }

    /// Marks the node as visited.
    ///
    /// # Panics
    ///
    /// Panics if the node is already marked visited.
    pub fn set_visited(&mut self) {
        if self.is_visited() {
            unreachable!("from: is_visited -> The node has already been visited")
        }
        self.0 |= Self::VISITED;
    }

    /// Number of left neighbours (0, 1, 2, 3 or 4).
    pub fn n_left_neighbours(self) -> u8 {
        Self::decode_count(self.can_extend_left(), self.payload(Self::LEFT_SHIFT))
    }

    /// Number of right neighbours (0, 1, 2, 3 or 4).
    pub fn n_right_neighbours(self) -> u8 {
        Self::decode_count(self.can_extend_right(), self.payload(Self::RIGHT_SHIFT))
    }

    /// Records the right-hand side of the node.
    ///
    /// `count == 1` → bit 0 is set and bits 3–4 receive the nucleotide index in `nuc`.
    /// Any other `count` → bit 0 is cleared and bits 3–4 receive
    /// `count.saturating_sub(1)`; `nuc` is ignored.
    ///
    /// # Panics
    ///
    /// Panics if `count == 1` and `nuc` is `None`.
    pub fn set_right(&mut self, count: u8, nuc: Option<u8>) {
        self.store_side(Self::RIGHT_UNIQUE, Self::RIGHT_SHIFT, count, nuc);
    }

    /// Records the left-hand side of the node.
    ///
    /// `count == 1` → bit 1 is set and bits 5–6 receive the nucleotide index in `nuc`.
    /// Any other `count` → bit 1 is cleared and bits 5–6 receive
    /// `count.saturating_sub(1)`; `nuc` is ignored.
    ///
    /// # Panics
    ///
    /// Panics if `count == 1` and `nuc` is `None`.
    pub fn set_left(&mut self, count: u8, nuc: Option<u8>) {
        self.store_side(Self::LEFT_UNIQUE, Self::LEFT_SHIFT, count, nuc);
    }
}
|
||||
|
||||
impl fmt::Display for Node {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
const NUC: [char; 4] = ['A', 'C', 'G', 'T'];
|
||||
let r = if self.can_extend_right() {
|
||||
format!("→{}", NUC[self.right_nuc() as usize])
|
||||
} else {
|
||||
format!("→{}", self.n_right_neighbours())
|
||||
};
|
||||
let l = if self.can_extend_left() {
|
||||
format!("←{}", NUC[self.left_nuc() as usize])
|
||||
} else {
|
||||
format!("←{}", self.n_left_neighbours())
|
||||
};
|
||||
let v = if self.is_visited() { "V" } else { "." };
|
||||
write!(f, "Node({r} {l} {v})")
|
||||
}
|
||||
}
|
||||
|
||||
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct GraphDeBruijn {
|
||||
nodes: FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
impl GraphDeBruijn {
|
||||
pub fn new(k: usize) -> Self {
|
||||
Self {
|
||||
nodes: FastHashMap::with_hasher(RandomState::new()),
|
||||
k,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(k: usize, capacity: usize) -> Self {
|
||||
Self {
|
||||
nodes: FastHashMap::with_capacity_and_hasher(capacity, RandomState::new()),
|
||||
k,
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert a canonical kmer into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: CanonicalKmer) {
|
||||
self.nodes
|
||||
.entry(kmer)
|
||||
.or_insert_with(|| Cell::new(Node::default()));
|
||||
}
|
||||
|
||||
/// For every node, find its unique right/left canonical neighbour (if any)
|
||||
/// and store the nucleotide index in the Node flags.
|
||||
///
|
||||
/// Single pass thanks to Cell interior mutability.
|
||||
pub fn compute_degrees(&self) {
|
||||
for (&kmer, cell) in &self.nodes {
|
||||
let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||
let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||
|
||||
let mut node = cell.get();
|
||||
node.set_right(rc, rn);
|
||||
node.set_left(lc, ln);
|
||||
cell.set(node);
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates over the right neighbors of `kmer`.
|
||||
pub fn iter_right_neighbors(
|
||||
&self,
|
||||
kmer: CanonicalKmer,
|
||||
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
kmer.right_canonical_neighbors(self.k)
|
||||
.into_iter()
|
||||
.filter_map(|kmer| {
|
||||
self.nodes.get(&kmer)?;
|
||||
Some(kmer)
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterates over the left neighbors of `kmer`.
|
||||
pub fn iter_left_neighbors(
|
||||
&self,
|
||||
kmer: CanonicalKmer,
|
||||
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
kmer.left_canonical_neighbors(self.k)
|
||||
.into_iter()
|
||||
.filter_map(|kmer| {
|
||||
self.nodes.get(&kmer)?;
|
||||
Some(kmer)
|
||||
})
|
||||
}
|
||||
|
||||
/// Reports whether `kmer` is marked visited.
///
/// Returns `None` when `kmer` is not a node of the graph.
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
    let cell = self.nodes.get(kmer)?;
    Some(cell.get().is_visited())
}
|
||||
|
||||
/// Marks `kmer` as visited; does nothing when `kmer` is not in the graph.
///
/// # Panics
///
/// Panics (through `Node::set_visited`) if the node is already marked visited.
pub fn set_visited(&self, kmer: CanonicalKmer) {
    let cell = match self.nodes.get(&kmer) {
        Some(cell) => cell,
        None => return,
    };
    let mut flags = cell.get();
    flags.set_visited();
    cell.set(flags);
}
|
||||
|
||||
/// Returns the single right neighbor of `kmer`, if it exists.
|
||||
pub fn the_single_right_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||
let node = self.nodes.get(&kmer)?.get();
|
||||
if !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
let next = kmer
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), self.k)
|
||||
.canonical(self.k);
|
||||
self.nodes.contains_key(&next).then_some(next)
|
||||
}
|
||||
|
||||
/// Returns the single left neighbor of `kmer`, if it exists.
|
||||
pub fn the_single_left_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||
let node = self.nodes.get(&kmer)?.get();
|
||||
if !node.can_extend_left() {
|
||||
return None;
|
||||
}
|
||||
let next = kmer
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), self.k)
|
||||
.canonical(self.k);
|
||||
self.nodes.contains_key(&next).then_some(next)
|
||||
}
|
||||
|
||||
/// Internal iterator over unitig-start nodes; drives `iter_unitig`.
|
||||
///
|
||||
/// MUST NOT be consumed standalone: the second pass finds cycle nodes only
|
||||
/// because `iter_unitig` lazily interleaves chain traversal between the two passes.
|
||||
///
|
||||
/// Two passes:
|
||||
/// 1. Chain ends / isolated nodes (at most one extension missing):
|
||||
/// - `!can_extend_left` → yield canonical form
|
||||
/// - `!can_extend_right` → yield reverse complement
|
||||
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
||||
fn start_iter(&self) -> impl Iterator<Item = (CanonicalKmer, Option<Kmer>)> + '_ {
|
||||
StartIter::new(self)
|
||||
}
|
||||
|
||||
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical(self.k);
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let direct = kmer.raw() == canonical.raw();
|
||||
|
||||
if (direct && !node.can_extend_right()) || (!direct && !node.can_extend_left()) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_c: CanonicalKmer = if direct {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
} else {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
};
|
||||
|
||||
let cell = self.nodes.get(&next_c)?;
|
||||
let next_node = cell.get();
|
||||
if next_node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = oriented_next(kmer, next_c, self.k);
|
||||
let ndirect = oriented.raw() == next_c.raw();
|
||||
|
||||
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut updated = next_node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn next_longtig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
let k = self.k;
|
||||
let canonical = kmer.canonical(k);
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let direct = kmer.raw() == canonical.raw();
|
||||
|
||||
if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_c: CanonicalKmer = if direct {
|
||||
if node.can_extend_right() {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), k)
|
||||
.canonical(k)
|
||||
} else {
|
||||
self.iter_right_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
} else {
|
||||
if node.can_extend_left() {
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), k)
|
||||
.canonical(k)
|
||||
} else {
|
||||
self.iter_left_neighbors(canonical)
|
||||
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||
.next()?
|
||||
}
|
||||
};
|
||||
|
||||
let cell = self.nodes.get(&next_c)?;
|
||||
let next_node = cell.get();
|
||||
if next_node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = oriented_next(kmer, next_c, self.k);
|
||||
let ndirect = oriented.raw() == next_c.raw();
|
||||
|
||||
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut updated = next_node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(oriented)
|
||||
}
|
||||
|
||||
fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
}
|
||||
|
||||
fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> {
|
||||
LongtigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = self.k;
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
if let Some(next_c) = first_next {
|
||||
for kmer in self.iter_unitig_kmers(next_c) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn iter_longtig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||
let k = self.k;
|
||||
self.start_iter().map(move |(start, first_next)| {
|
||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||
if let Some(next_c) = first_next {
|
||||
for kmer in self.iter_longtig_kmers(next_c) {
|
||||
nucs.push(kmer.nucleotide(k - 1));
|
||||
}
|
||||
}
|
||||
Unitig::from_nucleotides(&nucs)
|
||||
})
|
||||
}
|
||||
|
||||
/// Write all unitigs to `out` in FASTA format.
|
||||
///
|
||||
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
||||
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
||||
pub fn write_fasta<W: io::Write>(&self, out: &mut W, unitig: bool) -> io::Result<()> {
|
||||
if unitig {
|
||||
for unitig in self.iter_unitig() {
|
||||
write_unitig(&unitig, self.k, out)?;
|
||||
}
|
||||
} else {
|
||||
for unitig in self.iter_longtig() {
|
||||
write_unitig(&unitig, self.k, out)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.nodes.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
// --- StartIter -----------------------------------------------------------------
|
||||
struct StartIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, Cell<Node>>,
|
||||
suspended: Vec<CanonicalKmer>,
|
||||
in_cycle_pass: bool,
|
||||
}
|
||||
|
||||
impl<'a> StartIter<'a> {
|
||||
fn new(graph: &'a GraphDeBruijn) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
nodes: graph.nodes.iter(),
|
||||
suspended: Vec::new(),
|
||||
in_cycle_pass: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for StartIter<'a> {
|
||||
type Item = (CanonicalKmer, Option<Kmer>);
|
||||
|
||||
fn next(&mut self) -> Option<(CanonicalKmer, Option<Kmer>)> {
|
||||
loop {
|
||||
let current = if let Some(k) = self.suspended.pop() {
|
||||
k
|
||||
} else {
|
||||
match self.nodes.next() {
|
||||
Some((&k, _)) => k,
|
||||
None => {
|
||||
if self.in_cycle_pass {
|
||||
return None;
|
||||
}
|
||||
self.in_cycle_pass = true;
|
||||
self.nodes = self.graph.nodes.iter();
|
||||
match self.nodes.next() {
|
||||
Some((&k, _)) => k,
|
||||
None => return None,
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let node = match self.graph.nodes.get(¤t) {
|
||||
Some(c) => c.get(),
|
||||
None => continue,
|
||||
};
|
||||
if node.is_visited() {
|
||||
continue;
|
||||
}
|
||||
if !self.in_cycle_pass && node.can_extend_left() {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.graph.set_visited(current);
|
||||
|
||||
if let Some(next) = self.graph.the_single_right_neighbor(current) {
|
||||
if self.graph.is_visited(&next).unwrap_or(true) {
|
||||
return Some((current, None));
|
||||
}
|
||||
self.graph.set_visited(next);
|
||||
let oriented = oriented_next(current.into_kmer(), next, self.graph.k);
|
||||
return Some((current, Some(oriented)));
|
||||
}
|
||||
|
||||
let mut first_neighbor: Option<CanonicalKmer> = None;
|
||||
for neighbor in self.graph.iter_right_neighbors(current) {
|
||||
if self.graph.is_visited(&neighbor).unwrap_or(true) {
|
||||
continue;
|
||||
}
|
||||
if first_neighbor.is_none() {
|
||||
self.graph.set_visited(neighbor);
|
||||
first_neighbor = Some(neighbor);
|
||||
} else {
|
||||
self.suspended.push(neighbor);
|
||||
}
|
||||
}
|
||||
|
||||
let oriented = match first_neighbor {
|
||||
Some(neighbor) => Some(oriented_next(current.into_kmer(), neighbor, self.graph.k)),
|
||||
None => None,
|
||||
};
|
||||
return Some((current, oriented));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||
|
||||
struct UnitigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
}
|
||||
|
||||
impl Iterator for UnitigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
self.current = self.graph.next_unitig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── LongtigIter ───────────────────────────────────────────────────────────────
|
||||
|
||||
struct LongtigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
}
|
||||
|
||||
impl Iterator for LongtigIter<'_> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
self.current = self.graph.next_longtig_kmer(current);
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Chooses the strand of `to` used to continue from `from`.
///
/// Returns `to` as stored when `from.is_overlapping` accepts it, and its
/// reverse complement otherwise.
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
    let as_stored = to.into_kmer();
    if from.is_overlapping(as_stored, k) {
        return as_stored;
    }
    to.revcomp(k)
}
|
||||
|
||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||
/// zero or ≥2 existing neighbours.
|
||||
fn count_neighbors(
|
||||
neighbors: [CanonicalKmer; 4],
|
||||
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
) -> (u8, Option<u8>) {
|
||||
let mut count = 0u8;
|
||||
let mut first = None;
|
||||
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||
if nodes.contains_key(neighbour) {
|
||||
count += 1;
|
||||
if first.is_none() {
|
||||
first = Some(i as u8);
|
||||
}
|
||||
}
|
||||
}
|
||||
if count == 1 {
|
||||
(1, first)
|
||||
} else {
|
||||
(count, None)
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Build a graph from an ASCII sequence, inserting all canonical k-mers.
|
||||
fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn {
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
for i in 0..=seq.len().saturating_sub(k) {
|
||||
g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k));
|
||||
}
|
||||
g
|
||||
}
|
||||
|
||||
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
|
||||
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = (0..=seq.len().saturating_sub(k))
|
||||
.map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k))
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
v.dedup();
|
||||
v
|
||||
}
|
||||
|
||||
// ── push / canonicalisation ───────────────────────────────────────────────
|
||||
|
||||
#[test]
fn push_deduplicates_revcomp() {
    // Both strands of the same k-mer are pushed in canonical form; the graph
    // must end up with a single node.
    let k = 5;
    let forward = Kmer::from_ascii(b"ACGTA", k).unwrap();
    let reverse = forward.revcomp(k);
    let mut graph = GraphDeBruijn::new(k);
    graph.push(forward.canonical(k));
    graph.push(reverse.canonical(k));
    assert_eq!(graph.len(), 1, "kmer and its revcomp must map to the same node");
}
|
||||
|
||||
#[test]
fn push_palindrome_single_node() {
    // ACGT equals its own reverse complement; the precondition is asserted
    // before the graph is touched.
    let k = 4;
    let palindrome = Kmer::from_ascii(b"ACGT", k).unwrap();
    let flipped = palindrome.revcomp(k);
    assert_eq!(palindrome, flipped, "test requires a palindrome");

    let mut graph = GraphDeBruijn::new(k);
    graph.push(palindrome.canonical(k));
    assert_eq!(graph.len(), 1);
}
|
||||
|
||||
// ── compute_degrees on a linear chain ────────────────────────────────────
|
||||
|
||||
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
|
||||
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
|
||||
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<CanonicalKmer>) {
|
||||
let seq = b"AAAAGGGG";
|
||||
let g = graph_from_ascii(seq, k);
|
||||
let kmers = canonical_kmers(seq, k);
|
||||
(g, kmers)
|
||||
}
|
||||
|
||||
#[test]
fn degrees_linear_chain_node_count() {
    // The graph must hold one node per distinct canonical k-mer of the chain.
    let (graph, expected) = linear_chain_graph(5);
    assert_eq!(graph.len(), expected.len());
}
|
||||
|
||||
#[test]
fn degrees_linear_chain_extensions() {
    // A linear chain must come out as a single unitig covering every k-mer.
    // Note: start_iter is not consumed on its own here — its second pass only
    // finds true cycle nodes when interleaved with chain traversal, which is
    // what iter_unitig does.
    let k = 5;
    let seq = b"AAAAGGGG";
    let graph = graph_from_ascii(seq, k);
    graph.compute_degrees();

    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");

    // Expected length: k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
    let spanned = unitigs[0].seql();
    assert_eq!(spanned, seq.len(), "unitig spans the full sequence");

    let recovered = kmers_from_unitigs(&unitigs, k);
    let inserted = canonical_kmers(seq, k);
    assert_eq!(recovered, inserted, "unitig k-mers must equal inserted k-mers");
}
|
||||
|
||||
// ── unitig reconstruction ─────────────────────────────────────────────────
|
||||
|
||||
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = unitigs
|
||||
.iter()
|
||||
.flat_map(|u| u.iter_canonical_kmers(k))
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
v.dedup();
|
||||
v
|
||||
}
|
||||
|
||||
#[test]
fn unitig_roundtrip_linear() {
    // Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k.
    // ACCTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain.
    let k = 5;
    let seq = b"ACCTGGCTA";
    let graph = graph_from_ascii(seq, k);
    graph.compute_degrees();

    // Diagnostic dump of every node and its flags (shown when the test fails).
    println!("Les kmers:");
    for (kmer, cell) in graph.nodes.iter() {
        let ascii = kmer.to_ascii(k);
        println!("{}: {}", String::from_utf8_lossy(&ascii), cell.get());
    }

    // Diagnostic dump of the reconstructed unitigs.
    println!("Les unitig:");
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    for unitig in unitigs.iter() {
        let ascii = unitig.to_ascii();
        println!("{}", String::from_utf8_lossy(&ascii));
    }

    assert_eq!(
        unitigs.len(),
        1,
        "linear chain → exactly one unitig {:?}",
        unitigs
    );
    let recovered = kmers_from_unitigs(&unitigs, k);
    let inserted = canonical_kmers(seq, k);
    assert_eq!(
        recovered, inserted,
        "unitig must contain exactly the inserted k-mers"
    );
}
|
||||
|
||||
#[test]
fn unitig_roundtrip_longer_sequence() {
    // Longer non-repetitive sequence with no repeated k-mer of length k.
    // ACGTGGCTATCGAC with k=5 → 10 distinct k-mers, one linear chain.
    let k = 5;
    let seq = b"ACGTGGCTATCGAC";
    let g = graph_from_ascii(seq, k);
    g.compute_degrees();
    let unitigs: Vec<Unitig> = g.iter_unitig().collect();

    // Both helpers already return sorted, deduplicated vectors, so the two
    // sides can be compared directly without sorting again.
    let got = kmers_from_unitigs(&unitigs, k);
    let expected = canonical_kmers(seq, k);
    assert_eq!(got, expected);
}
|
||||
|
||||
#[test]
fn unitig_isolated_node() {
    // A graph holding a single k-mer: exactly one unitig, of length k.
    let k = 5;
    let lone = Kmer::from_ascii(b"ACGTA", k).unwrap().canonical(k);
    let mut graph = GraphDeBruijn::new(k);
    graph.push(lone);
    graph.compute_degrees();

    let lengths: Vec<usize> = graph.iter_unitig().map(|u| u.seql()).collect();
    assert_eq!(lengths.len(), 1);
    assert_eq!(lengths[0], k);
}
|
||||
|
||||
#[test]
fn unitig_two_isolated_nodes() {
    // Despite the test name, AAAAA and TTTTT are reverse complements of each
    // other: they share one canonical form and collapse into a single node.
    let k = 5;
    let inputs: [&[u8]; 2] = [b"AAAAA", b"TTTTT"];
    let mut graph = GraphDeBruijn::new(k);
    for &ascii in inputs.iter() {
        graph.push(Kmer::from_ascii(ascii, k).unwrap().canonical(k));
    }
    assert_eq!(graph.len(), 1);
}
|
||||
|
||||
#[test]
fn unitig_two_truly_distinct_isolated_nodes() {
    // Two unrelated k-mers: each must come out as its own unitig of length k.
    let k = 5;
    let inputs: [&[u8]; 2] = [b"AAAAC", b"GGGGT"];
    let mut graph = GraphDeBruijn::new(k);
    for &ascii in inputs.iter() {
        graph.push(Kmer::from_ascii(ascii, k).unwrap().canonical(k));
    }
    graph.compute_degrees();

    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    assert_eq!(unitigs.len(), 2);
    assert!(unitigs.iter().all(|u| u.seql() == k));
}
|
||||
|
||||
// ── all k-mers covered, none duplicated ───────────────────────────────────
|
||||
|
||||
#[test]
fn no_kmer_lost_or_duplicated() {
    // Repetitive sequence, k=7: the set of canonical k-mers read back from the
    // unitigs must equal the set that was inserted. Both sides are sorted and
    // deduplicated by their helpers before comparison.
    let k = 7;
    let seq = b"ACGTACGTACGTTTTTACGTACGT";
    let graph = graph_from_ascii(seq, k);
    graph.compute_degrees();
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();

    let (got, expected) = (kmers_from_unitigs(&unitigs, k), canonical_kmers(seq, k));
    assert_eq!(
        got.len(),
        expected.len(),
        "kmer count mismatch: got {}, expected {}",
        got.len(),
        expected.len()
    );
    assert_eq!(got, expected, "kmer sets differ");
}
|
||||
|
||||
// ── cycle coverage ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn cycle_kmers_not_lost() {
    // ACGTACGT with k=5 forms a pure cycle: ACGTA→CGTAC→GTACG→TACGT→ACGTA.
    // The first pass of start_iter yields nothing (every node is internal);
    // the second pass picks up cycle entries. All 4 k-mers must appear in
    // the unitigs.
    let k = 5;
    let seq = b"ACGTACGT";
    let graph = graph_from_ascii(seq, k);
    graph.compute_degrees();
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();

    let (got, expected) = (kmers_from_unitigs(&unitigs, k), canonical_kmers(seq, k));
    assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
    assert_eq!(got, expected);
}
|
||||
|
||||
// ── branching graph ───────────────────────────────────────────────────────
|
||||
//
|
||||
// Topology (k=5): two sources A,B converge at C; chain C-D-E-F;
|
||||
// F branches to G and H; H continues H-M-N; second source J feeds I-F.
|
||||
// Every k-mer must appear in exactly one unitig (no duplication, no loss).
|
||||
#[test]
fn branching_graph_no_kmer_lost_or_duplicated() {
    // Build sequences that realise the topology without accidental overlaps.
    // Each "node" is a distinct 5-mer; edges share a 4-mer suffix/prefix.
    let k: usize = 5;

    // Chains that realise the topology:
    //   A-C        (A→C share 4-mer overlap)
    //   B-C        (B→C share 4-mer overlap, different prefix)
    //   C-D-E-F
    //   F-G        (F→G)
    //   F-H-M-N    (F→H→M→N)
    //   J-I-F      (J→I→F)
    // Single source of truth: used both for insertion and for the expectation.
    let sequences: [&[u8]; 6] = [
        b"AACGTGGCTA",  // A-C-D … part of the right branch
        b"TACGTGGCTA",  // B-C-D … merges at C (same C-suffix)
        b"CGTGGCTACG",  // continues D-E-F-G
        b"CGTGGCTACC",  // F-H branch (different last base)
        b"GTGGCTACCGT", // H-M-N continuation
        b"TTCGTGGCTA",  // J-I-F (different J prefix)
    ];

    // Insert every k-length window; `windows` cannot slice out of range.
    let mut g = GraphDeBruijn::new(k);
    for seq in sequences.iter() {
        for window in seq.windows(k) {
            g.push(Kmer::from_ascii(window, k).unwrap().canonical(k));
        }
    }

    g.compute_degrees();
    let unitigs: Vec<Unitig> = g.iter_unitig().collect();

    // Collect all k-mers from unitigs.
    let got = kmers_from_unitigs(&unitigs, k);

    // Collect all distinct canonical k-mers inserted.
    let mut expected: Vec<CanonicalKmer> = sequences
        .iter()
        .flat_map(|seq| canonical_kmers(seq, k))
        .collect();
    expected.sort_unstable();
    expected.dedup();

    assert_eq!(
        got.len(),
        expected.len(),
        "k-mer count mismatch: got {}, expected {}",
        got.len(),
        expected.len()
    );
    assert_eq!(got, expected, "k-mer sets differ");
}
|
||||
}
|
||||
pub use debruijn::GraphDeBruijn;
|
||||
|
||||
@@ -0,0 +1,301 @@
|
||||
use super::*;
|
||||
use obikseq::{k, set_k};
|
||||
|
||||
// Build a graph from an ASCII sequence, inserting all canonical k-mers.
|
||||
fn graph_from_ascii(seq: &[u8]) -> GraphDeBruijn {
|
||||
let mut g = GraphDeBruijn::new();
|
||||
let k = k();
|
||||
for i in 0..=seq.len().saturating_sub(k) {
|
||||
g.push(Kmer::from_ascii(&seq[i..i + k]).unwrap().canonical());
|
||||
}
|
||||
g
|
||||
}
|
||||
|
||||
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
|
||||
fn canonical_kmers(seq: &[u8]) -> Vec<CanonicalKmer> {
|
||||
let k = k();
|
||||
let mut v: Vec<CanonicalKmer> = (0..=seq.len().saturating_sub(k))
|
||||
.map(|i| Kmer::from_ascii(&seq[i..i + k]).unwrap().canonical())
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
v.dedup();
|
||||
v
|
||||
}
|
||||
|
||||
// ── push / canonicalisation ───────────────────────────────────────────────
|
||||
|
||||
#[test]
fn push_deduplicates_revcomp() {
    // Both strands of the same k-mer are pushed in canonical form; the graph
    // must end up with a single node.
    set_k(5);
    let forward = Kmer::from_ascii(b"ACGTA").unwrap();
    let reverse = forward.revcomp();
    let mut graph = GraphDeBruijn::new();
    graph.push(forward.canonical());
    graph.push(reverse.canonical());
    assert_eq!(graph.len(), 1, "kmer and its revcomp must map to the same node");
}
|
||||
|
||||
#[test]
fn push_palindrome_single_node() {
    // ACGT equals its own reverse complement; the precondition is asserted
    // before the graph is touched.
    // NOTE(review): other tests in this file call set_k(5) while this one uses
    // 4 — if the configured k is process-wide rather than per-thread, parallel
    // test runs could interfere; confirm how set_k behaves under the test harness.
    set_k(4);
    let palindrome = Kmer::from_ascii(b"ACGT").unwrap();
    let flipped = palindrome.revcomp();
    assert_eq!(palindrome, flipped, "test requires a palindrome");

    let mut graph = GraphDeBruijn::new();
    graph.push(palindrome.canonical());
    assert_eq!(graph.len(), 1);
}
|
||||
|
||||
// ── compute_degrees on a linear chain ────────────────────────────────────
|
||||
|
||||
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
|
||||
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
|
||||
fn linear_chain_graph() -> (GraphDeBruijn, Vec<CanonicalKmer>) {
|
||||
let seq = b"AAAAGGGG";
|
||||
let g = graph_from_ascii(seq);
|
||||
let kmers = canonical_kmers(seq);
|
||||
(g, kmers)
|
||||
}
|
||||
|
||||
#[test]
fn degrees_linear_chain_node_count() {
    // The graph must hold one node per distinct canonical k-mer of the chain.
    set_k(5);
    let (graph, expected) = linear_chain_graph();
    assert_eq!(graph.len(), expected.len());
}
|
||||
|
||||
#[test]
fn degrees_linear_chain_extensions() {
    // A linear chain must come out as a single unitig covering every k-mer.
    // Note: start_iter is not consumed on its own here — its second pass only
    // finds true cycle nodes when interleaved with chain traversal, which is
    // what iter_unitig does.
    set_k(5);
    let seq = b"AAAAGGGG";
    let graph = graph_from_ascii(seq);
    graph.compute_degrees();

    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");

    // Expected length: k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
    let spanned = unitigs[0].seql();
    assert_eq!(spanned, seq.len(), "unitig spans the full sequence");

    let recovered = kmers_from_unitigs(&unitigs);
    let inserted = canonical_kmers(seq);
    assert_eq!(recovered, inserted, "unitig k-mers must equal inserted k-mers");
}
|
||||
|
||||
// ── unitig reconstruction ─────────────────────────────────────────────────
|
||||
|
||||
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig]) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = unitigs
|
||||
.iter()
|
||||
.flat_map(|u| u.iter_canonical_kmers())
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
v.dedup();
|
||||
v
|
||||
}
|
||||
|
||||
#[test]
fn unitig_roundtrip_linear() {
    // Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k.
    // ACCTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain.
    set_k(5);
    let seq = b"ACCTGGCTA";
    let graph = graph_from_ascii(seq);
    graph.compute_degrees();

    // Diagnostic dump of every node and its flags (shown when the test fails).
    println!("Les kmers:");
    for (kmer, cell) in graph.nodes.iter() {
        let ascii = kmer.to_ascii();
        println!("{}: {}", String::from_utf8_lossy(&ascii), cell.get());
    }

    // Diagnostic dump of the reconstructed unitigs.
    println!("Les unitig:");
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    for unitig in unitigs.iter() {
        let ascii = unitig.to_ascii();
        println!("{}", String::from_utf8_lossy(&ascii));
    }

    assert_eq!(
        unitigs.len(),
        1,
        "linear chain → exactly one unitig {:?}",
        unitigs
    );
    let recovered = kmers_from_unitigs(&unitigs);
    let inserted = canonical_kmers(seq);
    assert_eq!(
        recovered, inserted,
        "unitig must contain exactly the inserted k-mers"
    );
}
|
||||
|
||||
#[test]
fn unitig_roundtrip_longer_sequence() {
    // Longer non-repetitive sequence with no repeated k-mer of length k.
    // ACGTGGCTATCGAC with k=5 → 10 distinct k-mers, one linear chain.
    let k = 5;
    set_k(k);
    let seq = b"ACGTGGCTATCGAC";
    let g = graph_from_ascii(seq);
    g.compute_degrees();
    let unitigs: Vec<Unitig> = g.iter_unitig().collect();

    // Both helpers already return sorted, deduplicated vectors, so the two
    // sides can be compared directly without sorting again.
    let got = kmers_from_unitigs(&unitigs);
    let expected = canonical_kmers(seq);
    assert_eq!(got, expected);
}
|
||||
|
||||
#[test]
fn unitig_isolated_node() {
    // A graph holding a single k-mer: exactly one unitig, of length k.
    let k = 5;
    set_k(k);
    let lone = Kmer::from_ascii(b"ACGTA").unwrap().canonical();
    let mut graph = GraphDeBruijn::new();
    graph.push(lone);
    graph.compute_degrees();

    let lengths: Vec<usize> = graph.iter_unitig().map(|u| u.seql()).collect();
    assert_eq!(lengths.len(), 1);
    assert_eq!(lengths[0], k);
}
|
||||
|
||||
#[test]
fn unitig_two_isolated_nodes() {
    // Despite the test name, AAAAA and TTTTT are reverse complements of each
    // other: they share one canonical form and collapse into a single node.
    set_k(5);
    let inputs: [&[u8]; 2] = [b"AAAAA", b"TTTTT"];
    let mut graph = GraphDeBruijn::new();
    for &ascii in inputs.iter() {
        graph.push(Kmer::from_ascii(ascii).unwrap().canonical());
    }
    assert_eq!(graph.len(), 1);
}
|
||||
|
||||
#[test]
fn unitig_two_truly_distinct_isolated_nodes() {
    // Two unrelated k-mers: each must come out as its own unitig of length k.
    let k = 5;
    set_k(k);
    let inputs: [&[u8]; 2] = [b"AAAAC", b"GGGGT"];
    let mut graph = GraphDeBruijn::new();
    for &ascii in inputs.iter() {
        graph.push(Kmer::from_ascii(ascii).unwrap().canonical());
    }
    graph.compute_degrees();

    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();
    assert_eq!(unitigs.len(), 2);
    assert!(unitigs.iter().all(|u| u.seql() == k));
}
|
||||
|
||||
// ── all k-mers covered, none duplicated ───────────────────────────────────
|
||||
|
||||
#[test]
fn no_kmer_lost_or_duplicated() {
    // Repetitive sequence, k=7: the set of canonical k-mers read back from the
    // unitigs must equal the set that was inserted. Both sides are sorted and
    // deduplicated by their helpers before comparison.
    // NOTE(review): other tests in this file call set_k(5) while this one uses
    // 7 — if the configured k is process-wide rather than per-thread, parallel
    // test runs could interfere; confirm how set_k behaves under the test harness.
    set_k(7);
    let seq = b"ACGTACGTACGTTTTTACGTACGT";
    let graph = graph_from_ascii(seq);
    graph.compute_degrees();
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();

    let (got, expected) = (kmers_from_unitigs(&unitigs), canonical_kmers(seq));
    assert_eq!(
        got.len(),
        expected.len(),
        "kmer count mismatch: got {}, expected {}",
        got.len(),
        expected.len()
    );
    assert_eq!(got, expected, "kmer sets differ");
}
|
||||
|
||||
// ── cycle coverage ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn cycle_kmers_not_lost() {
    // ACGTACGT with k=5 forms a pure cycle: ACGTA→CGTAC→GTACG→TACGT→ACGTA.
    // The first pass of start_iter yields nothing (every node is internal);
    // the second pass picks up cycle entries. All 4 k-mers must appear in
    // the unitigs.
    set_k(5);
    let seq = b"ACGTACGT";
    let graph = graph_from_ascii(seq);
    graph.compute_degrees();
    let unitigs = graph.iter_unitig().collect::<Vec<Unitig>>();

    let (got, expected) = (kmers_from_unitigs(&unitigs), canonical_kmers(seq));
    assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
    assert_eq!(got, expected);
}
|
||||
|
||||
// ── branching graph ───────────────────────────────────────────────────────
|
||||
//
|
||||
// Topology (k=5): two sources A,B converge at C; chain C-D-E-F;
|
||||
// F branches to G and H; H continues H-M-N; second source J feeds I-F.
|
||||
// Every k-mer must appear in exactly one unitig (no duplication, no loss).
|
||||
#[test]
fn branching_graph_no_kmer_lost_or_duplicated() {
    // Build sequences that realise the topology without accidental overlaps.
    // Each "node" is a distinct 5-mer; edges share a 4-mer suffix/prefix.
    let k: usize = 5;
    set_k(k);

    // Chains that realise the topology:
    //   A-C        (A→C share 4-mer overlap)
    //   B-C        (B→C share 4-mer overlap, different prefix)
    //   C-D-E-F
    //   F-G        (F→G)
    //   F-H-M-N    (F→H→M→N)
    //   J-I-F      (J→I→F)
    // Single source of truth: used both for insertion and for the expectation.
    let sequences: [&[u8]; 6] = [
        b"AACGTGGCTA",  // A-C-D … part of the right branch
        b"TACGTGGCTA",  // B-C-D … merges at C (same C-suffix)
        b"CGTGGCTACG",  // continues D-E-F-G
        b"CGTGGCTACC",  // F-H branch (different last base)
        b"GTGGCTACCGT", // H-M-N continuation
        b"TTCGTGGCTA",  // J-I-F (different J prefix)
    ];

    // Insert every k-length window; `windows` cannot slice out of range.
    let mut g = GraphDeBruijn::new();
    for seq in sequences.iter() {
        for window in seq.windows(k) {
            g.push(Kmer::from_ascii(window).unwrap().canonical());
        }
    }

    g.compute_degrees();
    let unitigs: Vec<Unitig> = g.iter_unitig().collect();

    // Collect all k-mers from unitigs.
    let got = kmers_from_unitigs(&unitigs);

    // Collect all distinct canonical k-mers inserted.
    let mut expected: Vec<CanonicalKmer> = sequences
        .iter()
        .flat_map(|seq| canonical_kmers(seq))
        .collect();
    expected.sort_unstable();
    expected.dedup();

    assert_eq!(
        got.len(),
        expected.len(),
        "k-mer count mismatch: got {}, expected {}",
        got.len(),
        expected.len()
    );
    assert_eq!(got, expected, "k-mer sets differ");
}
|
||||
Reference in New Issue
Block a user