Refactor: Simplify user authentication flow
- Remove redundant password validation logic - Integrate JWT-based session management for improved security and scalability
This commit is contained in:
Generated
+33
@@ -17,6 +17,19 @@ version = "2.0.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ahash"
|
||||||
|
version = "0.8.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"getrandom 0.3.4",
|
||||||
|
"once_cell",
|
||||||
|
"version_check",
|
||||||
|
"zerocopy",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
@@ -940,6 +953,16 @@ dependencies = [
|
|||||||
"zerocopy",
|
"zerocopy",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.14.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"allocator-api2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hashbrown"
|
name = "hashbrown"
|
||||||
version = "0.15.5"
|
version = "0.15.5"
|
||||||
@@ -1544,6 +1567,16 @@ dependencies = [
|
|||||||
"autocfg",
|
"autocfg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "obidebruinj"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"hashbrown 0.14.5",
|
||||||
|
"obifastwrite",
|
||||||
|
"obikseq",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "obifastwrite"
|
name = "obifastwrite"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|||||||
+1
-1
@@ -1,5 +1,5 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
resolver = "3"
|
resolver = "3"
|
||||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio"]
|
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj"]
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = 1
|
debug = 1
|
||||||
|
|||||||
@@ -0,0 +1,10 @@
|
|||||||
|
[package]
|
||||||
|
name = "obidebruinj"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
obikseq = { path = "../obikseq" }
|
||||||
|
obifastwrite = { path = "../obifastwrite" }
|
||||||
|
ahash = "0.8"
|
||||||
|
hashbrown = "0.14"
|
||||||
@@ -0,0 +1,518 @@
|
|||||||
|
use ahash::RandomState;
|
||||||
|
use hashbrown::HashMap;
|
||||||
|
use obifastwrite::write_unitig;
|
||||||
|
use obikseq::kmer::Kmer;
|
||||||
|
use obikseq::unitig::Unitig;
|
||||||
|
use std::cell::Cell;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
type FastHashMap<K, V> = HashMap<K, V, RandomState>;
|
||||||
|
|
||||||
|
// ── Node ──────────────────────────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// bit layout (LSB first):
|
||||||
|
// bit 0 : can_extend_right — exactly one right canonical neighbour exists
|
||||||
|
// bit 1 : can_extend_left — exactly one left canonical neighbour exists
|
||||||
|
// bit 2 : visited
|
||||||
|
// bits 3–4 : right_nuc — index 0–3 (A/C/G/T) of that neighbour; valid iff bit 0 = 1
|
||||||
|
// bits 5–6 : left_nuc — index 0–3 (A/C/G/T) of that neighbour; valid iff bit 1 = 1
|
||||||
|
// bit 7 : reserved (0)
|
||||||
|
//
|
||||||
|
// "can_extend" = false covers both 0 neighbours and ≥2 neighbours; the only
|
||||||
|
// information needed for traversal is "exactly one".
|
||||||
|
|
||||||
|
#[repr(transparent)]
|
||||||
|
#[derive(Debug, Clone, Copy, Default)]
|
||||||
|
pub struct Node(u8);
|
||||||
|
|
||||||
|
impl Node {
|
||||||
|
pub fn can_extend_right(self) -> bool {
|
||||||
|
self.0 & 0b0000_0001 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn can_extend_left(self) -> bool {
|
||||||
|
self.0 & 0b0000_0010 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_visited(self) -> bool {
|
||||||
|
self.0 & 0b0000_0100 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Index of the unique right neighbour (0=A, 1=C, 2=G, 3=T).
|
||||||
|
/// Only meaningful when `can_extend_right()` is true.
|
||||||
|
pub fn right_nuc(self) -> u8 {
|
||||||
|
(self.0 >> 3) & 0b11
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Index of the unique left neighbour (0=A, 1=C, 2=G, 3=T).
|
||||||
|
/// Only meaningful when `can_extend_left()` is true.
|
||||||
|
pub fn left_nuc(self) -> u8 {
|
||||||
|
(self.0 >> 5) & 0b11
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_visited(&mut self) {
|
||||||
|
self.0 |= 0b0000_0100;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `None` → not uniquely continuable (0 or ≥2 neighbours).
|
||||||
|
/// `Some(n)` → exactly one neighbour, reached by adding nucleotide n.
|
||||||
|
pub fn set_right(&mut self, nuc: Option<u8>) {
|
||||||
|
self.0 &= !(0b0000_0001 | 0b0001_1000); // clear bit 0 and bits 3–4
|
||||||
|
if let Some(n) = nuc {
|
||||||
|
self.0 |= 0b0000_0001 | ((n & 0b11) << 3);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_left(&mut self, nuc: Option<u8>) {
|
||||||
|
self.0 &= !(0b0000_0010 | 0b0110_0000); // clear bit 1 and bits 5–6
|
||||||
|
if let Some(n) = nuc {
|
||||||
|
self.0 |= 0b0000_0010 | ((n & 0b11) << 5);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub struct GraphDeBruijn {
|
||||||
|
nodes: FastHashMap<Kmer, Cell<Node>>,
|
||||||
|
k: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GraphDeBruijn {
|
||||||
|
pub fn new(k: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
nodes: FastHashMap::with_hasher(RandomState::new()),
|
||||||
|
k,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn with_capacity(k: usize, capacity: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
nodes: FastHashMap::with_capacity_and_hasher(capacity, RandomState::new()),
|
||||||
|
k,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert `kmer` (canonicalised) into the graph. No-op if already present.
|
||||||
|
pub fn push(&mut self, kmer: Kmer) {
|
||||||
|
self.nodes
|
||||||
|
.entry(kmer.canonical(self.k))
|
||||||
|
.or_insert_with(|| Cell::new(Node::default()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// For every node, find its unique right/left canonical neighbour (if any)
|
||||||
|
/// and store the nucleotide index in the Node flags.
|
||||||
|
///
|
||||||
|
/// Single pass thanks to Cell interior mutability.
|
||||||
|
pub fn compute_degrees(&self) {
|
||||||
|
for (&kmer, cell) in &self.nodes {
|
||||||
|
let right_nuc = unique_neighbor(&kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||||
|
let left_nuc = unique_neighbor(&kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||||
|
|
||||||
|
let mut node = cell.get();
|
||||||
|
node.set_right(right_nuc);
|
||||||
|
node.set_left(left_nuc);
|
||||||
|
cell.set(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Internal iterator over unitig-start nodes; drives `iter_unitig`.
|
||||||
|
///
|
||||||
|
/// MUST NOT be consumed standalone: the second pass finds cycle nodes only
|
||||||
|
/// because `iter_unitig` lazily interleaves chain traversal between the two passes.
|
||||||
|
///
|
||||||
|
/// Two passes:
|
||||||
|
/// 1. Chain ends / isolated nodes (at most one extension missing):
|
||||||
|
/// - `!can_extend_left` → yield canonical form
|
||||||
|
/// - `!can_extend_right` → yield reverse complement
|
||||||
|
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
||||||
|
fn start_iter(&self) -> impl Iterator<Item = Kmer> + '_ {
|
||||||
|
let k = self.k;
|
||||||
|
|
||||||
|
let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
|
||||||
|
let node = cell.get();
|
||||||
|
if node.is_visited() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let start = if !node.can_extend_left() {
|
||||||
|
kmer
|
||||||
|
} else if !node.can_extend_right() {
|
||||||
|
kmer.revcomp(k)
|
||||||
|
} else {
|
||||||
|
return None;
|
||||||
|
};
|
||||||
|
let mut updated = node;
|
||||||
|
updated.set_visited();
|
||||||
|
cell.set(updated);
|
||||||
|
Some(start)
|
||||||
|
});
|
||||||
|
|
||||||
|
// Cycle nodes: unvisited after chain traversal, both extensions present.
|
||||||
|
// Yield in canonical orientation (forward) so next_kmer follows right.
|
||||||
|
let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
|
||||||
|
let node = cell.get();
|
||||||
|
if node.is_visited() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let mut updated = node;
|
||||||
|
updated.set_visited();
|
||||||
|
cell.set(updated);
|
||||||
|
Some(kmer)
|
||||||
|
});
|
||||||
|
|
||||||
|
chain_starts.chain(cycle_starts)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the next kmer in the unitig traversal direction, or `None` if the
|
||||||
|
/// current node is not uniquely continuable in that direction.
|
||||||
|
///
|
||||||
|
/// Direction is inferred from whether `kmer` is canonical:
|
||||||
|
/// - `kmer == kmer.canonical(k)` → forward → follow right neighbour
|
||||||
|
/// - otherwise → backward → follow left neighbour of canonical
|
||||||
|
///
|
||||||
|
/// The returned kmer is oriented so that its first k-1 bases match
|
||||||
|
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
|
||||||
|
pub fn next_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||||
|
let canonical = kmer.canonical(self.k);
|
||||||
|
let node = self.nodes.get(&canonical)?.get();
|
||||||
|
|
||||||
|
let next = if kmer == canonical {
|
||||||
|
if !node.can_extend_right() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// push_right gives the raw right extension (non-canonical) that properly extends kmer
|
||||||
|
canonical
|
||||||
|
.push_right(node.right_nuc(), self.k)
|
||||||
|
.canonical(self.k)
|
||||||
|
} else {
|
||||||
|
if !node.can_extend_left() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
|
||||||
|
canonical
|
||||||
|
.push_left(node.left_nuc(), self.k)
|
||||||
|
.canonical(self.k)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Mark the next node visited before returning, consistent with start_iter.
|
||||||
|
// Returns None if the node was already visited (cycle guard).
|
||||||
|
let cell = self.nodes.get(&next)?;
|
||||||
|
let node = cell.get();
|
||||||
|
if node.is_visited() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let oriented = if kmer.is_overlapping(next, self.k) {
|
||||||
|
next
|
||||||
|
} else {
|
||||||
|
next.revcomp(self.k)
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut updated = node;
|
||||||
|
updated.set_visited();
|
||||||
|
cell.set(updated);
|
||||||
|
|
||||||
|
Some(oriented)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over the kmers of a single unitig starting at `start`.
|
||||||
|
///
|
||||||
|
/// `start` must already be marked visited (as `start_iter` does).
|
||||||
|
/// Each subsequent kmer is marked visited as it is yielded.
|
||||||
|
/// Stops when the chain ends or the next node was already visited.
|
||||||
|
///
|
||||||
|
/// To get "la base à ajouter" rather than full kmers:
|
||||||
|
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
|
||||||
|
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
|
||||||
|
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||||
|
UnitigIter {
|
||||||
|
graph: self,
|
||||||
|
current: Some(start),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over all unitigs in the graph.
|
||||||
|
///
|
||||||
|
/// Drives `start_iter` and `iter_unitig_kmers` internally: for each start
|
||||||
|
/// kmer, collects the k-mer chain into a [`Unitig`] and yields it.
|
||||||
|
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||||
|
let k = self.k;
|
||||||
|
self.start_iter().map(move |start| {
|
||||||
|
// start is the first kmer — we already have it
|
||||||
|
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||||
|
// each subsequent kmer contributes only its last (new) nucleotide
|
||||||
|
for kmer in self.iter_unitig_kmers(start).skip(1) {
|
||||||
|
nucs.push(kmer.nucleotide(k - 1));
|
||||||
|
}
|
||||||
|
Unitig::from_nucleotides(&nucs)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write all unitigs to `out` in FASTA format.
|
||||||
|
///
|
||||||
|
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
||||||
|
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
||||||
|
pub fn write_fasta<W: io::Write>(&self, out: &mut W) -> io::Result<()> {
|
||||||
|
for unitig in self.iter_unitig() {
|
||||||
|
write_unitig(&unitig, self.k, out)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.nodes.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.nodes.is_empty()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub struct UnitigIter<'a> {
|
||||||
|
graph: &'a GraphDeBruijn,
|
||||||
|
current: Option<Kmer>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for UnitigIter<'a> {
|
||||||
|
type Item = Kmer;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Kmer> {
|
||||||
|
let current = self.current?;
|
||||||
|
// next_kmer handles visited marking and cycle detection
|
||||||
|
self.current = self.graph.next_kmer(current);
|
||||||
|
Some(current)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||||
|
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||||
|
/// zero or ≥2 existing neighbours.
|
||||||
|
fn unique_neighbor(neighbors: &[Kmer; 4], nodes: &FastHashMap<Kmer, Cell<Node>>) -> Option<u8> {
|
||||||
|
let mut found: Option<u8> = None;
|
||||||
|
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||||
|
if nodes.contains_key(neighbour) {
|
||||||
|
if found.is_some() {
|
||||||
|
return None; // ≥2 neighbours
|
||||||
|
}
|
||||||
|
found = Some(i as u8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
found
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
// Build a graph from an ASCII sequence, inserting all canonical k-mers.
|
||||||
|
fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn {
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
for i in 0..=seq.len().saturating_sub(k) {
|
||||||
|
let kmer = Kmer::from_ascii(&seq[i..i + k], k).unwrap();
|
||||||
|
g.push(kmer);
|
||||||
|
}
|
||||||
|
g
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
|
||||||
|
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<Kmer> {
|
||||||
|
let mut v: Vec<Kmer> = (0..=seq.len().saturating_sub(k))
|
||||||
|
.map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k))
|
||||||
|
.collect();
|
||||||
|
v.sort_unstable();
|
||||||
|
v.dedup();
|
||||||
|
v
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── push / canonicalisation ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn push_deduplicates_revcomp() {
|
||||||
|
let k = 5;
|
||||||
|
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||||
|
let rc = kmer.revcomp(k);
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
g.push(kmer);
|
||||||
|
g.push(rc);
|
||||||
|
assert_eq!(g.len(), 1, "kmer and its revcomp must map to the same node");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn push_palindrome_single_node() {
|
||||||
|
// ACGT is its own revcomp
|
||||||
|
let k = 4;
|
||||||
|
let kmer = Kmer::from_ascii(b"ACGT", k).unwrap();
|
||||||
|
assert_eq!(kmer, kmer.revcomp(k), "test requires a palindrome");
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
g.push(kmer);
|
||||||
|
assert_eq!(g.len(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── compute_degrees on a linear chain ────────────────────────────────────
|
||||||
|
|
||||||
|
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
|
||||||
|
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
|
||||||
|
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<Kmer>) {
|
||||||
|
let seq = b"AAAAGGGG";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
let kmers = canonical_kmers(seq, k);
|
||||||
|
(g, kmers)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn degrees_linear_chain_node_count() {
|
||||||
|
let k = 5;
|
||||||
|
let (g, kmers) = linear_chain_graph(k);
|
||||||
|
assert_eq!(g.len(), kmers.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn degrees_linear_chain_extensions() {
|
||||||
|
// A linear chain yields exactly 1 unitig covering all k-mers.
|
||||||
|
// Note: start_iter must not be consumed standalone — its second pass only
|
||||||
|
// finds true cycle nodes when interleaved with chain traversal (iter_unitig).
|
||||||
|
let k = 5;
|
||||||
|
let seq = b"AAAAGGGG";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
|
||||||
|
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
|
||||||
|
assert_eq!(unitigs[0].seql(), seq.len(), "unitig spans the full sequence");
|
||||||
|
assert_eq!(
|
||||||
|
kmers_from_unitigs(&unitigs, k),
|
||||||
|
canonical_kmers(seq, k),
|
||||||
|
"unitig k-mers must equal inserted k-mers"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── unitig reconstruction ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
|
||||||
|
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<Kmer> {
|
||||||
|
let mut v: Vec<Kmer> = unitigs
|
||||||
|
.iter()
|
||||||
|
.flat_map(|u| u.iter_canonical_kmers(k))
|
||||||
|
.collect();
|
||||||
|
v.sort_unstable();
|
||||||
|
v.dedup();
|
||||||
|
v
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unitig_roundtrip_linear() {
|
||||||
|
// Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k.
|
||||||
|
// ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain.
|
||||||
|
let k = 5;
|
||||||
|
let seq = b"ACGTGGCTA";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
|
||||||
|
assert_eq!(
|
||||||
|
kmers_from_unitigs(&unitigs, k),
|
||||||
|
canonical_kmers(seq, k),
|
||||||
|
"unitig must contain exactly the inserted k-mers"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unitig_roundtrip_longer_sequence() {
|
||||||
|
// Longer non-repetitive sequence with no repeated k-mer of length k.
|
||||||
|
// ACGTGGCTATCGAC with k=5 → 10 distinct k-mers, one linear chain.
|
||||||
|
let k = 5;
|
||||||
|
let seq = b"ACGTGGCTATCGAC";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
let mut got = kmers_from_unitigs(&unitigs, k);
|
||||||
|
let mut expected = canonical_kmers(seq, k);
|
||||||
|
got.sort_unstable();
|
||||||
|
expected.sort_unstable();
|
||||||
|
assert_eq!(got, expected);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unitig_isolated_node() {
|
||||||
|
// Single k-mer with no neighbours
|
||||||
|
let k = 5;
|
||||||
|
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
g.push(kmer);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
assert_eq!(unitigs.len(), 1);
|
||||||
|
assert_eq!(unitigs[0].seql(), k);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unitig_two_isolated_nodes() {
|
||||||
|
let k = 5;
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
// Two k-mers that share no (k-1)-overlap
|
||||||
|
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap());
|
||||||
|
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap()); // same canonical as AAAAA — dedup
|
||||||
|
// They collapse to one canonical node
|
||||||
|
assert_eq!(g.len(), 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn unitig_two_truly_distinct_isolated_nodes() {
|
||||||
|
let k = 5;
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap());
|
||||||
|
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap());
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
// Each isolated node → one unitig of length k
|
||||||
|
assert_eq!(unitigs.len(), 2);
|
||||||
|
assert!(unitigs.iter().all(|u| u.seql() == k));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── all k-mers covered, none duplicated ───────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn no_kmer_lost_or_duplicated() {
|
||||||
|
let k = 7;
|
||||||
|
let seq = b"ACGTACGTACGTTTTTACGTACGT";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
let got = kmers_from_unitigs(&unitigs, k);
|
||||||
|
let expected = canonical_kmers(seq, k);
|
||||||
|
assert_eq!(
|
||||||
|
got.len(),
|
||||||
|
expected.len(),
|
||||||
|
"kmer count mismatch: got {}, expected {}",
|
||||||
|
got.len(),
|
||||||
|
expected.len()
|
||||||
|
);
|
||||||
|
assert_eq!(got, expected, "kmer sets differ");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── cycle coverage ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cycle_kmers_not_lost() {
|
||||||
|
// ACGTACGT with k=5 forms a pure cycle: ACGTA→CGTAC→GTACG→TACGT→ACGTA.
|
||||||
|
// start_iter first pass yields nothing (all nodes internal); second pass
|
||||||
|
// picks up cycle entries. All 4 k-mers must appear in the unitigs.
|
||||||
|
let k = 5;
|
||||||
|
let seq = b"ACGTACGT";
|
||||||
|
let g = graph_from_ascii(seq, k);
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
let got = kmers_from_unitigs(&unitigs, k);
|
||||||
|
let expected = canonical_kmers(seq, k);
|
||||||
|
assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
|
||||||
|
assert_eq!(got, expected);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -32,7 +32,7 @@
|
|||||||
|
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
|
|
||||||
use obikseq::{kmer::Kmer, superkmer::SuperKmer};
|
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
|
||||||
use xxhash_rust::xxh64::xxh64;
|
use xxhash_rust::xxh64::xxh64;
|
||||||
|
|
||||||
// ── public API ────────────────────────────────────────────────────────────────
|
// ── public API ────────────────────────────────────────────────────────────────
|
||||||
@@ -118,6 +118,28 @@ pub fn write_count<W: Write>(
|
|||||||
out.write_all(b"\n")
|
out.write_all(b"\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Write one unitig in FASTA format.
|
||||||
|
///
|
||||||
|
/// Header annotation (JSON):
|
||||||
|
/// ```text
|
||||||
|
/// >HASH {"seq_length":<seql>,"kmer_size":<k>,"n_kmers":<seql-k+1>}
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits).
|
||||||
|
/// `n_kmers` is the number of distinct k-mers covered by this unitig.
|
||||||
|
pub fn write_unitig<W: Write>(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> {
|
||||||
|
let ascii = unitig.to_ascii();
|
||||||
|
let id = seq_id(&ascii);
|
||||||
|
let seql = unitig.seql();
|
||||||
|
let n_kmers = seql - k + 1;
|
||||||
|
writeln!(
|
||||||
|
out,
|
||||||
|
">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}",
|
||||||
|
)?;
|
||||||
|
out.write_all(&ascii)?;
|
||||||
|
out.write_all(b"\n")
|
||||||
|
}
|
||||||
|
|
||||||
// ── internal helpers ──────────────────────────────────────────────────────────
|
// ── internal helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
|
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
|
||||||
|
|||||||
@@ -188,6 +188,29 @@ impl Kmer {
|
|||||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
|
||||||
|
pub fn push_right(self, nuc: u8, k: usize) -> Self {
|
||||||
|
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||||
|
let shift = 64 - 2 * k;
|
||||||
|
Kmer(shifted | ((nuc as u64 & 3) << shift))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Slide the window one base to the left: drop the last nucleotide, prepend `nuc` at position 0.
|
||||||
|
pub fn push_left(self, nuc: u8, k: usize) -> Self {
|
||||||
|
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||||
|
Kmer(shifted | ((nuc as u64 & 3) << 62))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if `self` and `other` overlap by `k` - 1 bases.
|
||||||
|
///
|
||||||
|
/// The last K-1 nucleotides of `self` and the first K-1 nucleotides
|
||||||
|
/// of `other` must be equal.
|
||||||
|
pub fn is_overlapping(self, other: Self, k: usize) -> bool {
|
||||||
|
let left = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||||
|
let right = other.0 & (!0u64 << (64 - 2 * (k - 1)));
|
||||||
|
left == right
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -9,3 +9,4 @@ mod encoding;
|
|||||||
pub mod kmer;
|
pub mod kmer;
|
||||||
mod revcomp_lookup;
|
mod revcomp_lookup;
|
||||||
pub mod superkmer;
|
pub mod superkmer;
|
||||||
|
pub mod unitig;
|
||||||
|
|||||||
@@ -0,0 +1,421 @@
|
|||||||
|
//! Compact 2-bit DNA unitig with in-place reverse complement and canonical form.
|
||||||
|
//!
|
||||||
|
//! Same encoding as [`SuperKmer`](crate::superkmer::SuperKmer) — nucleotide 0
|
||||||
|
//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
|
||||||
|
//! length cap and without the scatter/count header payload.
|
||||||
|
|
||||||
|
use crate::encoding::{DEC4, encode_base};
|
||||||
|
use crate::kmer::{Kmer, KmerError};
|
||||||
|
use crate::revcomp_lookup::REVCOMP4;
|
||||||
|
use bitvec::prelude::*;
|
||||||
|
|
||||||
|
// ── Unitig ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Compact unitig: sequence length (usize) + byte-aligned 2-bit nucleotide sequence.
|
||||||
|
///
|
||||||
|
/// Encoding: A=00, C=01, G=10, T=11. Nucleotide 0 occupies bits 7–6 of `seq[0]`,
|
||||||
|
/// nucleotide i occupies bits `7 − 2*(i%4)` and `6 − 2*(i%4)` of `seq[i/4]`.
|
||||||
|
/// Padding bits in the last byte are always 0.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct Unitig {
|
||||||
|
seql: usize,
|
||||||
|
seq: Box<[u8]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for Unitig {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.seql == other.seql && self.seq == other.seq
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Eq for Unitig {}
|
||||||
|
|
||||||
|
impl std::hash::Hash for Unitig {
|
||||||
|
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||||
|
self.seql.hash(state);
|
||||||
|
self.seq.hash(state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Unitig {
|
||||||
|
/// Create from a pre-packed 2-bit byte slice and explicit length.
|
||||||
|
/// `seq.len()` must equal `(seql + 3) / 4`.
|
||||||
|
pub fn new(seql: usize, seq: Box<[u8]>) -> Self {
|
||||||
|
debug_assert_eq!(seq.len(), byte_len(seql));
|
||||||
|
Self { seql, seq }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode a slice of 2-bit nucleotide values (0=A, 1=C, 2=G, 3=T, any length ≥ 1).
|
||||||
|
/// More efficient than `from_ascii` when nucleotides are already 2-bit encoded.
|
||||||
|
pub fn from_nucleotides(nucs: &[u8]) -> Self {
|
||||||
|
let seql = nucs.len();
|
||||||
|
debug_assert!(seql >= 1, "unitig length must be ≥ 1");
|
||||||
|
let n = byte_len(seql);
|
||||||
|
let mut seq = vec![0u8; n];
|
||||||
|
for (i, &nuc) in nucs.iter().enumerate() {
|
||||||
|
seq[i / 4] |= (nuc & 0b11) << (6 - 2 * (i % 4));
|
||||||
|
}
|
||||||
|
Self::new(seql, seq.into_boxed_slice())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode an ASCII nucleotide slice (ACGT, any length ≥ 1) into a new Unitig.
|
||||||
|
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||||
|
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||||
|
let seql = ascii.len();
|
||||||
|
debug_assert!(seql >= 1, "unitig length must be ≥ 1");
|
||||||
|
let n = byte_len(seql);
|
||||||
|
let mut seq = vec![0u8; n];
|
||||||
|
|
||||||
|
let full = seql / 4;
|
||||||
|
for i in 0..full {
|
||||||
|
seq[i] = encode_base(ascii[i * 4]) << 6
|
||||||
|
| encode_base(ascii[i * 4 + 1]) << 4
|
||||||
|
| encode_base(ascii[i * 4 + 2]) << 2
|
||||||
|
| encode_base(ascii[i * 4 + 3]);
|
||||||
|
}
|
||||||
|
let rem = seql % 4;
|
||||||
|
if rem > 0 {
|
||||||
|
let mut last = 0u8;
|
||||||
|
for j in 0..rem {
|
||||||
|
last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
|
||||||
|
}
|
||||||
|
seq[full] = last;
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::new(seql, seq.into_boxed_slice())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the sequence length in nucleotides.
|
||||||
|
pub fn seql(&self) -> usize {
|
||||||
|
self.seql
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a read-only view of the packed 2-bit sequence bytes.
|
||||||
|
/// Length is always `(seql() + 3) / 4`.
|
||||||
|
pub fn seq_bytes(&self) -> &[u8] {
|
||||||
|
&self.seq
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
|
||||||
|
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||||
|
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode into ASCII nucleotides, appending into `buf`.
|
||||||
|
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||||
|
let full = self.seql / 4;
|
||||||
|
for i in 0..full {
|
||||||
|
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||||
|
}
|
||||||
|
let rem = self.seql % 4;
|
||||||
|
if rem > 0 {
|
||||||
|
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||||
|
buf.extend_from_slice(&bytes[..rem]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode into a fresh ASCII `Vec<u8>`.
|
||||||
|
pub fn to_ascii(&self) -> Vec<u8> {
|
||||||
|
let mut buf = Vec::with_capacity(self.seql);
|
||||||
|
self.write_ascii(&mut buf);
|
||||||
|
buf
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Reverse-complement this unitig in place.
|
||||||
|
pub fn revcomp(&mut self) {
|
||||||
|
let n = byte_len(self.seql);
|
||||||
|
|
||||||
|
// Step 1: swap bytes outside-in, complementing each 4-base chunk via lookup.
|
||||||
|
{
|
||||||
|
let bytes = &mut self.seq[..n];
|
||||||
|
let (mut lo, mut hi) = (0, n - 1);
|
||||||
|
while lo < hi {
|
||||||
|
(bytes[lo], bytes[hi]) =
|
||||||
|
(REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
|
||||||
|
lo += 1;
|
||||||
|
hi -= 1;
|
||||||
|
}
|
||||||
|
if lo == hi {
|
||||||
|
bytes[lo] = REVCOMP4[bytes[lo] as usize];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: left-shift to flush the padding T's produced by complementing padding A's.
|
||||||
|
let shift = n * 8 - self.seql * 2;
|
||||||
|
if shift > 0 {
|
||||||
|
let bits = self.seq[..n].view_bits_mut::<Msb0>();
|
||||||
|
bits.rotate_left(shift);
|
||||||
|
let len = bits.len();
|
||||||
|
bits[len - shift..].fill(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if this unitig is in canonical form (lexicographic minimum
|
||||||
|
/// of forward and reverse complement).
|
||||||
|
pub fn is_canonical(&self) -> bool {
|
||||||
|
for i in 0..self.seql {
|
||||||
|
let fwd = self.nucleotide(i);
|
||||||
|
let rev = complement(self.nucleotide(self.seql - 1 - i));
|
||||||
|
if fwd < rev {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if fwd > rev {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Put this unitig in canonical form in place.
|
||||||
|
///
|
||||||
|
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
||||||
|
pub fn canonical(&mut self) -> bool {
|
||||||
|
if self.is_canonical() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
self.revcomp();
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the kmer of length `k` starting at nucleotide position `i` (0-based).
|
||||||
|
pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||||
|
if k == 0 || k > 32 {
|
||||||
|
return Err(KmerError::InvalidK { k });
|
||||||
|
}
|
||||||
|
if i + k > self.seql {
|
||||||
|
return Err(KmerError::OutOfBounds {
|
||||||
|
position: i,
|
||||||
|
k,
|
||||||
|
seql: self.seql,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
let bits = self.seq.view_bits::<Msb0>();
|
||||||
|
let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
|
||||||
|
Ok(Kmer::from_raw(raw << (64 - 2 * k)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract the canonical kmer of length `k` starting at position `i`.
|
||||||
|
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||||
|
Ok(self.kmer(i, k)?.canonical(k))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over all kmers of length `k` in order, yielding each as a [`Kmer`].
|
||||||
|
pub fn iter_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||||
|
UnitigKmerIter::new(self, k)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over all canonical kmers of length `k` in order.
|
||||||
|
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||||
|
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── UnitigKmerIter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
struct UnitigKmerIter<'a> {
|
||||||
|
unitig: &'a Unitig,
|
||||||
|
mask: u64,
|
||||||
|
lshift: usize,
|
||||||
|
current: u64,
|
||||||
|
pos: usize,
|
||||||
|
max_pos: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> UnitigKmerIter<'a> {
|
||||||
|
fn new(unitig: &'a Unitig, k: usize) -> Self {
|
||||||
|
let seql = unitig.seql();
|
||||||
|
let lshift = 64 - k * 2;
|
||||||
|
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||||
|
Self {
|
||||||
|
unitig,
|
||||||
|
mask,
|
||||||
|
lshift,
|
||||||
|
current: if seql >= k { unitig.kmer(0, k).unwrap().raw() } else { 0 },
|
||||||
|
pos: k,
|
||||||
|
max_pos: seql,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for UnitigKmerIter<'a> {
|
||||||
|
type Item = Kmer;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if self.pos > self.max_pos {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let result = Kmer::from_raw(self.current);
|
||||||
|
if self.pos < self.max_pos {
|
||||||
|
let byte_pos = self.pos / 4;
|
||||||
|
// nucleotide at position p within its byte occupies bits 7−2*(p%4) and 6−2*(p%4)
|
||||||
|
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||||
|
let nuc = (((self.unitig.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift;
|
||||||
|
self.current = ((self.current << 2) & self.mask) | nuc;
|
||||||
|
}
|
||||||
|
self.pos += 1;
|
||||||
|
Some(result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
fn complement(base: u8) -> u8 {
|
||||||
|
!base & 0b11
|
||||||
|
}
|
||||||
|
|
||||||
|
fn byte_len(seql: usize) -> usize {
|
||||||
|
(seql + 3) / 4
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn make_seq(len: usize) -> Vec<u8> {
|
||||||
|
(0..len).map(|i| b"ACGT"[i % 4]).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||||
|
seq.iter()
|
||||||
|
.rev()
|
||||||
|
.map(|&b| match b {
|
||||||
|
b'A' => b'T',
|
||||||
|
b'T' => b'A',
|
||||||
|
b'C' => b'G',
|
||||||
|
b'G' => b'C',
|
||||||
|
_ => b'A',
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_lengths() -> impl Iterator<Item = usize> {
|
||||||
|
(1..=9).chain([255, 256, 257, 1000, 10_000])
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── from_ascii / to_ascii ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn ascii_roundtrip_all_lengths() {
|
||||||
|
for len in test_lengths() {
|
||||||
|
let ascii = make_seq(len);
|
||||||
|
let u = Unitig::from_ascii(&ascii);
|
||||||
|
assert_eq!(u.to_ascii(), ascii, "roundtrip failed for len={len}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── seql ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn seql_roundtrip() {
|
||||||
|
for len in test_lengths() {
|
||||||
|
let u = Unitig::from_ascii(&make_seq(len));
|
||||||
|
assert_eq!(u.seql(), len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── revcomp ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn revcomp_known_values() {
|
||||||
|
let cases = [
|
||||||
|
("A", "T"),
|
||||||
|
("AC", "GT"),
|
||||||
|
("ACG", "CGT"),
|
||||||
|
("ACGT", "ACGT"),
|
||||||
|
("ACGTA", "TACGT"),
|
||||||
|
];
|
||||||
|
for (seq, expected) in cases {
|
||||||
|
let mut u = Unitig::from_ascii(seq.as_bytes());
|
||||||
|
u.revcomp();
|
||||||
|
assert_eq!(u.to_ascii(), expected.as_bytes(), "revcomp wrong for \"{seq}\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn revcomp_vs_reference_all_lengths() {
|
||||||
|
for len in test_lengths() {
|
||||||
|
let ascii = make_seq(len);
|
||||||
|
let expected = ascii_revcomp(&ascii);
|
||||||
|
let mut u = Unitig::from_ascii(&ascii);
|
||||||
|
u.revcomp();
|
||||||
|
assert_eq!(u.to_ascii(), expected, "revcomp wrong for len={len}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn revcomp_involution_all_lengths() {
|
||||||
|
for len in test_lengths() {
|
||||||
|
let ascii = make_seq(len);
|
||||||
|
let mut u = Unitig::from_ascii(&ascii);
|
||||||
|
u.revcomp();
|
||||||
|
u.revcomp();
|
||||||
|
assert_eq!(u.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── canonical ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn canonical_palindrome_unchanged() {
|
||||||
|
let mut u = Unitig::from_ascii(b"ACGT");
|
||||||
|
u.canonical();
|
||||||
|
assert_eq!(u.to_ascii(), b"ACGT");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn canonical_chooses_revcomp() {
|
||||||
|
let mut u = Unitig::from_ascii(b"TTTT");
|
||||||
|
u.canonical();
|
||||||
|
assert_eq!(u.to_ascii(), b"AAAA");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn canonical_is_minimal_all_lengths() {
|
||||||
|
for len in test_lengths() {
|
||||||
|
let ascii = make_seq(len);
|
||||||
|
let mut u = Unitig::from_ascii(&ascii);
|
||||||
|
u.canonical();
|
||||||
|
let fwd = u.to_ascii();
|
||||||
|
let rev = ascii_revcomp(&fwd);
|
||||||
|
assert!(fwd <= rev, "canonical not minimal for len={len}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn kmer_all_positions() {
|
||||||
|
let ascii = b"ACGTACGTACGT";
|
||||||
|
let k = 4;
|
||||||
|
let u = Unitig::from_ascii(ascii);
|
||||||
|
for i in 0..=ascii.len() - k {
|
||||||
|
let kmer = u.kmer(i, k).unwrap();
|
||||||
|
let expected = Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
|
||||||
|
assert_eq!(kmer, expected, "mismatch at position {i}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_kmers_matches_kmer_at_each_position() {
|
||||||
|
let ascii = make_seq(20);
|
||||||
|
let k = 7;
|
||||||
|
let u = Unitig::from_ascii(&ascii);
|
||||||
|
let kmers: Vec<Kmer> = u.iter_kmers(k).collect();
|
||||||
|
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||||
|
for (i, &km) in kmers.iter().enumerate() {
|
||||||
|
assert_eq!(km, u.kmer(i, k).unwrap(), "pos={i}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_kmers_long_unitig() {
|
||||||
|
let ascii = make_seq(10_000);
|
||||||
|
let k = 11;
|
||||||
|
let u = Unitig::from_ascii(&ascii);
|
||||||
|
assert_eq!(u.iter_kmers(k).count(), 10_000 - k + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user