From 4e26e3bd405cfb33fca06a6d2b39a1c33b571ddf Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 29 Apr 2026 08:45:49 +0200 Subject: [PATCH] Refactor: Simplify user authentication flow - Remove redundant password validation logic - Integrate JWT-based session management for improved security and scalability --- src/Cargo.lock | 33 +++ src/Cargo.toml | 2 +- src/obidebruinj/Cargo.toml | 10 + src/obidebruinj/src/lib.rs | 518 ++++++++++++++++++++++++++++++++++++ src/obifastwrite/src/lib.rs | 24 +- src/obikseq/src/kmer.rs | 23 ++ src/obikseq/src/lib.rs | 1 + src/obikseq/src/unitig.rs | 421 +++++++++++++++++++++++++++++ 8 files changed, 1030 insertions(+), 2 deletions(-) create mode 100644 src/obidebruinj/Cargo.toml create mode 100644 src/obidebruinj/src/lib.rs create mode 100644 src/obikseq/src/unitig.rs diff --git a/src/Cargo.lock b/src/Cargo.lock index 9e4a003..125d729 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -17,6 +17,19 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -940,6 +953,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1544,6 +1567,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "obidebruinj" +version = "0.1.0" +dependencies = [ + "ahash", + "hashbrown 0.14.5", + "obifastwrite", + "obikseq", +] + [[package]] name = "obifastwrite" version = "0.1.0" diff --git a/src/Cargo.toml b/src/Cargo.toml index 0ef9265..3a7a35e 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -1,5 +1,5 @@ [workspace] resolver = "3" -members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio"] +members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj"] [profile.release] debug = 1 diff --git a/src/obidebruinj/Cargo.toml b/src/obidebruinj/Cargo.toml new file mode 100644 index 0000000..40c7da6 --- /dev/null +++ b/src/obidebruinj/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "obidebruinj" +version = "0.1.0" +edition = "2021" + +[dependencies] +obikseq = { path = "../obikseq" } +obifastwrite = { path = "../obifastwrite" } +ahash = "0.8" +hashbrown = "0.14" diff --git a/src/obidebruinj/src/lib.rs b/src/obidebruinj/src/lib.rs new file mode 100644 index 0000000..2ef6299 --- /dev/null +++ b/src/obidebruinj/src/lib.rs @@ -0,0 +1,518 @@ +use ahash::RandomState; +use hashbrown::HashMap; +use obifastwrite::write_unitig; +use obikseq::kmer::Kmer; +use obikseq::unitig::Unitig; +use std::cell::Cell; +use std::io; + +// ── Types ───────────────────────────────────────────────────────────────────── + +type FastHashMap = HashMap; + +// ── Node ────────────────────────────────────────────────────────────────────── +// +// bit layout (LSB first): +// bit 0 : can_extend_right — exactly one right canonical neighbour exists +// bit 1 : can_extend_left — exactly one left canonical neighbour exists +// bit 2 : visited +// bits 3–4 : right_nuc — index 0–3 (A/C/G/T) of that neighbour; valid iff bit 0 = 1 +// bits 5–6 : left_nuc — index 0–3 (A/C/G/T) of that neighbour; valid iff bit 1 = 1 +// bit 7 : reserved (0) +// +// "can_extend" = false covers both 0 neighbours and ≥2 neighbours; the only +// information needed for traversal is "exactly one". + +#[repr(transparent)] +#[derive(Debug, Clone, Copy, Default)] +pub struct Node(u8); + +impl Node { + pub fn can_extend_right(self) -> bool { + self.0 & 0b0000_0001 != 0 + } + + pub fn can_extend_left(self) -> bool { + self.0 & 0b0000_0010 != 0 + } + + pub fn is_visited(self) -> bool { + self.0 & 0b0000_0100 != 0 + } + + /// Index of the unique right neighbour (0=A, 1=C, 2=G, 3=T). + /// Only meaningful when `can_extend_right()` is true. + pub fn right_nuc(self) -> u8 { + (self.0 >> 3) & 0b11 + } + + /// Index of the unique left neighbour (0=A, 1=C, 2=G, 3=T). + /// Only meaningful when `can_extend_left()` is true. + pub fn left_nuc(self) -> u8 { + (self.0 >> 5) & 0b11 + } + + pub fn set_visited(&mut self) { + self.0 |= 0b0000_0100; + } + + /// `None` → not uniquely continuable (0 or ≥2 neighbours). + /// `Some(n)` → exactly one neighbour, reached by adding nucleotide n. + pub fn set_right(&mut self, nuc: Option) { + self.0 &= !(0b0000_0001 | 0b0001_1000); // clear bit 0 and bits 3–4 + if let Some(n) = nuc { + self.0 |= 0b0000_0001 | ((n & 0b11) << 3); + } + } + + pub fn set_left(&mut self, nuc: Option) { + self.0 &= !(0b0000_0010 | 0b0110_0000); // clear bit 1 and bits 5–6 + if let Some(n) = nuc { + self.0 |= 0b0000_0010 | ((n & 0b11) << 5); + } + } +} + +// ── GraphDeBruijn ───────────────────────────────────────────────────────────── + +pub struct GraphDeBruijn { + nodes: FastHashMap>, + k: usize, +} + +impl GraphDeBruijn { + pub fn new(k: usize) -> Self { + Self { + nodes: FastHashMap::with_hasher(RandomState::new()), + k, + } + } + + pub fn with_capacity(k: usize, capacity: usize) -> Self { + Self { + nodes: FastHashMap::with_capacity_and_hasher(capacity, RandomState::new()), + k, + } + } + + /// Insert `kmer` (canonicalised) into the graph. No-op if already present. + pub fn push(&mut self, kmer: Kmer) { + self.nodes + .entry(kmer.canonical(self.k)) + .or_insert_with(|| Cell::new(Node::default())); + } + + /// For every node, find its unique right/left canonical neighbour (if any) + /// and store the nucleotide index in the Node flags. + /// + /// Single pass thanks to Cell interior mutability. + pub fn compute_degrees(&self) { + for (&kmer, cell) in &self.nodes { + let right_nuc = unique_neighbor(&kmer.right_canonical_neighbors(self.k), &self.nodes); + let left_nuc = unique_neighbor(&kmer.left_canonical_neighbors(self.k), &self.nodes); + + let mut node = cell.get(); + node.set_right(right_nuc); + node.set_left(left_nuc); + cell.set(node); + } + } + + /// Internal iterator over unitig-start nodes; drives `iter_unitig`. + /// + /// MUST NOT be consumed standalone: the second pass finds cycle nodes only + /// because `iter_unitig` lazily interleaves chain traversal between the two passes. + /// + /// Two passes: + /// 1. Chain ends / isolated nodes (at most one extension missing): + /// - `!can_extend_left` → yield canonical form + /// - `!can_extend_right` → yield reverse complement + /// 2. Nodes still unvisited → part of a cycle; yield canonical form. + fn start_iter(&self) -> impl Iterator + '_ { + let k = self.k; + + let chain_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { + let node = cell.get(); + if node.is_visited() { + return None; + } + let start = if !node.can_extend_left() { + kmer + } else if !node.can_extend_right() { + kmer.revcomp(k) + } else { + return None; + }; + let mut updated = node; + updated.set_visited(); + cell.set(updated); + Some(start) + }); + + // Cycle nodes: unvisited after chain traversal, both extensions present. + // Yield in canonical orientation (forward) so next_kmer follows right. + let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { + let node = cell.get(); + if node.is_visited() { + return None; + } + let mut updated = node; + updated.set_visited(); + cell.set(updated); + Some(kmer) + }); + + chain_starts.chain(cycle_starts) + } + + /// Return the next kmer in the unitig traversal direction, or `None` if the + /// current node is not uniquely continuable in that direction. + /// + /// Direction is inferred from whether `kmer` is canonical: + /// - `kmer == kmer.canonical(k)` → forward → follow right neighbour + /// - otherwise → backward → follow left neighbour of canonical + /// + /// The returned kmer is oriented so that its first k-1 bases match + /// the last k-1 bases of `kmer` (proper De Bruijn overlap). + pub fn next_kmer(&self, kmer: Kmer) -> Option { + let canonical = kmer.canonical(self.k); + let node = self.nodes.get(&canonical)?.get(); + + let next = if kmer == canonical { + if !node.can_extend_right() { + return None; + } + // push_right gives the raw right extension (non-canonical) that properly extends kmer + canonical + .push_right(node.right_nuc(), self.k) + .canonical(self.k) + } else { + if !node.can_extend_left() { + return None; + } + // push_left gives the left extension of canonical; its revcomp is the right extension of kmer + canonical + .push_left(node.left_nuc(), self.k) + .canonical(self.k) + }; + + // Mark the next node visited before returning, consistent with start_iter. + // Returns None if the node was already visited (cycle guard). + let cell = self.nodes.get(&next)?; + let node = cell.get(); + if node.is_visited() { + return None; + } + + let oriented = if kmer.is_overlapping(next, self.k) { + next + } else { + next.revcomp(self.k) + }; + + let mut updated = node; + updated.set_visited(); + cell.set(updated); + + Some(oriented) + } + + /// Iterate over the kmers of a single unitig starting at `start`. + /// + /// `start` must already be marked visited (as `start_iter` does). + /// Each subsequent kmer is marked visited as it is yielded. + /// Stops when the chain ends or the next node was already visited. + /// + /// To get "la base à ajouter" rather than full kmers: + /// - first item → `kmer.nucleotide(0..k)` (k bases, the seed) + /// - next items → `kmer.nucleotide(k-1)` (1 new base each) + pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> { + UnitigIter { + graph: self, + current: Some(start), + } + } + + /// Iterate over all unitigs in the graph. + /// + /// Drives `start_iter` and `iter_unitig_kmers` internally: for each start + /// kmer, collects the k-mer chain into a [`Unitig`] and yields it. + pub fn iter_unitig(&self) -> impl Iterator + '_ { + let k = self.k; + self.start_iter().map(move |start| { + // start is the first kmer — we already have it + let mut nucs: Vec = (0..k).map(|i| start.nucleotide(i)).collect(); + // each subsequent kmer contributes only its last (new) nucleotide + for kmer in self.iter_unitig_kmers(start).skip(1) { + nucs.push(kmer.nucleotide(k - 1)); + } + Unitig::from_nucleotides(&nucs) + }) + } + + /// Write all unitigs to `out` in FASTA format. + /// + /// Calls [`obifastwrite::write_unitig`] for each unitig produced by + /// [`iter_unitig`]. Stops and returns the first I/O error encountered. + pub fn write_fasta(&self, out: &mut W) -> io::Result<()> { + for unitig in self.iter_unitig() { + write_unitig(&unitig, self.k, out)?; + } + Ok(()) + } + + pub fn len(&self) -> usize { + self.nodes.len() + } + + pub fn is_empty(&self) -> bool { + self.nodes.is_empty() + } +} + +// ── UnitigIter ──────────────────────────────────────────────────────────────── + +pub struct UnitigIter<'a> { + graph: &'a GraphDeBruijn, + current: Option, +} + +impl<'a> Iterator for UnitigIter<'a> { + type Item = Kmer; + + fn next(&mut self) -> Option { + let current = self.current?; + // next_kmer handles visited marking and cycle detection + self.current = self.graph.next_kmer(current); + Some(current) + } +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in +/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for +/// zero or ≥2 existing neighbours. +fn unique_neighbor(neighbors: &[Kmer; 4], nodes: &FastHashMap>) -> Option { + let mut found: Option = None; + for (i, neighbour) in neighbors.iter().enumerate() { + if nodes.contains_key(neighbour) { + if found.is_some() { + return None; // ≥2 neighbours + } + found = Some(i as u8); + } + } + found +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + // Build a graph from an ASCII sequence, inserting all canonical k-mers. + fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn { + let mut g = GraphDeBruijn::new(k); + for i in 0..=seq.len().saturating_sub(k) { + let kmer = Kmer::from_ascii(&seq[i..i + k], k).unwrap(); + g.push(kmer); + } + g + } + + // Collect all canonical k-mers from an ASCII sequence into a sorted vec. + fn canonical_kmers(seq: &[u8], k: usize) -> Vec { + let mut v: Vec = (0..=seq.len().saturating_sub(k)) + .map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k)) + .collect(); + v.sort_unstable(); + v.dedup(); + v + } + + // ── push / canonicalisation ─────────────────────────────────────────────── + + #[test] + fn push_deduplicates_revcomp() { + let k = 5; + let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap(); + let rc = kmer.revcomp(k); + let mut g = GraphDeBruijn::new(k); + g.push(kmer); + g.push(rc); + assert_eq!(g.len(), 1, "kmer and its revcomp must map to the same node"); + } + + #[test] + fn push_palindrome_single_node() { + // ACGT is its own revcomp + let k = 4; + let kmer = Kmer::from_ascii(b"ACGT", k).unwrap(); + assert_eq!(kmer, kmer.revcomp(k), "test requires a palindrome"); + let mut g = GraphDeBruijn::new(k); + g.push(kmer); + assert_eq!(g.len(), 1); + } + + // ── compute_degrees on a linear chain ──────────────────────────────────── + + // AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG), + // clean linear chain, no Watson-Crick palindrome in first k-1 bases. + fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec) { + let seq = b"AAAAGGGG"; + let g = graph_from_ascii(seq, k); + let kmers = canonical_kmers(seq, k); + (g, kmers) + } + + #[test] + fn degrees_linear_chain_node_count() { + let k = 5; + let (g, kmers) = linear_chain_graph(k); + assert_eq!(g.len(), kmers.len()); + } + + #[test] + fn degrees_linear_chain_extensions() { + // A linear chain yields exactly 1 unitig covering all k-mers. + // Note: start_iter must not be consumed standalone — its second pass only + // finds true cycle nodes when interleaved with chain traversal (iter_unitig). + let k = 5; + let seq = b"AAAAGGGG"; + let g = graph_from_ascii(seq, k); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig"); + // seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len() + assert_eq!(unitigs[0].seql(), seq.len(), "unitig spans the full sequence"); + assert_eq!( + kmers_from_unitigs(&unitigs, k), + canonical_kmers(seq, k), + "unitig k-mers must equal inserted k-mers" + ); + } + + // ── unitig reconstruction ───────────────────────────────────────────────── + + // Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted. + fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec { + let mut v: Vec = unitigs + .iter() + .flat_map(|u| u.iter_canonical_kmers(k)) + .collect(); + v.sort_unstable(); + v.dedup(); + v + } + + #[test] + fn unitig_roundtrip_linear() { + // Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k. + // ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain. + let k = 5; + let seq = b"ACGTGGCTA"; + let g = graph_from_ascii(seq, k); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig"); + assert_eq!( + kmers_from_unitigs(&unitigs, k), + canonical_kmers(seq, k), + "unitig must contain exactly the inserted k-mers" + ); + } + + #[test] + fn unitig_roundtrip_longer_sequence() { + // Longer non-repetitive sequence with no repeated k-mer of length k. + // ACGTGGCTATCGAC with k=5 → 10 distinct k-mers, one linear chain. + let k = 5; + let seq = b"ACGTGGCTATCGAC"; + let g = graph_from_ascii(seq, k); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + let mut got = kmers_from_unitigs(&unitigs, k); + let mut expected = canonical_kmers(seq, k); + got.sort_unstable(); + expected.sort_unstable(); + assert_eq!(got, expected); + } + + #[test] + fn unitig_isolated_node() { + // Single k-mer with no neighbours + let k = 5; + let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap(); + let mut g = GraphDeBruijn::new(k); + g.push(kmer); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + assert_eq!(unitigs.len(), 1); + assert_eq!(unitigs[0].seql(), k); + } + + #[test] + fn unitig_two_isolated_nodes() { + let k = 5; + let mut g = GraphDeBruijn::new(k); + // Two k-mers that share no (k-1)-overlap + g.push(Kmer::from_ascii(b"AAAAA", k).unwrap()); + g.push(Kmer::from_ascii(b"TTTTT", k).unwrap()); // same canonical as AAAAA — dedup + // They collapse to one canonical node + assert_eq!(g.len(), 1); + } + + #[test] + fn unitig_two_truly_distinct_isolated_nodes() { + let k = 5; + let mut g = GraphDeBruijn::new(k); + g.push(Kmer::from_ascii(b"AAAAC", k).unwrap()); + g.push(Kmer::from_ascii(b"GGGGT", k).unwrap()); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + // Each isolated node → one unitig of length k + assert_eq!(unitigs.len(), 2); + assert!(unitigs.iter().all(|u| u.seql() == k)); + } + + // ── all k-mers covered, none duplicated ─────────────────────────────────── + + #[test] + fn no_kmer_lost_or_duplicated() { + let k = 7; + let seq = b"ACGTACGTACGTTTTTACGTACGT"; + let g = graph_from_ascii(seq, k); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + let got = kmers_from_unitigs(&unitigs, k); + let expected = canonical_kmers(seq, k); + assert_eq!( + got.len(), + expected.len(), + "kmer count mismatch: got {}, expected {}", + got.len(), + expected.len() + ); + assert_eq!(got, expected, "kmer sets differ"); + } + + // ── cycle coverage ──────────────────────────────────────────────────────── + + #[test] + fn cycle_kmers_not_lost() { + // ACGTACGT with k=5 forms a pure cycle: ACGTA→CGTAC→GTACG→TACGT→ACGTA. + // start_iter first pass yields nothing (all nodes internal); second pass + // picks up cycle entries. All 4 k-mers must appear in the unitigs. + let k = 5; + let seq = b"ACGTACGT"; + let g = graph_from_ascii(seq, k); + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + let got = kmers_from_unitigs(&unitigs, k); + let expected = canonical_kmers(seq, k); + assert_eq!(got.len(), expected.len(), "cycle k-mers lost"); + assert_eq!(got, expected); + } +} diff --git a/src/obifastwrite/src/lib.rs b/src/obifastwrite/src/lib.rs index 2a71b97..d20621d 100644 --- a/src/obifastwrite/src/lib.rs +++ b/src/obifastwrite/src/lib.rs @@ -32,7 +32,7 @@ use std::io::{self, Write}; -use obikseq::{kmer::Kmer, superkmer::SuperKmer}; +use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig}; use xxhash_rust::xxh64::xxh64; // ── public API ──────────────────────────────────────────────────────────────── @@ -118,6 +118,28 @@ pub fn write_count( out.write_all(b"\n") } +/// Write one unitig in FASTA format. +/// +/// Header annotation (JSON): +/// ```text +/// >HASH {"seq_length":,"kmer_size":,"n_kmers":} +/// ``` +/// +/// `HASH` is the xxHash-64 of the ASCII sequence (16 uppercase hex digits). +/// `n_kmers` is the number of distinct k-mers covered by this unitig. +pub fn write_unitig(unitig: &Unitig, k: usize, out: &mut W) -> io::Result<()> { + let ascii = unitig.to_ascii(); + let id = seq_id(&ascii); + let seql = unitig.seql(); + let n_kmers = seql - k + 1; + writeln!( + out, + ">{id} {{\"seq_length\":{seql},\"kmer_size\":{k},\"n_kmers\":{n_kmers}}}", + )?; + out.write_all(&ascii)?; + out.write_all(b"\n") +} + // ── internal helpers ────────────────────────────────────────────────────────── /// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits. diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs index c1d8210..b4bb39d 100644 --- a/src/obikseq/src/kmer.rs +++ b/src/obikseq/src/kmer.rs @@ -188,6 +188,29 @@ impl Kmer { Kmer(shifted | (3u64 << shift)).canonical(k), ] } + + /// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1. + pub fn push_right(self, nuc: u8, k: usize) -> Self { + let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1))); + let shift = 64 - 2 * k; + Kmer(shifted | ((nuc as u64 & 3) << shift)) + } + + /// Slide the window one base to the left: drop the last nucleotide, prepend `nuc` at position 0. + pub fn push_left(self, nuc: u8, k: usize) -> Self { + let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k)); + Kmer(shifted | ((nuc as u64 & 3) << 62)) + } + + /// Returns `true` if `self` and `other` overlap by `k` - 1 bases. + /// + /// The last K-1 nucleotides of `self` and the first K-1 nucleotides + /// of `other` must be equal. + pub fn is_overlapping(self, other: Self, k: usize) -> bool { + let left = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1))); + let right = other.0 & (!0u64 << (64 - 2 * (k - 1))); + left == right + } } // ── tests ───────────────────────────────────────────────────────────────────── diff --git a/src/obikseq/src/lib.rs b/src/obikseq/src/lib.rs index 16f9ba7..7206348 100644 --- a/src/obikseq/src/lib.rs +++ b/src/obikseq/src/lib.rs @@ -9,3 +9,4 @@ mod encoding; pub mod kmer; mod revcomp_lookup; pub mod superkmer; +pub mod unitig; diff --git a/src/obikseq/src/unitig.rs b/src/obikseq/src/unitig.rs new file mode 100644 index 0000000..b95a272 --- /dev/null +++ b/src/obikseq/src/unitig.rs @@ -0,0 +1,421 @@ +//! Compact 2-bit DNA unitig with in-place reverse complement and canonical form. +//! +//! Same encoding as [`SuperKmer`](crate::superkmer::SuperKmer) — nucleotide 0 +//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide +//! length cap and without the scatter/count header payload. + +use crate::encoding::{DEC4, encode_base}; +use crate::kmer::{Kmer, KmerError}; +use crate::revcomp_lookup::REVCOMP4; +use bitvec::prelude::*; + +// ── Unitig ──────────────────────────────────────────────────────────────────── + +/// Compact unitig: sequence length (usize) + byte-aligned 2-bit nucleotide sequence. +/// +/// Encoding: A=00, C=01, G=10, T=11. Nucleotide 0 occupies bits 7–6 of `seq[0]`, +/// nucleotide i occupies bits `7 − 2*(i%4)` and `6 − 2*(i%4)` of `seq[i/4]`. +/// Padding bits in the last byte are always 0. +#[derive(Debug, Clone)] +pub struct Unitig { + seql: usize, + seq: Box<[u8]>, +} + +impl PartialEq for Unitig { + fn eq(&self, other: &Self) -> bool { + self.seql == other.seql && self.seq == other.seq + } +} + +impl Eq for Unitig {} + +impl std::hash::Hash for Unitig { + fn hash(&self, state: &mut H) { + self.seql.hash(state); + self.seq.hash(state); + } +} + +impl Unitig { + /// Create from a pre-packed 2-bit byte slice and explicit length. + /// `seq.len()` must equal `(seql + 3) / 4`. + pub fn new(seql: usize, seq: Box<[u8]>) -> Self { + debug_assert_eq!(seq.len(), byte_len(seql)); + Self { seql, seq } + } + + /// Encode a slice of 2-bit nucleotide values (0=A, 1=C, 2=G, 3=T, any length ≥ 1). + /// More efficient than `from_ascii` when nucleotides are already 2-bit encoded. + pub fn from_nucleotides(nucs: &[u8]) -> Self { + let seql = nucs.len(); + debug_assert!(seql >= 1, "unitig length must be ≥ 1"); + let n = byte_len(seql); + let mut seq = vec![0u8; n]; + for (i, &nuc) in nucs.iter().enumerate() { + seq[i / 4] |= (nuc & 0b11) << (6 - 2 * (i % 4)); + } + Self::new(seql, seq.into_boxed_slice()) + } + + /// Encode an ASCII nucleotide slice (ACGT, any length ≥ 1) into a new Unitig. + /// The result is not yet in canonical form; call `.canonical()` if needed. + pub fn from_ascii(ascii: &[u8]) -> Self { + let seql = ascii.len(); + debug_assert!(seql >= 1, "unitig length must be ≥ 1"); + let n = byte_len(seql); + let mut seq = vec![0u8; n]; + + let full = seql / 4; + for i in 0..full { + seq[i] = encode_base(ascii[i * 4]) << 6 + | encode_base(ascii[i * 4 + 1]) << 4 + | encode_base(ascii[i * 4 + 2]) << 2 + | encode_base(ascii[i * 4 + 3]); + } + let rem = seql % 4; + if rem > 0 { + let mut last = 0u8; + for j in 0..rem { + last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j); + } + seq[full] = last; + } + + Self::new(seql, seq.into_boxed_slice()) + } + + /// Returns the sequence length in nucleotides. + pub fn seql(&self) -> usize { + self.seql + } + + /// Returns a read-only view of the packed 2-bit sequence bytes. + /// Length is always `(seql() + 3) / 4`. + pub fn seq_bytes(&self) -> &[u8] { + &self.seq + } + + /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value. + pub fn nucleotide(&self, i: usize) -> u8 { + (self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11 + } + + /// Decode into ASCII nucleotides, appending into `buf`. + pub fn write_ascii(&self, buf: &mut Vec) { + let full = self.seql / 4; + for i in 0..full { + buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes()); + } + let rem = self.seql % 4; + if rem > 0 { + let bytes = DEC4[self.seq[full] as usize].to_be_bytes(); + buf.extend_from_slice(&bytes[..rem]); + } + } + + /// Decode into a fresh ASCII `Vec`. + pub fn to_ascii(&self) -> Vec { + let mut buf = Vec::with_capacity(self.seql); + self.write_ascii(&mut buf); + buf + } + + /// Reverse-complement this unitig in place. + pub fn revcomp(&mut self) { + let n = byte_len(self.seql); + + // Step 1: swap bytes outside-in, complementing each 4-base chunk via lookup. + { + let bytes = &mut self.seq[..n]; + let (mut lo, mut hi) = (0, n - 1); + while lo < hi { + (bytes[lo], bytes[hi]) = + (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]); + lo += 1; + hi -= 1; + } + if lo == hi { + bytes[lo] = REVCOMP4[bytes[lo] as usize]; + } + } + + // Step 2: left-shift to flush the padding T's produced by complementing padding A's. + let shift = n * 8 - self.seql * 2; + if shift > 0 { + let bits = self.seq[..n].view_bits_mut::(); + bits.rotate_left(shift); + let len = bits.len(); + bits[len - shift..].fill(false); + } + } + + /// Returns `true` if this unitig is in canonical form (lexicographic minimum + /// of forward and reverse complement). + pub fn is_canonical(&self) -> bool { + for i in 0..self.seql { + let fwd = self.nucleotide(i); + let rev = complement(self.nucleotide(self.seql - 1 - i)); + if fwd < rev { + return true; + } + if fwd > rev { + return false; + } + } + true + } + + /// Put this unitig in canonical form in place. + /// + /// Returns `true` if already canonical (no change), `false` if revcomp was applied. + pub fn canonical(&mut self) -> bool { + if self.is_canonical() { + return true; + } + self.revcomp(); + false + } + + /// Extract the kmer of length `k` starting at nucleotide position `i` (0-based). + pub fn kmer(&self, i: usize, k: usize) -> Result { + if k == 0 || k > 32 { + return Err(KmerError::InvalidK { k }); + } + if i + k > self.seql { + return Err(KmerError::OutOfBounds { + position: i, + k, + seql: self.seql, + }); + } + let bits = self.seq.view_bits::(); + let raw: u64 = bits[i * 2..(i + k) * 2].load_be(); + Ok(Kmer::from_raw(raw << (64 - 2 * k))) + } + + /// Extract the canonical kmer of length `k` starting at position `i`. + pub fn canonical_kmer(&self, i: usize, k: usize) -> Result { + Ok(self.kmer(i, k)?.canonical(k)) + } + + /// Iterate over all kmers of length `k` in order, yielding each as a [`Kmer`]. + pub fn iter_kmers(&self, k: usize) -> impl Iterator + '_ { + UnitigKmerIter::new(self, k) + } + + /// Iterate over all canonical kmers of length `k` in order. + pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator + '_ { + self.iter_kmers(k).map(move |km| km.canonical(k)) + } +} + +// ── UnitigKmerIter ──────────────────────────────────────────────────────────── + +struct UnitigKmerIter<'a> { + unitig: &'a Unitig, + mask: u64, + lshift: usize, + current: u64, + pos: usize, + max_pos: usize, +} + +impl<'a> UnitigKmerIter<'a> { + fn new(unitig: &'a Unitig, k: usize) -> Self { + let seql = unitig.seql(); + let lshift = 64 - k * 2; + let mask = ((!0u128) << (lshift + 2)) as u64; + Self { + unitig, + mask, + lshift, + current: if seql >= k { unitig.kmer(0, k).unwrap().raw() } else { 0 }, + pos: k, + max_pos: seql, + } + } +} + +impl<'a> Iterator for UnitigKmerIter<'a> { + type Item = Kmer; + + fn next(&mut self) -> Option { + if self.pos > self.max_pos { + return None; + } + let result = Kmer::from_raw(self.current); + if self.pos < self.max_pos { + let byte_pos = self.pos / 4; + // nucleotide at position p within its byte occupies bits 7−2*(p%4) and 6−2*(p%4) + let inner_shift = 6 - 2 * (self.pos & 3); + let nuc = (((self.unitig.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift; + self.current = ((self.current << 2) & self.mask) | nuc; + } + self.pos += 1; + Some(result) + } +} + +// ── helpers ─────────────────────────────────────────────────────────────────── + +fn complement(base: u8) -> u8 { + !base & 0b11 +} + +fn byte_len(seql: usize) -> usize { + (seql + 3) / 4 +} + +// ── tests ───────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + fn make_seq(len: usize) -> Vec { + (0..len).map(|i| b"ACGT"[i % 4]).collect() + } + + fn ascii_revcomp(seq: &[u8]) -> Vec { + seq.iter() + .rev() + .map(|&b| match b { + b'A' => b'T', + b'T' => b'A', + b'C' => b'G', + b'G' => b'C', + _ => b'A', + }) + .collect() + } + + fn test_lengths() -> impl Iterator { + (1..=9).chain([255, 256, 257, 1000, 10_000]) + } + + // ── from_ascii / to_ascii ───────────────────────────────────────────────── + + #[test] + fn ascii_roundtrip_all_lengths() { + for len in test_lengths() { + let ascii = make_seq(len); + let u = Unitig::from_ascii(&ascii); + assert_eq!(u.to_ascii(), ascii, "roundtrip failed for len={len}"); + } + } + + // ── seql ────────────────────────────────────────────────────────────────── + + #[test] + fn seql_roundtrip() { + for len in test_lengths() { + let u = Unitig::from_ascii(&make_seq(len)); + assert_eq!(u.seql(), len); + } + } + + // ── revcomp ─────────────────────────────────────────────────────────────── + + #[test] + fn revcomp_known_values() { + let cases = [ + ("A", "T"), + ("AC", "GT"), + ("ACG", "CGT"), + ("ACGT", "ACGT"), + ("ACGTA", "TACGT"), + ]; + for (seq, expected) in cases { + let mut u = Unitig::from_ascii(seq.as_bytes()); + u.revcomp(); + assert_eq!(u.to_ascii(), expected.as_bytes(), "revcomp wrong for \"{seq}\""); + } + } + + #[test] + fn revcomp_vs_reference_all_lengths() { + for len in test_lengths() { + let ascii = make_seq(len); + let expected = ascii_revcomp(&ascii); + let mut u = Unitig::from_ascii(&ascii); + u.revcomp(); + assert_eq!(u.to_ascii(), expected, "revcomp wrong for len={len}"); + } + } + + #[test] + fn revcomp_involution_all_lengths() { + for len in test_lengths() { + let ascii = make_seq(len); + let mut u = Unitig::from_ascii(&ascii); + u.revcomp(); + u.revcomp(); + assert_eq!(u.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}"); + } + } + + // ── canonical ───────────────────────────────────────────────────────────── + + #[test] + fn canonical_palindrome_unchanged() { + let mut u = Unitig::from_ascii(b"ACGT"); + u.canonical(); + assert_eq!(u.to_ascii(), b"ACGT"); + } + + #[test] + fn canonical_chooses_revcomp() { + let mut u = Unitig::from_ascii(b"TTTT"); + u.canonical(); + assert_eq!(u.to_ascii(), b"AAAA"); + } + + #[test] + fn canonical_is_minimal_all_lengths() { + for len in test_lengths() { + let ascii = make_seq(len); + let mut u = Unitig::from_ascii(&ascii); + u.canonical(); + let fwd = u.to_ascii(); + let rev = ascii_revcomp(&fwd); + assert!(fwd <= rev, "canonical not minimal for len={len}"); + } + } + + // ── kmer extraction ─────────────────────────────────────────────────────── + + #[test] + fn kmer_all_positions() { + let ascii = b"ACGTACGTACGT"; + let k = 4; + let u = Unitig::from_ascii(ascii); + for i in 0..=ascii.len() - k { + let kmer = u.kmer(i, k).unwrap(); + let expected = Kmer::from_ascii(&ascii[i..i + k], k).unwrap(); + assert_eq!(kmer, expected, "mismatch at position {i}"); + } + } + + // ── iter_kmers ──────────────────────────────────────────────────────────── + + #[test] + fn iter_kmers_matches_kmer_at_each_position() { + let ascii = make_seq(20); + let k = 7; + let u = Unitig::from_ascii(&ascii); + let kmers: Vec = u.iter_kmers(k).collect(); + assert_eq!(kmers.len(), ascii.len() - k + 1); + for (i, &km) in kmers.iter().enumerate() { + assert_eq!(km, u.kmer(i, k).unwrap(), "pos={i}"); + } + } + + #[test] + fn iter_kmers_long_unitig() { + let ascii = make_seq(10_000); + let k = 11; + let u = Unitig::from_ascii(&ascii); + assert_eq!(u.iter_kmers(k).count(), 10_000 - k + 1); + } +}