From 86e9cb702639f4f36dac1fad28b41b34565c082d Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 1 May 2026 23:05:21 +0200 Subject: [PATCH] refactor: improve de Bruijn graph traversal and longtig generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Refactored Node representation using compact bitfields for neighbor counts and nucleotides; added count_neighbors helper to compute_degrees() - Introduced StartIter iterator for unitig/longtigu generation with revised traversal semantics (e.g., interior node marking) - Added nucleotide() accessor to Kmer type for 2-bit extraction at position i - Renamed unitig.rs → longtigs, updated CLI command and output filenames to reflect "long t ig" - Extended extract_kmers() in scripts/compare.py with duplication statistics ``` --- scripts/compare_kmers.py | 23 +- src/obidebruinj/src/lib.rs | 585 +++++++++++++++++++++++++++------ src/obikmer/src/cmd/longtig.rs | 141 ++++++++ src/obikmer/src/cmd/mod.rs | 1 + src/obikmer/src/cmd/unitig.rs | 13 +- src/obikmer/src/main.rs | 7 +- src/obikseq/src/kmer.rs | 6 + 7 files changed, 669 insertions(+), 107 deletions(-) create mode 100644 src/obikmer/src/cmd/longtig.rs diff --git a/scripts/compare_kmers.py b/scripts/compare_kmers.py index bf1d139..96126d6 100755 --- a/scripts/compare_kmers.py +++ b/scripts/compare_kmers.py @@ -51,16 +51,21 @@ def iter_sequences(path: str): yield header, "".join(parts) -def extract_kmers(path: str, k: int) -> set[str]: - """Return the set of canonical k-mers from all sequences in *path*.""" - kmers: set[str] = set() +def extract_kmers(path: str, k: int) -> tuple[set[str], int]: + """Return (set of canonical k-mers, count of duplicated k-mers) from *path*. + + A k-mer is duplicated if its canonical form appears more than once across + all sequences in the file. + """ + counts: dict[str, int] = {} for _, seq in iter_sequences(path): - # skip any character that is not ACGT for i in range(len(seq) - k + 1): kmer = seq[i : i + k] if all(c in "ACGT" for c in kmer): - kmers.add(canonical(kmer)) - return kmers + ck = canonical(kmer) + counts[ck] = counts.get(ck, 0) + 1 + duplicated = sum(1 for v in counts.values() if v > 1) + return set(counts), duplicated def main(): @@ -81,16 +86,18 @@ def main(): print() print("reading A …", file=sys.stderr) - set_a = extract_kmers(args.file_a, k) + set_a, dup_a = extract_kmers(args.file_a, k) print("reading B …", file=sys.stderr) - set_b = extract_kmers(args.file_b, k) + set_b, dup_b = extract_kmers(args.file_b, k) only_a = set_a - set_b only_b = set_b - set_a common = set_a & set_b print(f"{'kmers in A':<25} {len(set_a):>12,}") + print(f"{'duplicated in A':<25} {dup_a:>12,}") print(f"{'kmers in B':<25} {len(set_b):>12,}") + print(f"{'duplicated in B':<25} {dup_b:>12,}") print(f"{'common':<25} {len(common):>12,}") print(f"{'only in A (lost)':<25} {len(only_a):>12,}") print(f"{'only in B (gained)':<25} {len(only_b):>12,}") diff --git a/src/obidebruinj/src/lib.rs b/src/obidebruinj/src/lib.rs index a4300cb..58eed58 100644 --- a/src/obidebruinj/src/lib.rs +++ b/src/obidebruinj/src/lib.rs @@ -1,9 +1,10 @@ use ahash::RandomState; use hashbrown::HashMap; use obifastwrite::write_unitig; -use obikseq::kmer::{CanonicalKmer, Kmer}; +use obikseq::kmer::{self, CanonicalKmer, Kmer}; use obikseq::unitig::Unitig; use std::cell::Cell; +use std::fmt; use std::io; // ── Types ───────────────────────────────────────────────────────────────────── @@ -28,14 +29,21 @@ type FastHashMap = HashMap; pub struct Node(u8); impl Node { + /// Returns `true` if the node can be extended to the right. + /// + /// A single right neighbour exists. pub fn can_extend_right(self) -> bool { self.0 & 0b0000_0001 != 0 } + /// Returns `true` if the node can be extended to the left. + /// + /// A single left neighbour exists. pub fn can_extend_left(self) -> bool { self.0 & 0b0000_0010 != 0 } + /// Returns `true` if the node has been visited. pub fn is_visited(self) -> bool { self.0 & 0b0000_0100 != 0 } @@ -52,25 +60,81 @@ impl Node { (self.0 >> 5) & 0b11 } + /// Marks the node as visited. pub fn set_visited(&mut self) { + if self.is_visited() { + unreachable!("from: is_visited -> The node has already been visited") + } self.0 |= 0b0000_0100; } - /// `None` → not uniquely continuable (0 or ≥2 neighbours). - /// `Some(n)` → exactly one neighbour, reached by adding nucleotide n. - pub fn set_right(&mut self, nuc: Option) { - self.0 &= !(0b0000_0001 | 0b0001_1000); // clear bit 0 and bits 3–4 - if let Some(n) = nuc { - self.0 |= 0b0000_0001 | ((n & 0b11) << 3); + /// Number of left neighbours. + pub fn n_left_neighbours(self) -> u8 { + if self.can_extend_left() { + 1 + } else { + let v = (self.0 >> 5) & 0b11; + v + (v != 0) as u8 } } - pub fn set_left(&mut self, nuc: Option) { - self.0 &= !(0b0000_0010 | 0b0110_0000); // clear bit 1 and bits 5–6 - if let Some(n) = nuc { - self.0 |= 0b0000_0010 | ((n & 0b11) << 5); + /// Number of right neighbours. + pub fn n_right_neighbours(self) -> u8 { + if self.can_extend_right() { + 1 + } else { + let v = (self.0 >> 3) & 0b11; + v + (v != 0) as u8 } } + + /// `nuc` = Some(i) → exactly one neighbour (bit 0 set, bits 3–4 = nucleotide index). + /// `nuc` = None → 0 or ≥2 neighbours; `count` encoded in bits 3–4 as count.sat_sub(1). + pub fn set_right(&mut self, count: u8, nuc: Option) { + self.0 &= !(0b0000_0001 | 0b001_1000); + if count == 1 { + self.0 |= 0b0000_0001; + if let Some(n) = nuc { + self.0 |= (n & 0b11) << 3; + return; + } + unreachable!("nuc must be Some when count is 1"); + } + self.0 |= (count.saturating_sub(1).min(3)) << 3; + } + + /// `nuc` = Some(i) → exactly one neighbour (bit 0 set, bits 3–4 = nucleotide index). + /// `nuc` = None → 0 or ≥2 neighbours; `count` encoded in bits 3–4 as count.sat_sub(1). + pub fn set_left(&mut self, count: u8, nuc: Option) { + self.0 &= !(0b0000_0010 | 0b0110_0000); + if count == 1 { + self.0 |= 0b0000_0010; + if let Some(n) = nuc { + self.0 |= (n & 0b11) << 5; + return; + } + unreachable!("nuc must be Some when count is 1"); + } + self.0 |= (count.saturating_sub(1).min(3)) << 5; + } +} + +impl fmt::Display for Node { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + const NUC: [char; 4] = ['A', 'C', 'G', 'T']; + let r = if self.can_extend_right() { + format!("→{}", NUC[self.right_nuc() as usize]) + } else { + format!("→{}", self.n_right_neighbours()) + }; + let l = if self.can_extend_left() { + format!("←{}", NUC[self.left_nuc() as usize]) + } else { + format!("←{}", self.n_left_neighbours()) + }; + let v = if self.is_visited() { "V" } else { "." }; + write!(f, "Node({r} {l} {v})") + } } // ── GraphDeBruijn ───────────────────────────────────────────────────────────── @@ -108,16 +172,80 @@ impl GraphDeBruijn { /// Single pass thanks to Cell interior mutability. pub fn compute_degrees(&self) { for (&kmer, cell) in &self.nodes { - let right_nuc = unique_neighbor(kmer.right_canonical_neighbors(self.k), &self.nodes); - let left_nuc = unique_neighbor(kmer.left_canonical_neighbors(self.k), &self.nodes); + let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(self.k), &self.nodes); + let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(self.k), &self.nodes); let mut node = cell.get(); - node.set_right(right_nuc); - node.set_left(left_nuc); + node.set_right(rc, rn); + node.set_left(lc, ln); cell.set(node); } } + /// Iterates over the right neighbors of `kmer`. + pub fn iter_right_neighbors( + &self, + kmer: CanonicalKmer, + ) -> impl Iterator + '_ { + kmer.right_canonical_neighbors(self.k) + .into_iter() + .filter_map(|kmer| { + self.nodes.get(&kmer)?; + Some(kmer) + }) + } + + /// Iterates over the left neighbors of `kmer`. + pub fn iter_left_neighbors( + &self, + kmer: CanonicalKmer, + ) -> impl Iterator + '_ { + kmer.left_canonical_neighbors(self.k) + .into_iter() + .filter_map(|kmer| { + self.nodes.get(&kmer)?; + Some(kmer) + }) + } + + pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option { + self.nodes.get(kmer).map(|cell| cell.get().is_visited()) + } + + pub fn set_visited(&self, kmer: CanonicalKmer) { + if let Some(cell) = self.nodes.get(&kmer) { + let mut node = cell.get(); + node.set_visited(); + cell.set(node); + } + } + + /// Returns the single right neighbor of `kmer`, if it exists. + pub fn the_single_right_neighbor(&self, kmer: CanonicalKmer) -> Option { + let node = self.nodes.get(&kmer)?.get(); + if !node.can_extend_right() { + return None; + } + let next = kmer + .into_kmer() + .push_right(node.right_nuc(), self.k) + .canonical(self.k); + self.nodes.contains_key(&next).then_some(next) + } + + /// Returns the single left neighbor of `kmer`, if it exists. + pub fn the_single_left_neighbor(&self, kmer: CanonicalKmer) -> Option { + let node = self.nodes.get(&kmer)?.get(); + if !node.can_extend_left() { + return None; + } + let next = kmer + .into_kmer() + .push_left(node.left_nuc(), self.k) + .canonical(self.k); + self.nodes.contains_key(&next).then_some(next) + } + /// Internal iterator over unitig-start nodes; drives `iter_unitig`. /// /// MUST NOT be consumed standalone: the second pass finds cycle nodes only @@ -128,74 +256,145 @@ impl GraphDeBruijn { /// - `!can_extend_left` → yield canonical form /// - `!can_extend_right` → yield reverse complement /// 2. Nodes still unvisited → part of a cycle; yield canonical form. - // Yields (start, first_next) arcs: - // pass 1 — one arc per right-neighbour of every !can_extend_left node - // pass 2 — one arc per unvisited interior node (cycles); marks it visited - // Junction nodes (pass 1) are never marked visited. - fn start_iter(&self) -> impl Iterator)> + '_ { - let k = self.k; - - let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| { - let node = _cell.get(); - if node.can_extend_left() { - return vec![]; - } - let start = kmer.into_kmer(); - if node.can_extend_right() { - let next_c = start.push_right(node.right_nuc(), k).canonical(k); - vec![(start, Some(oriented_next(start, next_c, k)))] - } else { - let rights: Vec<_> = kmer - .right_canonical_neighbors(k) - .into_iter() - .filter(|n| self.nodes.contains_key(n)) - .map(|n| (start, Some(oriented_next(start, n, k)))) - .collect(); - if rights.is_empty() { vec![(start, None)] } else { rights } - } - }); - - let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| { - let node = cell.get(); - if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() { - return None; - } - let mut updated = node; - updated.set_visited(); - cell.set(updated); - let start = kmer.into_kmer(); - let next_c = start.push_right(node.right_nuc(), k).canonical(k); - Some((start, Some(oriented_next(start, next_c, k)))) - }); - - chain_starts.chain(cycle_starts) + fn start_iter(&self) -> impl Iterator)> + '_ { + StartIter::new(self) } - // Direction from kmer orientation; returns None if called on a non-interior node. fn next_unitig_kmer(&self, kmer: Kmer) -> Option { let canonical = kmer.canonical(self.k); let node = self.nodes.get(&canonical)?.get(); - if !node.can_extend_left() || !node.can_extend_right() { + + let direct = kmer.raw() == canonical.raw(); + + if (direct && !node.can_extend_right()) || (!direct && !node.can_extend_left()) { return None; } - let next_c = if kmer.raw() == canonical.raw() { - canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k) + + let next_c: CanonicalKmer = if direct { + canonical + .into_kmer() + .push_right(node.right_nuc(), self.k) + .canonical(self.k) } else { - canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k) + canonical + .into_kmer() + .push_left(node.left_nuc(), self.k) + .canonical(self.k) }; - Some(oriented_next(kmer, next_c, self.k)) + + let cell = self.nodes.get(&next_c)?; + let next_node = cell.get(); + if next_node.is_visited() { + return None; + } + + let oriented = oriented_next(kmer, next_c, self.k); + let ndirect = oriented.raw() == next_c.raw(); + + if (ndirect && next_node.n_right_neighbours() > 1) + || (!ndirect && next_node.n_left_neighbours() > 1) + { + return None; + } + + let mut updated = next_node; + updated.set_visited(); + cell.set(updated); + Some(oriented) } - fn iter_unitig_kmers(&self, first_next: Option) -> UnitigIter<'_> { - UnitigIter { graph: self, current: first_next } + fn next_longtig_kmer(&self, kmer: Kmer) -> Option { + let k = self.k; + let canonical = kmer.canonical(k); + let node = self.nodes.get(&canonical)?.get(); + + let direct = kmer.raw() == canonical.raw(); + + if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0) + { + return None; + } + + let next_c: CanonicalKmer = if direct { + if node.can_extend_right() { + canonical + .into_kmer() + .push_right(node.right_nuc(), k) + .canonical(k) + } else { + self.iter_right_neighbors(canonical) + .filter(|n| !self.is_visited(n).unwrap_or(true)) + .next()? + } + } else { + if node.can_extend_left() { + canonical + .into_kmer() + .push_left(node.left_nuc(), k) + .canonical(k) + } else { + self.iter_left_neighbors(canonical) + .filter(|n| !self.is_visited(n).unwrap_or(true)) + .next()? + } + }; + + let cell = self.nodes.get(&next_c)?; + let next_node = cell.get(); + if next_node.is_visited() { + return None; + } + + let oriented = oriented_next(kmer, next_c, self.k); + let ndirect = oriented.raw() == next_c.raw(); + + if (ndirect && next_node.n_right_neighbours() > 1) + || (!ndirect && next_node.n_left_neighbours() > 1) + { + return None; + } + + let mut updated = next_node; + updated.set_visited(); + cell.set(updated); + Some(oriented) + } + + fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> { + UnitigIter { + graph: self, + current: Some(start), + } + } + + fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> { + LongtigIter { + graph: self, + current: Some(start), + } } pub fn iter_unitig(&self) -> impl Iterator + '_ { let k = self.k; self.start_iter().map(move |(start, first_next)| { let mut nucs: Vec = (0..k).map(|i| start.nucleotide(i)).collect(); - for kmer in self.iter_unitig_kmers(first_next) { - nucs.push(kmer.nucleotide(k - 1)); + if let Some(next_c) = first_next { + for kmer in self.iter_unitig_kmers(next_c) { + nucs.push(kmer.nucleotide(k - 1)); + } + } + Unitig::from_nucleotides(&nucs) + }) + } + + pub fn iter_longtig(&self) -> impl Iterator + '_ { + let k = self.k; + self.start_iter().map(move |(start, first_next)| { + let mut nucs: Vec = (0..k).map(|i| start.nucleotide(i)).collect(); + if let Some(next_c) = first_next { + for kmer in self.iter_longtig_kmers(next_c) { + nucs.push(kmer.nucleotide(k - 1)); + } } Unitig::from_nucleotides(&nucs) }) @@ -205,9 +404,15 @@ impl GraphDeBruijn { /// /// Calls [`obifastwrite::write_unitig`] for each unitig produced by /// [`iter_unitig`]. Stops and returns the first I/O error encountered. - pub fn write_fasta(&self, out: &mut W) -> io::Result<()> { - for unitig in self.iter_unitig() { - write_unitig(&unitig, self.k, out)?; + pub fn write_fasta(&self, out: &mut W, unitig: bool) -> io::Result<()> { + if unitig { + for unitig in self.iter_unitig() { + write_unitig(&unitig, self.k, out)?; + } + } else { + for unitig in self.iter_longtig() { + write_unitig(&unitig, self.k, out)?; + } } Ok(()) } @@ -221,6 +426,93 @@ impl GraphDeBruijn { } } +// --- StartIter ----------------------------------------------------------------- +struct StartIter<'a> { + graph: &'a GraphDeBruijn, + nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, Cell>, + suspended: Vec, + in_cycle_pass: bool, +} + +impl<'a> StartIter<'a> { + fn new(graph: &'a GraphDeBruijn) -> Self { + Self { + graph, + nodes: graph.nodes.iter(), + suspended: Vec::new(), + in_cycle_pass: false, + } + } +} + +impl<'a> Iterator for StartIter<'a> { + type Item = (CanonicalKmer, Option); + + fn next(&mut self) -> Option<(CanonicalKmer, Option)> { + loop { + let current = if let Some(k) = self.suspended.pop() { + k + } else { + match self.nodes.next() { + Some((&k, _)) => k, + None => { + if self.in_cycle_pass { + return None; + } + self.in_cycle_pass = true; + self.nodes = self.graph.nodes.iter(); + match self.nodes.next() { + Some((&k, _)) => k, + None => return None, + } + } + } + }; + + let node = match self.graph.nodes.get(¤t) { + Some(c) => c.get(), + None => continue, + }; + if node.is_visited() { + continue; + } + if !self.in_cycle_pass && node.can_extend_left() { + continue; + } + + self.graph.set_visited(current); + + if let Some(next) = self.graph.the_single_right_neighbor(current) { + if self.graph.is_visited(&next).unwrap_or(true) { + return Some((current, None)); + } + self.graph.set_visited(next); + let oriented = oriented_next(current.into_kmer(), next, self.graph.k); + return Some((current, Some(oriented))); + } + + let mut first_neighbor: Option = None; + for neighbor in self.graph.iter_right_neighbors(current) { + if self.graph.is_visited(&neighbor).unwrap_or(true) { + continue; + } + if first_neighbor.is_none() { + self.graph.set_visited(neighbor); + first_neighbor = Some(neighbor); + } else { + self.suspended.push(neighbor); + } + } + + let oriented = match first_neighbor { + Some(neighbor) => Some(oriented_next(current.into_kmer(), neighbor, self.graph.k)), + None => None, + }; + return Some((current, oriented)); + } + } +} + // ── UnitigIter ──────────────────────────────────────────────────────────────── struct UnitigIter<'a> { @@ -233,48 +525,60 @@ impl Iterator for UnitigIter<'_> { fn next(&mut self) -> Option { let current = self.current?; - let canonical = current.canonical(self.graph.k); - let cell = self.graph.nodes.get(&canonical)?; - let node = cell.get(); - // Cycle guard: stop if already visited (cycle start marked by start_iter pass 2). - if node.is_visited() { - self.current = None; - return None; - } - // Only interior nodes get marked visited. - if node.can_extend_left() && node.can_extend_right() { - let mut updated = node; - updated.set_visited(); - cell.set(updated); - } self.current = self.graph.next_unitig_kmer(current); Some(current) } } +// ── UnitigIter ──────────────────────────────────────────────────────────────── + +struct LongtigIter<'a> { + graph: &'a GraphDeBruijn, + current: Option, +} + +impl Iterator for LongtigIter<'_> { + type Item = Kmer; + + fn next(&mut self) -> Option { + let current = self.current?; + self.current = self.graph.next_longtig_kmer(current); + Some(current) + } +} + // ── helpers ─────────────────────────────────────────────────────────────────── fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer { - if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) } + if from.is_overlapping(to.into_kmer(), k) { + to.into_kmer() + } else { + to.revcomp(k) + } } /// Returns `Some(i)` if exactly one of the four canonical neighbours exists in /// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for /// zero or ≥2 existing neighbours. -fn unique_neighbor( +fn count_neighbors( neighbors: [CanonicalKmer; 4], nodes: &FastHashMap>, -) -> Option { - let mut found: Option = None; +) -> (u8, Option) { + let mut count = 0u8; + let mut first = None; for (i, neighbour) in neighbors.iter().enumerate() { if nodes.contains_key(neighbour) { - if found.is_some() { - return None; // ≥2 neighbours + count += 1; + if first.is_none() { + first = Some(i as u8); } - found = Some(i as u8); } } - found + if count == 1 { + (1, first) + } else { + (count, None) + } } // ── tests ───────────────────────────────────────────────────────────────────── @@ -385,11 +689,41 @@ mod tests { // Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k. // ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain. let k = 5; - let seq = b"ACGTGGCTA"; + let seq = b"ACCTGGCTA"; let g = graph_from_ascii(seq, k); g.compute_degrees(); + println!("Les kmers:"); + for (kmer, v) in g.nodes.iter() { + println!( + "{}: {}", + String::from_utf8_lossy(&kmer.to_ascii(k)), + v.get() + ); + } + // println!("Les starts:"); + // for (start, first_next) in g.start_iter() { + // if let Some(next) = first_next { + // println!( + // "{}->{}", + // String::from_utf8_lossy(&start.to_ascii(k)), + // String::from_utf8_lossy(&next.to_ascii(k)) + // ) + // } else { + // println!("{}->None", String::from_utf8_lossy(&start.to_ascii(k))) + // } + // } + + println!("Les unitig:"); let unitigs: Vec = g.iter_unitig().collect(); - assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig"); + for unitig in &unitigs { + println!("{}", String::from_utf8_lossy(&unitig.to_ascii())); + } + assert_eq!( + unitigs.len(), + 1, + "linear chain → exactly one unitig {:?}", + unitigs + ); assert_eq!( kmers_from_unitigs(&unitigs, k), canonical_kmers(seq, k), @@ -488,4 +822,69 @@ mod tests { assert_eq!(got.len(), expected.len(), "cycle k-mers lost"); assert_eq!(got, expected); } + + // ── branching graph ─────────────────────────────────────────────────────── + // + // Topology (k=5): two sources A,B converge at C; chain C-D-E-F; + // F branches to G and H; H continues H-M-N; second source J feeds I-F. + // Every k-mer must appear in exactly one unitig (no duplication, no loss). + #[test] + fn branching_graph_no_kmer_lost_or_duplicated() { + // Build sequences that realise the topology without accidental overlaps. + // Each "node" is a distinct 5-mer; edges share a 4-mer suffix/prefix. + // We use long non-repetitive sequences and extract only the required kmers. + let k: usize = 5; + let mut g = GraphDeBruijn::new(k); + + // Helper: insert all k-mers of a sequence. + let mut insert = |seq: &[u8]| { + for i in 0..=seq.len().saturating_sub(k) { + g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k)); + } + }; + + // Chains that realise the topology: + // A-C (A→C share 4-mer overlap) + // B-C (B→C share 4-mer overlap, different prefix) + // C-D-E-F + // F-G (F→G) + // F-H-M-N (F→H→M→N) + // J-I-F (J→I→F) + insert(b"AACGTGGCTA"); // A-C-D … part of the right branch + insert(b"TACGTGGCTA"); // B-C-D … merges at C (same C-suffix) + insert(b"CGTGGCTACG"); // continues D-E-F-G + insert(b"CGTGGCTACC"); // F-H branch (different last base) + insert(b"GTGGCTACCGT"); // H-M-N continuation + insert(b"TTCGTGGCTA"); // J-I-F (different J prefix) + + g.compute_degrees(); + let unitigs: Vec = g.iter_unitig().collect(); + + // Collect all k-mers from unitigs. + let got = kmers_from_unitigs(&unitigs, k); + + // Collect all distinct canonical k-mers inserted. + let mut expected: Vec = Vec::new(); + for seq in &[ + b"AACGTGGCTA".as_slice(), + b"TACGTGGCTA", + b"CGTGGCTACG", + b"CGTGGCTACC", + b"GTGGCTACCGT", + b"TTCGTGGCTA", + ] { + expected.extend(canonical_kmers(seq, k)); + } + expected.sort_unstable(); + expected.dedup(); + + assert_eq!( + got.len(), + expected.len(), + "k-mer count mismatch: got {}, expected {}", + got.len(), + expected.len() + ); + assert_eq!(got, expected, "k-mer sets differ"); + } } diff --git a/src/obikmer/src/cmd/longtig.rs b/src/obikmer/src/cmd/longtig.rs new file mode 100644 index 0000000..5a54dc8 --- /dev/null +++ b/src/obikmer/src/cmd/longtig.rs @@ -0,0 +1,141 @@ +use std::fs::File; +use std::path::PathBuf; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use clap::Args; +use niffler::Level; +use niffler::send::compression::Format; +use obidebruinj::GraphDeBruijn; +use obikpartitionner::KmerPartition; +use obiskio::SKFileReader; +use ph::fmph::GOFunction; +use rayon::prelude::*; +use tracing::info; + +#[derive(Args)] +pub struct LongtigArgs { + /// Root of the k-mer partition directory (produced by the `partition` command) + pub partition: PathBuf, + + /// Minimum kmer abundance (inclusive); kmers below this threshold are excluded + #[arg(long, default_value_t = 1)] + pub min_abundance: u32, + + /// Maximum kmer abundance (inclusive); kmers above this threshold are excluded + #[arg(long)] + pub max_abundance: Option, +} + +pub fn run(args: LongtigArgs) { + let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| { + eprintln!("error opening partition: {e}"); + std::process::exit(1) + }); + + let k = kp.kmer_size(); + let n = kp.n_partitions(); + info!("building longtigs from {n} partitions (k={k}, parallel)"); + + let total_kmers = AtomicUsize::new(0); + + (0..n).into_par_iter().for_each(|i| { + let part_dir = args.partition.join(format!("part_{i:05}")); + let in_path = part_dir.join("dereplicated.skmer.zst"); + if !in_path.exists() { + return; + } + let out_path = part_dir.join("longtig.fasta.gz"); + + let mut g = GraphDeBruijn::new(k); + + let mphf_path = part_dir.join("mphf1.bin"); + let counts_path = part_dir.join("counts1.bin"); + let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some()) + && mphf_path.exists() + && counts_path.exists(); + + let mphf_opt: Option = if filter_active { + let mut f = File::open(&mphf_path).unwrap_or_else(|e| { + eprintln!("error opening {}: {e}", mphf_path.display()); + std::process::exit(1) + }); + Some(GOFunction::read(&mut f).unwrap_or_else(|e| { + eprintln!("error reading MPHF {}: {e}", mphf_path.display()); + std::process::exit(1) + })) + } else { + None + }; + + let counts_mmap_opt = if filter_active { + let cf = File::open(&counts_path).unwrap_or_else(|e| { + eprintln!("error opening {}: {e}", counts_path.display()); + std::process::exit(1) + }); + Some(unsafe { + memmap2::Mmap::map(&cf).unwrap_or_else(|e| { + eprintln!("error mmapping {}: {e}", counts_path.display()); + std::process::exit(1) + }) + }) + } else { + None + }; + + let counts_slice: Option<&[u32]> = counts_mmap_opt + .as_ref() + .map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) }); + + let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| { + eprintln!("error opening {}: {e}", in_path.display()); + std::process::exit(1) + }); + for sk in reader.iter() { + for kmer in sk.iter_canonical_kmers(k) { + let accept = match (&mphf_opt, counts_slice) { + (Some(mphf), Some(counts)) => { + if let Some(slot) = mphf.get(&kmer) { + let ab = counts[slot as usize]; + ab >= args.min_abundance + && args.max_abundance.map_or(true, |max| ab <= max) + } else { + false + } + } + _ => true, + }; + if accept { + g.push(kmer); + } + } + } + + let n_kmers = g.len(); + total_kmers.fetch_add(n_kmers, Ordering::Relaxed); + info!( + "partition {i}/{n}: {n_kmers} canonical k-mers → {}", + out_path.display() + ); + + g.compute_degrees(); + + let file = File::create(&out_path).unwrap_or_else(|e| { + eprintln!("error creating {}: {e}", out_path.display()); + std::process::exit(1) + }); + let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six) + .unwrap_or_else(|e| { + eprintln!("error creating gzip writer: {e}"); + std::process::exit(1) + }); + g.write_fasta(&mut writer, false).unwrap_or_else(|e| { + eprintln!("write error on partition {i}: {e}"); + std::process::exit(1) + }); + }); + + info!( + "done — {} total canonical k-mers across all partitions", + total_kmers.load(Ordering::Relaxed) + ); +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 9c1b69d..580fa1c 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,5 +1,6 @@ pub mod count; pub mod fasta; +pub mod longtig; pub mod partition; pub mod superkmer; pub mod unitig; diff --git a/src/obikmer/src/cmd/unitig.rs b/src/obikmer/src/cmd/unitig.rs index ca6536b..0fa3abd 100644 --- a/src/obikmer/src/cmd/unitig.rs +++ b/src/obikmer/src/cmd/unitig.rs @@ -82,9 +82,9 @@ pub fn run(args: UnitigArgs) { None }; - let counts_slice: Option<&[u32]> = counts_mmap_opt.as_ref().map(|m| unsafe { - std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) - }); + let counts_slice: Option<&[u32]> = counts_mmap_opt + .as_ref() + .map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) }); let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| { eprintln!("error opening {}: {e}", in_path.display()); @@ -112,7 +112,10 @@ pub fn run(args: UnitigArgs) { let n_kmers = g.len(); total_kmers.fetch_add(n_kmers, Ordering::Relaxed); - info!("partition {i}/{n}: {n_kmers} canonical k-mers → {}", out_path.display()); + info!( + "partition {i}/{n}: {n_kmers} canonical k-mers → {}", + out_path.display() + ); g.compute_degrees(); @@ -125,7 +128,7 @@ pub fn run(args: UnitigArgs) { eprintln!("error creating gzip writer: {e}"); std::process::exit(1) }); - g.write_fasta(&mut writer).unwrap_or_else(|e| { + g.write_fasta(&mut writer, true).unwrap_or_else(|e| { eprintln!("write error on partition {i}: {e}"); std::process::exit(1) }); diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index b7131e7..33a0c12 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -23,11 +23,15 @@ enum Commands { Fasta(cmd::fasta::FastaArgs), /// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz Unitig(cmd::unitig::UnitigArgs), + /// Build de Bruijn longtigs for all partitions and write to longtig.fasta.gz + Longtig(cmd::longtig::LongtigArgs), } fn main() { fmt() - .with_env_filter(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with_env_filter( + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")), + ) .with_writer(std::io::stderr) .init(); @@ -47,6 +51,7 @@ fn main() { Commands::Count(args) => cmd::count::run(args), Commands::Fasta(args) => cmd::fasta::run(args), Commands::Unitig(args) => cmd::unitig::run(args), + Commands::Longtig(args) => cmd::longtig::run(args), } #[cfg(feature = "profiling")] diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs index 38625ef..f83eb4f 100644 --- a/src/obikseq/src/kmer.rs +++ b/src/obikseq/src/kmer.rs @@ -234,6 +234,12 @@ impl CanonicalKmer { mix64(self.0.0) } + /// Extract nucleotide i (0-based from 5′ end) as a 2-bit value. + #[inline] + pub fn nucleotide(&self, i: usize) -> u8 { + self.0.nucleotide(i) + } + /// Return the four left canonical neighbours (each already canonical). /// Zero allocation — result lives on the stack. pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {