refactor: improve de Bruijn graph traversal and longtig generation
- Refactored Node representation using compact bitfields for neighbor counts and nucleotides; added count_neighbors helper to compute_degrees() - Introduced StartIter iterator for unitig/longtigu generation with revised traversal semantics (e.g., interior node marking) - Added nucleotide() accessor to Kmer type for 2-bit extraction at position i - Renamed unitig.rs → longtigs, updated CLI command and output filenames to reflect "long t ig" - Extended extract_kmers() in scripts/compare.py with duplication statistics ```
This commit is contained in:
@@ -51,16 +51,21 @@ def iter_sequences(path: str):
|
|||||||
yield header, "".join(parts)
|
yield header, "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def extract_kmers(path: str, k: int) -> set[str]:
|
def extract_kmers(path: str, k: int) -> tuple[set[str], int]:
|
||||||
"""Return the set of canonical k-mers from all sequences in *path*."""
|
"""Return (set of canonical k-mers, count of duplicated k-mers) from *path*.
|
||||||
kmers: set[str] = set()
|
|
||||||
|
A k-mer is duplicated if its canonical form appears more than once across
|
||||||
|
all sequences in the file.
|
||||||
|
"""
|
||||||
|
counts: dict[str, int] = {}
|
||||||
for _, seq in iter_sequences(path):
|
for _, seq in iter_sequences(path):
|
||||||
# skip any character that is not ACGT
|
|
||||||
for i in range(len(seq) - k + 1):
|
for i in range(len(seq) - k + 1):
|
||||||
kmer = seq[i : i + k]
|
kmer = seq[i : i + k]
|
||||||
if all(c in "ACGT" for c in kmer):
|
if all(c in "ACGT" for c in kmer):
|
||||||
kmers.add(canonical(kmer))
|
ck = canonical(kmer)
|
||||||
return kmers
|
counts[ck] = counts.get(ck, 0) + 1
|
||||||
|
duplicated = sum(1 for v in counts.values() if v > 1)
|
||||||
|
return set(counts), duplicated
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -81,16 +86,18 @@ def main():
|
|||||||
print()
|
print()
|
||||||
|
|
||||||
print("reading A …", file=sys.stderr)
|
print("reading A …", file=sys.stderr)
|
||||||
set_a = extract_kmers(args.file_a, k)
|
set_a, dup_a = extract_kmers(args.file_a, k)
|
||||||
print("reading B …", file=sys.stderr)
|
print("reading B …", file=sys.stderr)
|
||||||
set_b = extract_kmers(args.file_b, k)
|
set_b, dup_b = extract_kmers(args.file_b, k)
|
||||||
|
|
||||||
only_a = set_a - set_b
|
only_a = set_a - set_b
|
||||||
only_b = set_b - set_a
|
only_b = set_b - set_a
|
||||||
common = set_a & set_b
|
common = set_a & set_b
|
||||||
|
|
||||||
print(f"{'kmers in A':<25} {len(set_a):>12,}")
|
print(f"{'kmers in A':<25} {len(set_a):>12,}")
|
||||||
|
print(f"{'duplicated in A':<25} {dup_a:>12,}")
|
||||||
print(f"{'kmers in B':<25} {len(set_b):>12,}")
|
print(f"{'kmers in B':<25} {len(set_b):>12,}")
|
||||||
|
print(f"{'duplicated in B':<25} {dup_b:>12,}")
|
||||||
print(f"{'common':<25} {len(common):>12,}")
|
print(f"{'common':<25} {len(common):>12,}")
|
||||||
print(f"{'only in A (lost)':<25} {len(only_a):>12,}")
|
print(f"{'only in A (lost)':<25} {len(only_a):>12,}")
|
||||||
print(f"{'only in B (gained)':<25} {len(only_b):>12,}")
|
print(f"{'only in B (gained)':<25} {len(only_b):>12,}")
|
||||||
|
|||||||
+490
-91
@@ -1,9 +1,10 @@
|
|||||||
use ahash::RandomState;
|
use ahash::RandomState;
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use obifastwrite::write_unitig;
|
use obifastwrite::write_unitig;
|
||||||
use obikseq::kmer::{CanonicalKmer, Kmer};
|
use obikseq::kmer::{self, CanonicalKmer, Kmer};
|
||||||
use obikseq::unitig::Unitig;
|
use obikseq::unitig::Unitig;
|
||||||
use std::cell::Cell;
|
use std::cell::Cell;
|
||||||
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
@@ -28,14 +29,21 @@ type FastHashMap<K, V> = HashMap<K, V, RandomState>;
|
|||||||
pub struct Node(u8);
|
pub struct Node(u8);
|
||||||
|
|
||||||
impl Node {
|
impl Node {
|
||||||
|
/// Returns `true` if the node can be extended to the right.
|
||||||
|
///
|
||||||
|
/// A single right neighbour exists.
|
||||||
pub fn can_extend_right(self) -> bool {
|
pub fn can_extend_right(self) -> bool {
|
||||||
self.0 & 0b0000_0001 != 0
|
self.0 & 0b0000_0001 != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if the node can be extended to the left.
|
||||||
|
///
|
||||||
|
/// A single left neighbour exists.
|
||||||
pub fn can_extend_left(self) -> bool {
|
pub fn can_extend_left(self) -> bool {
|
||||||
self.0 & 0b0000_0010 != 0
|
self.0 & 0b0000_0010 != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if the node has been visited.
|
||||||
pub fn is_visited(self) -> bool {
|
pub fn is_visited(self) -> bool {
|
||||||
self.0 & 0b0000_0100 != 0
|
self.0 & 0b0000_0100 != 0
|
||||||
}
|
}
|
||||||
@@ -52,25 +60,81 @@ impl Node {
|
|||||||
(self.0 >> 5) & 0b11
|
(self.0 >> 5) & 0b11
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Marks the node as visited.
|
||||||
pub fn set_visited(&mut self) {
|
pub fn set_visited(&mut self) {
|
||||||
|
if self.is_visited() {
|
||||||
|
unreachable!("from: is_visited -> The node has already been visited")
|
||||||
|
}
|
||||||
self.0 |= 0b0000_0100;
|
self.0 |= 0b0000_0100;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `None` → not uniquely continuable (0 or ≥2 neighbours).
|
/// Number of left neighbours.
|
||||||
/// `Some(n)` → exactly one neighbour, reached by adding nucleotide n.
|
pub fn n_left_neighbours(self) -> u8 {
|
||||||
pub fn set_right(&mut self, nuc: Option<u8>) {
|
if self.can_extend_left() {
|
||||||
self.0 &= !(0b0000_0001 | 0b0001_1000); // clear bit 0 and bits 3–4
|
1
|
||||||
if let Some(n) = nuc {
|
} else {
|
||||||
self.0 |= 0b0000_0001 | ((n & 0b11) << 3);
|
let v = (self.0 >> 5) & 0b11;
|
||||||
|
v + (v != 0) as u8
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn set_left(&mut self, nuc: Option<u8>) {
|
/// Number of right neighbours.
|
||||||
self.0 &= !(0b0000_0010 | 0b0110_0000); // clear bit 1 and bits 5–6
|
pub fn n_right_neighbours(self) -> u8 {
|
||||||
if let Some(n) = nuc {
|
if self.can_extend_right() {
|
||||||
self.0 |= 0b0000_0010 | ((n & 0b11) << 5);
|
1
|
||||||
|
} else {
|
||||||
|
let v = (self.0 >> 3) & 0b11;
|
||||||
|
v + (v != 0) as u8
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// `nuc` = Some(i) → exactly one neighbour (bit 0 set, bits 3–4 = nucleotide index).
|
||||||
|
/// `nuc` = None → 0 or ≥2 neighbours; `count` encoded in bits 3–4 as count.sat_sub(1).
|
||||||
|
pub fn set_right(&mut self, count: u8, nuc: Option<u8>) {
|
||||||
|
self.0 &= !(0b0000_0001 | 0b001_1000);
|
||||||
|
if count == 1 {
|
||||||
|
self.0 |= 0b0000_0001;
|
||||||
|
if let Some(n) = nuc {
|
||||||
|
self.0 |= (n & 0b11) << 3;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
unreachable!("nuc must be Some when count is 1");
|
||||||
|
}
|
||||||
|
self.0 |= (count.saturating_sub(1).min(3)) << 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `nuc` = Some(i) → exactly one neighbour (bit 0 set, bits 3–4 = nucleotide index).
|
||||||
|
/// `nuc` = None → 0 or ≥2 neighbours; `count` encoded in bits 3–4 as count.sat_sub(1).
|
||||||
|
pub fn set_left(&mut self, count: u8, nuc: Option<u8>) {
|
||||||
|
self.0 &= !(0b0000_0010 | 0b0110_0000);
|
||||||
|
if count == 1 {
|
||||||
|
self.0 |= 0b0000_0010;
|
||||||
|
if let Some(n) = nuc {
|
||||||
|
self.0 |= (n & 0b11) << 5;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
unreachable!("nuc must be Some when count is 1");
|
||||||
|
}
|
||||||
|
self.0 |= (count.saturating_sub(1).min(3)) << 5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for Node {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
const NUC: [char; 4] = ['A', 'C', 'G', 'T'];
|
||||||
|
let r = if self.can_extend_right() {
|
||||||
|
format!("→{}", NUC[self.right_nuc() as usize])
|
||||||
|
} else {
|
||||||
|
format!("→{}", self.n_right_neighbours())
|
||||||
|
};
|
||||||
|
let l = if self.can_extend_left() {
|
||||||
|
format!("←{}", NUC[self.left_nuc() as usize])
|
||||||
|
} else {
|
||||||
|
format!("←{}", self.n_left_neighbours())
|
||||||
|
};
|
||||||
|
let v = if self.is_visited() { "V" } else { "." };
|
||||||
|
write!(f, "Node({r} {l} {v})")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||||
@@ -108,16 +172,80 @@ impl GraphDeBruijn {
|
|||||||
/// Single pass thanks to Cell interior mutability.
|
/// Single pass thanks to Cell interior mutability.
|
||||||
pub fn compute_degrees(&self) {
|
pub fn compute_degrees(&self) {
|
||||||
for (&kmer, cell) in &self.nodes {
|
for (&kmer, cell) in &self.nodes {
|
||||||
let right_nuc = unique_neighbor(kmer.right_canonical_neighbors(self.k), &self.nodes);
|
let (rc, rn) = count_neighbors(kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||||
let left_nuc = unique_neighbor(kmer.left_canonical_neighbors(self.k), &self.nodes);
|
let (lc, ln) = count_neighbors(kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||||
|
|
||||||
let mut node = cell.get();
|
let mut node = cell.get();
|
||||||
node.set_right(right_nuc);
|
node.set_right(rc, rn);
|
||||||
node.set_left(left_nuc);
|
node.set_left(lc, ln);
|
||||||
cell.set(node);
|
cell.set(node);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterates over the right neighbors of `kmer`.
|
||||||
|
pub fn iter_right_neighbors(
|
||||||
|
&self,
|
||||||
|
kmer: CanonicalKmer,
|
||||||
|
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||||
|
kmer.right_canonical_neighbors(self.k)
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|kmer| {
|
||||||
|
self.nodes.get(&kmer)?;
|
||||||
|
Some(kmer)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterates over the left neighbors of `kmer`.
|
||||||
|
pub fn iter_left_neighbors(
|
||||||
|
&self,
|
||||||
|
kmer: CanonicalKmer,
|
||||||
|
) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||||
|
kmer.left_canonical_neighbors(self.k)
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|kmer| {
|
||||||
|
self.nodes.get(&kmer)?;
|
||||||
|
Some(kmer)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
|
||||||
|
self.nodes.get(kmer).map(|cell| cell.get().is_visited())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn set_visited(&self, kmer: CanonicalKmer) {
|
||||||
|
if let Some(cell) = self.nodes.get(&kmer) {
|
||||||
|
let mut node = cell.get();
|
||||||
|
node.set_visited();
|
||||||
|
cell.set(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the single right neighbor of `kmer`, if it exists.
|
||||||
|
pub fn the_single_right_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||||
|
let node = self.nodes.get(&kmer)?.get();
|
||||||
|
if !node.can_extend_right() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let next = kmer
|
||||||
|
.into_kmer()
|
||||||
|
.push_right(node.right_nuc(), self.k)
|
||||||
|
.canonical(self.k);
|
||||||
|
self.nodes.contains_key(&next).then_some(next)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the single left neighbor of `kmer`, if it exists.
|
||||||
|
pub fn the_single_left_neighbor(&self, kmer: CanonicalKmer) -> Option<CanonicalKmer> {
|
||||||
|
let node = self.nodes.get(&kmer)?.get();
|
||||||
|
if !node.can_extend_left() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let next = kmer
|
||||||
|
.into_kmer()
|
||||||
|
.push_left(node.left_nuc(), self.k)
|
||||||
|
.canonical(self.k);
|
||||||
|
self.nodes.contains_key(&next).then_some(next)
|
||||||
|
}
|
||||||
|
|
||||||
/// Internal iterator over unitig-start nodes; drives `iter_unitig`.
|
/// Internal iterator over unitig-start nodes; drives `iter_unitig`.
|
||||||
///
|
///
|
||||||
/// MUST NOT be consumed standalone: the second pass finds cycle nodes only
|
/// MUST NOT be consumed standalone: the second pass finds cycle nodes only
|
||||||
@@ -128,75 +256,146 @@ impl GraphDeBruijn {
|
|||||||
/// - `!can_extend_left` → yield canonical form
|
/// - `!can_extend_left` → yield canonical form
|
||||||
/// - `!can_extend_right` → yield reverse complement
|
/// - `!can_extend_right` → yield reverse complement
|
||||||
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
/// 2. Nodes still unvisited → part of a cycle; yield canonical form.
|
||||||
// Yields (start, first_next) arcs:
|
fn start_iter(&self) -> impl Iterator<Item = (CanonicalKmer, Option<Kmer>)> + '_ {
|
||||||
// pass 1 — one arc per right-neighbour of every !can_extend_left node
|
StartIter::new(self)
|
||||||
// pass 2 — one arc per unvisited interior node (cycles); marks it visited
|
|
||||||
// Junction nodes (pass 1) are never marked visited.
|
|
||||||
fn start_iter(&self) -> impl Iterator<Item = (Kmer, Option<Kmer>)> + '_ {
|
|
||||||
let k = self.k;
|
|
||||||
|
|
||||||
let chain_starts = self.nodes.iter().flat_map(move |(&kmer, _cell)| {
|
|
||||||
let node = _cell.get();
|
|
||||||
if node.can_extend_left() {
|
|
||||||
return vec![];
|
|
||||||
}
|
|
||||||
let start = kmer.into_kmer();
|
|
||||||
if node.can_extend_right() {
|
|
||||||
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
|
|
||||||
vec![(start, Some(oriented_next(start, next_c, k)))]
|
|
||||||
} else {
|
|
||||||
let rights: Vec<_> = kmer
|
|
||||||
.right_canonical_neighbors(k)
|
|
||||||
.into_iter()
|
|
||||||
.filter(|n| self.nodes.contains_key(n))
|
|
||||||
.map(|n| (start, Some(oriented_next(start, n, k))))
|
|
||||||
.collect();
|
|
||||||
if rights.is_empty() { vec![(start, None)] } else { rights }
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let cycle_starts = self.nodes.iter().filter_map(move |(&kmer, cell)| {
|
|
||||||
let node = cell.get();
|
|
||||||
if node.is_visited() || !node.can_extend_left() || !node.can_extend_right() {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
let mut updated = node;
|
|
||||||
updated.set_visited();
|
|
||||||
cell.set(updated);
|
|
||||||
let start = kmer.into_kmer();
|
|
||||||
let next_c = start.push_right(node.right_nuc(), k).canonical(k);
|
|
||||||
Some((start, Some(oriented_next(start, next_c, k))))
|
|
||||||
});
|
|
||||||
|
|
||||||
chain_starts.chain(cycle_starts)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Direction from kmer orientation; returns None if called on a non-interior node.
|
|
||||||
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
fn next_unitig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||||
let canonical = kmer.canonical(self.k);
|
let canonical = kmer.canonical(self.k);
|
||||||
let node = self.nodes.get(&canonical)?.get();
|
let node = self.nodes.get(&canonical)?.get();
|
||||||
if !node.can_extend_left() || !node.can_extend_right() {
|
|
||||||
|
let direct = kmer.raw() == canonical.raw();
|
||||||
|
|
||||||
|
if (direct && !node.can_extend_right()) || (!direct && !node.can_extend_left()) {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let next_c = if kmer.raw() == canonical.raw() {
|
|
||||||
canonical.into_kmer().push_right(node.right_nuc(), self.k).canonical(self.k)
|
let next_c: CanonicalKmer = if direct {
|
||||||
|
canonical
|
||||||
|
.into_kmer()
|
||||||
|
.push_right(node.right_nuc(), self.k)
|
||||||
|
.canonical(self.k)
|
||||||
} else {
|
} else {
|
||||||
canonical.into_kmer().push_left(node.left_nuc(), self.k).canonical(self.k)
|
canonical
|
||||||
|
.into_kmer()
|
||||||
|
.push_left(node.left_nuc(), self.k)
|
||||||
|
.canonical(self.k)
|
||||||
};
|
};
|
||||||
Some(oriented_next(kmer, next_c, self.k))
|
|
||||||
|
let cell = self.nodes.get(&next_c)?;
|
||||||
|
let next_node = cell.get();
|
||||||
|
if next_node.is_visited() {
|
||||||
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn iter_unitig_kmers(&self, first_next: Option<Kmer>) -> UnitigIter<'_> {
|
let oriented = oriented_next(kmer, next_c, self.k);
|
||||||
UnitigIter { graph: self, current: first_next }
|
let ndirect = oriented.raw() == next_c.raw();
|
||||||
|
|
||||||
|
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||||
|
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut updated = next_node;
|
||||||
|
updated.set_visited();
|
||||||
|
cell.set(updated);
|
||||||
|
Some(oriented)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next_longtig_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||||
|
let k = self.k;
|
||||||
|
let canonical = kmer.canonical(k);
|
||||||
|
let node = self.nodes.get(&canonical)?.get();
|
||||||
|
|
||||||
|
let direct = kmer.raw() == canonical.raw();
|
||||||
|
|
||||||
|
if (direct && node.n_right_neighbours() == 0) || (!direct && node.n_left_neighbours() == 0)
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let next_c: CanonicalKmer = if direct {
|
||||||
|
if node.can_extend_right() {
|
||||||
|
canonical
|
||||||
|
.into_kmer()
|
||||||
|
.push_right(node.right_nuc(), k)
|
||||||
|
.canonical(k)
|
||||||
|
} else {
|
||||||
|
self.iter_right_neighbors(canonical)
|
||||||
|
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||||
|
.next()?
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if node.can_extend_left() {
|
||||||
|
canonical
|
||||||
|
.into_kmer()
|
||||||
|
.push_left(node.left_nuc(), k)
|
||||||
|
.canonical(k)
|
||||||
|
} else {
|
||||||
|
self.iter_left_neighbors(canonical)
|
||||||
|
.filter(|n| !self.is_visited(n).unwrap_or(true))
|
||||||
|
.next()?
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let cell = self.nodes.get(&next_c)?;
|
||||||
|
let next_node = cell.get();
|
||||||
|
if next_node.is_visited() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let oriented = oriented_next(kmer, next_c, self.k);
|
||||||
|
let ndirect = oriented.raw() == next_c.raw();
|
||||||
|
|
||||||
|
if (ndirect && next_node.n_right_neighbours() > 1)
|
||||||
|
|| (!ndirect && next_node.n_left_neighbours() > 1)
|
||||||
|
{
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut updated = next_node;
|
||||||
|
updated.set_visited();
|
||||||
|
cell.set(updated);
|
||||||
|
Some(oriented)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||||
|
UnitigIter {
|
||||||
|
graph: self,
|
||||||
|
current: Some(start),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter_longtig_kmers(&self, start: Kmer) -> LongtigIter<'_> {
|
||||||
|
LongtigIter {
|
||||||
|
graph: self,
|
||||||
|
current: Some(start),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
pub fn iter_unitig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||||
let k = self.k;
|
let k = self.k;
|
||||||
self.start_iter().map(move |(start, first_next)| {
|
self.start_iter().map(move |(start, first_next)| {
|
||||||
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||||
for kmer in self.iter_unitig_kmers(first_next) {
|
if let Some(next_c) = first_next {
|
||||||
|
for kmer in self.iter_unitig_kmers(next_c) {
|
||||||
nucs.push(kmer.nucleotide(k - 1));
|
nucs.push(kmer.nucleotide(k - 1));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
Unitig::from_nucleotides(&nucs)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter_longtig(&self) -> impl Iterator<Item = Unitig> + '_ {
|
||||||
|
let k = self.k;
|
||||||
|
self.start_iter().map(move |(start, first_next)| {
|
||||||
|
let mut nucs: Vec<u8> = (0..k).map(|i| start.nucleotide(i)).collect();
|
||||||
|
if let Some(next_c) = first_next {
|
||||||
|
for kmer in self.iter_longtig_kmers(next_c) {
|
||||||
|
nucs.push(kmer.nucleotide(k - 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
Unitig::from_nucleotides(&nucs)
|
Unitig::from_nucleotides(&nucs)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -205,10 +404,16 @@ impl GraphDeBruijn {
|
|||||||
///
|
///
|
||||||
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
/// Calls [`obifastwrite::write_unitig`] for each unitig produced by
|
||||||
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
/// [`iter_unitig`]. Stops and returns the first I/O error encountered.
|
||||||
pub fn write_fasta<W: io::Write>(&self, out: &mut W) -> io::Result<()> {
|
pub fn write_fasta<W: io::Write>(&self, out: &mut W, unitig: bool) -> io::Result<()> {
|
||||||
|
if unitig {
|
||||||
for unitig in self.iter_unitig() {
|
for unitig in self.iter_unitig() {
|
||||||
write_unitig(&unitig, self.k, out)?;
|
write_unitig(&unitig, self.k, out)?;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
for unitig in self.iter_longtig() {
|
||||||
|
write_unitig(&unitig, self.k, out)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -221,6 +426,93 @@ impl GraphDeBruijn {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- StartIter -----------------------------------------------------------------
|
||||||
|
struct StartIter<'a> {
|
||||||
|
graph: &'a GraphDeBruijn,
|
||||||
|
nodes: hashbrown::hash_map::Iter<'a, CanonicalKmer, Cell<Node>>,
|
||||||
|
suspended: Vec<CanonicalKmer>,
|
||||||
|
in_cycle_pass: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> StartIter<'a> {
|
||||||
|
fn new(graph: &'a GraphDeBruijn) -> Self {
|
||||||
|
Self {
|
||||||
|
graph,
|
||||||
|
nodes: graph.nodes.iter(),
|
||||||
|
suspended: Vec::new(),
|
||||||
|
in_cycle_pass: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for StartIter<'a> {
|
||||||
|
type Item = (CanonicalKmer, Option<Kmer>);
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<(CanonicalKmer, Option<Kmer>)> {
|
||||||
|
loop {
|
||||||
|
let current = if let Some(k) = self.suspended.pop() {
|
||||||
|
k
|
||||||
|
} else {
|
||||||
|
match self.nodes.next() {
|
||||||
|
Some((&k, _)) => k,
|
||||||
|
None => {
|
||||||
|
if self.in_cycle_pass {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
self.in_cycle_pass = true;
|
||||||
|
self.nodes = self.graph.nodes.iter();
|
||||||
|
match self.nodes.next() {
|
||||||
|
Some((&k, _)) => k,
|
||||||
|
None => return None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let node = match self.graph.nodes.get(¤t) {
|
||||||
|
Some(c) => c.get(),
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
if node.is_visited() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if !self.in_cycle_pass && node.can_extend_left() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.graph.set_visited(current);
|
||||||
|
|
||||||
|
if let Some(next) = self.graph.the_single_right_neighbor(current) {
|
||||||
|
if self.graph.is_visited(&next).unwrap_or(true) {
|
||||||
|
return Some((current, None));
|
||||||
|
}
|
||||||
|
self.graph.set_visited(next);
|
||||||
|
let oriented = oriented_next(current.into_kmer(), next, self.graph.k);
|
||||||
|
return Some((current, Some(oriented)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut first_neighbor: Option<CanonicalKmer> = None;
|
||||||
|
for neighbor in self.graph.iter_right_neighbors(current) {
|
||||||
|
if self.graph.is_visited(&neighbor).unwrap_or(true) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if first_neighbor.is_none() {
|
||||||
|
self.graph.set_visited(neighbor);
|
||||||
|
first_neighbor = Some(neighbor);
|
||||||
|
} else {
|
||||||
|
self.suspended.push(neighbor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let oriented = match first_neighbor {
|
||||||
|
Some(neighbor) => Some(oriented_next(current.into_kmer(), neighbor, self.graph.k)),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
return Some((current, oriented));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
struct UnitigIter<'a> {
|
struct UnitigIter<'a> {
|
||||||
@@ -233,48 +525,60 @@ impl Iterator for UnitigIter<'_> {
|
|||||||
|
|
||||||
fn next(&mut self) -> Option<Kmer> {
|
fn next(&mut self) -> Option<Kmer> {
|
||||||
let current = self.current?;
|
let current = self.current?;
|
||||||
let canonical = current.canonical(self.graph.k);
|
|
||||||
let cell = self.graph.nodes.get(&canonical)?;
|
|
||||||
let node = cell.get();
|
|
||||||
// Cycle guard: stop if already visited (cycle start marked by start_iter pass 2).
|
|
||||||
if node.is_visited() {
|
|
||||||
self.current = None;
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
// Only interior nodes get marked visited.
|
|
||||||
if node.can_extend_left() && node.can_extend_right() {
|
|
||||||
let mut updated = node;
|
|
||||||
updated.set_visited();
|
|
||||||
cell.set(updated);
|
|
||||||
}
|
|
||||||
self.current = self.graph.next_unitig_kmer(current);
|
self.current = self.graph.next_unitig_kmer(current);
|
||||||
Some(current)
|
Some(current)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── UnitigIter ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
struct LongtigIter<'a> {
|
||||||
|
graph: &'a GraphDeBruijn,
|
||||||
|
current: Option<Kmer>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for LongtigIter<'_> {
|
||||||
|
type Item = Kmer;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Kmer> {
|
||||||
|
let current = self.current?;
|
||||||
|
self.current = self.graph.next_longtig_kmer(current);
|
||||||
|
Some(current)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
|
fn oriented_next(from: Kmer, to: CanonicalKmer, k: usize) -> Kmer {
|
||||||
if from.is_overlapping(to.into_kmer(), k) { to.into_kmer() } else { to.revcomp(k) }
|
if from.is_overlapping(to.into_kmer(), k) {
|
||||||
|
to.into_kmer()
|
||||||
|
} else {
|
||||||
|
to.revcomp(k)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||||
/// zero or ≥2 existing neighbours.
|
/// zero or ≥2 existing neighbours.
|
||||||
fn unique_neighbor(
|
fn count_neighbors(
|
||||||
neighbors: [CanonicalKmer; 4],
|
neighbors: [CanonicalKmer; 4],
|
||||||
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||||
) -> Option<u8> {
|
) -> (u8, Option<u8>) {
|
||||||
let mut found: Option<u8> = None;
|
let mut count = 0u8;
|
||||||
|
let mut first = None;
|
||||||
for (i, neighbour) in neighbors.iter().enumerate() {
|
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||||
if nodes.contains_key(neighbour) {
|
if nodes.contains_key(neighbour) {
|
||||||
if found.is_some() {
|
count += 1;
|
||||||
return None; // ≥2 neighbours
|
if first.is_none() {
|
||||||
}
|
first = Some(i as u8);
|
||||||
found = Some(i as u8);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
found
|
}
|
||||||
|
if count == 1 {
|
||||||
|
(1, first)
|
||||||
|
} else {
|
||||||
|
(count, None)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
@@ -385,11 +689,41 @@ mod tests {
|
|||||||
// Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k.
|
// Non-repetitive sequence: no k-mer appears twice, no homopolymer run of length k.
|
||||||
// ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain.
|
// ACGTGGCTA with k=5 → 5 distinct k-mers forming a clean linear chain.
|
||||||
let k = 5;
|
let k = 5;
|
||||||
let seq = b"ACGTGGCTA";
|
let seq = b"ACCTGGCTA";
|
||||||
let g = graph_from_ascii(seq, k);
|
let g = graph_from_ascii(seq, k);
|
||||||
g.compute_degrees();
|
g.compute_degrees();
|
||||||
|
println!("Les kmers:");
|
||||||
|
for (kmer, v) in g.nodes.iter() {
|
||||||
|
println!(
|
||||||
|
"{}: {}",
|
||||||
|
String::from_utf8_lossy(&kmer.to_ascii(k)),
|
||||||
|
v.get()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// println!("Les starts:");
|
||||||
|
// for (start, first_next) in g.start_iter() {
|
||||||
|
// if let Some(next) = first_next {
|
||||||
|
// println!(
|
||||||
|
// "{}->{}",
|
||||||
|
// String::from_utf8_lossy(&start.to_ascii(k)),
|
||||||
|
// String::from_utf8_lossy(&next.to_ascii(k))
|
||||||
|
// )
|
||||||
|
// } else {
|
||||||
|
// println!("{}->None", String::from_utf8_lossy(&start.to_ascii(k)))
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
println!("Les unitig:");
|
||||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
|
for unitig in &unitigs {
|
||||||
|
println!("{}", String::from_utf8_lossy(&unitig.to_ascii()));
|
||||||
|
}
|
||||||
|
assert_eq!(
|
||||||
|
unitigs.len(),
|
||||||
|
1,
|
||||||
|
"linear chain → exactly one unitig {:?}",
|
||||||
|
unitigs
|
||||||
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
kmers_from_unitigs(&unitigs, k),
|
kmers_from_unitigs(&unitigs, k),
|
||||||
canonical_kmers(seq, k),
|
canonical_kmers(seq, k),
|
||||||
@@ -488,4 +822,69 @@ mod tests {
|
|||||||
assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
|
assert_eq!(got.len(), expected.len(), "cycle k-mers lost");
|
||||||
assert_eq!(got, expected);
|
assert_eq!(got, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── branching graph ───────────────────────────────────────────────────────
|
||||||
|
//
|
||||||
|
// Topology (k=5): two sources A,B converge at C; chain C-D-E-F;
|
||||||
|
// F branches to G and H; H continues H-M-N; second source J feeds I-F.
|
||||||
|
// Every k-mer must appear in exactly one unitig (no duplication, no loss).
|
||||||
|
#[test]
|
||||||
|
fn branching_graph_no_kmer_lost_or_duplicated() {
|
||||||
|
// Build sequences that realise the topology without accidental overlaps.
|
||||||
|
// Each "node" is a distinct 5-mer; edges share a 4-mer suffix/prefix.
|
||||||
|
// We use long non-repetitive sequences and extract only the required kmers.
|
||||||
|
let k: usize = 5;
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
|
||||||
|
// Helper: insert all k-mers of a sequence.
|
||||||
|
let mut insert = |seq: &[u8]| {
|
||||||
|
for i in 0..=seq.len().saturating_sub(k) {
|
||||||
|
g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Chains that realise the topology:
|
||||||
|
// A-C (A→C share 4-mer overlap)
|
||||||
|
// B-C (B→C share 4-mer overlap, different prefix)
|
||||||
|
// C-D-E-F
|
||||||
|
// F-G (F→G)
|
||||||
|
// F-H-M-N (F→H→M→N)
|
||||||
|
// J-I-F (J→I→F)
|
||||||
|
insert(b"AACGTGGCTA"); // A-C-D … part of the right branch
|
||||||
|
insert(b"TACGTGGCTA"); // B-C-D … merges at C (same C-suffix)
|
||||||
|
insert(b"CGTGGCTACG"); // continues D-E-F-G
|
||||||
|
insert(b"CGTGGCTACC"); // F-H branch (different last base)
|
||||||
|
insert(b"GTGGCTACCGT"); // H-M-N continuation
|
||||||
|
insert(b"TTCGTGGCTA"); // J-I-F (different J prefix)
|
||||||
|
|
||||||
|
g.compute_degrees();
|
||||||
|
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||||
|
|
||||||
|
// Collect all k-mers from unitigs.
|
||||||
|
let got = kmers_from_unitigs(&unitigs, k);
|
||||||
|
|
||||||
|
// Collect all distinct canonical k-mers inserted.
|
||||||
|
let mut expected: Vec<CanonicalKmer> = Vec::new();
|
||||||
|
for seq in &[
|
||||||
|
b"AACGTGGCTA".as_slice(),
|
||||||
|
b"TACGTGGCTA",
|
||||||
|
b"CGTGGCTACG",
|
||||||
|
b"CGTGGCTACC",
|
||||||
|
b"GTGGCTACCGT",
|
||||||
|
b"TTCGTGGCTA",
|
||||||
|
] {
|
||||||
|
expected.extend(canonical_kmers(seq, k));
|
||||||
|
}
|
||||||
|
expected.sort_unstable();
|
||||||
|
expected.dedup();
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
got.len(),
|
||||||
|
expected.len(),
|
||||||
|
"k-mer count mismatch: got {}, expected {}",
|
||||||
|
got.len(),
|
||||||
|
expected.len()
|
||||||
|
);
|
||||||
|
assert_eq!(got, expected, "k-mer sets differ");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,141 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use niffler::Level;
|
||||||
|
use niffler::send::compression::Format;
|
||||||
|
use obidebruinj::GraphDeBruijn;
|
||||||
|
use obikpartitionner::KmerPartition;
|
||||||
|
use obiskio::SKFileReader;
|
||||||
|
use ph::fmph::GOFunction;
|
||||||
|
use rayon::prelude::*;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct LongtigArgs {
|
||||||
|
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||||
|
pub partition: PathBuf,
|
||||||
|
|
||||||
|
/// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
|
||||||
|
#[arg(long, default_value_t = 1)]
|
||||||
|
pub min_abundance: u32,
|
||||||
|
|
||||||
|
/// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
|
||||||
|
#[arg(long)]
|
||||||
|
pub max_abundance: Option<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: LongtigArgs) {
|
||||||
|
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening partition: {e}");
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
|
||||||
|
let k = kp.kmer_size();
|
||||||
|
let n = kp.n_partitions();
|
||||||
|
info!("building longtigs from {n} partitions (k={k}, parallel)");
|
||||||
|
|
||||||
|
let total_kmers = AtomicUsize::new(0);
|
||||||
|
|
||||||
|
(0..n).into_par_iter().for_each(|i| {
|
||||||
|
let part_dir = args.partition.join(format!("part_{i:05}"));
|
||||||
|
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||||
|
if !in_path.exists() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let out_path = part_dir.join("longtig.fasta.gz");
|
||||||
|
|
||||||
|
let mut g = GraphDeBruijn::new(k);
|
||||||
|
|
||||||
|
let mphf_path = part_dir.join("mphf1.bin");
|
||||||
|
let counts_path = part_dir.join("counts1.bin");
|
||||||
|
let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
|
||||||
|
&& mphf_path.exists()
|
||||||
|
&& counts_path.exists();
|
||||||
|
|
||||||
|
let mphf_opt: Option<GOFunction> = if filter_active {
|
||||||
|
let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening {}: {e}", mphf_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error reading MPHF {}: {e}", mphf_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
}))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let counts_mmap_opt = if filter_active {
|
||||||
|
let cf = File::open(&counts_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening {}: {e}", counts_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
Some(unsafe {
|
||||||
|
memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error mmapping {}: {e}", counts_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let counts_slice: Option<&[u32]> = counts_mmap_opt
|
||||||
|
.as_ref()
|
||||||
|
.map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
|
||||||
|
|
||||||
|
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening {}: {e}", in_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
for sk in reader.iter() {
|
||||||
|
for kmer in sk.iter_canonical_kmers(k) {
|
||||||
|
let accept = match (&mphf_opt, counts_slice) {
|
||||||
|
(Some(mphf), Some(counts)) => {
|
||||||
|
if let Some(slot) = mphf.get(&kmer) {
|
||||||
|
let ab = counts[slot as usize];
|
||||||
|
ab >= args.min_abundance
|
||||||
|
&& args.max_abundance.map_or(true, |max| ab <= max)
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => true,
|
||||||
|
};
|
||||||
|
if accept {
|
||||||
|
g.push(kmer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let n_kmers = g.len();
|
||||||
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
|
info!(
|
||||||
|
"partition {i}/{n}: {n_kmers} canonical k-mers → {}",
|
||||||
|
out_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
g.compute_degrees();
|
||||||
|
|
||||||
|
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error creating {}: {e}", out_path.display());
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||||
|
.unwrap_or_else(|e| {
|
||||||
|
eprintln!("error creating gzip writer: {e}");
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
g.write_fasta(&mut writer, false).unwrap_or_else(|e| {
|
||||||
|
eprintln!("write error on partition {i}: {e}");
|
||||||
|
std::process::exit(1)
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"done — {} total canonical k-mers across all partitions",
|
||||||
|
total_kmers.load(Ordering::Relaxed)
|
||||||
|
);
|
||||||
|
}
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
pub mod count;
|
pub mod count;
|
||||||
pub mod fasta;
|
pub mod fasta;
|
||||||
|
pub mod longtig;
|
||||||
pub mod partition;
|
pub mod partition;
|
||||||
pub mod superkmer;
|
pub mod superkmer;
|
||||||
pub mod unitig;
|
pub mod unitig;
|
||||||
|
|||||||
@@ -82,9 +82,9 @@ pub fn run(args: UnitigArgs) {
|
|||||||
None
|
None
|
||||||
};
|
};
|
||||||
|
|
||||||
let counts_slice: Option<&[u32]> = counts_mmap_opt.as_ref().map(|m| unsafe {
|
let counts_slice: Option<&[u32]> = counts_mmap_opt
|
||||||
std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4)
|
.as_ref()
|
||||||
});
|
.map(|m| unsafe { std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4) });
|
||||||
|
|
||||||
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||||
eprintln!("error opening {}: {e}", in_path.display());
|
eprintln!("error opening {}: {e}", in_path.display());
|
||||||
@@ -112,7 +112,10 @@ pub fn run(args: UnitigArgs) {
|
|||||||
|
|
||||||
let n_kmers = g.len();
|
let n_kmers = g.len();
|
||||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
info!("partition {i}/{n}: {n_kmers} canonical k-mers → {}", out_path.display());
|
info!(
|
||||||
|
"partition {i}/{n}: {n_kmers} canonical k-mers → {}",
|
||||||
|
out_path.display()
|
||||||
|
);
|
||||||
|
|
||||||
g.compute_degrees();
|
g.compute_degrees();
|
||||||
|
|
||||||
@@ -125,7 +128,7 @@ pub fn run(args: UnitigArgs) {
|
|||||||
eprintln!("error creating gzip writer: {e}");
|
eprintln!("error creating gzip writer: {e}");
|
||||||
std::process::exit(1)
|
std::process::exit(1)
|
||||||
});
|
});
|
||||||
g.write_fasta(&mut writer).unwrap_or_else(|e| {
|
g.write_fasta(&mut writer, true).unwrap_or_else(|e| {
|
||||||
eprintln!("write error on partition {i}: {e}");
|
eprintln!("write error on partition {i}: {e}");
|
||||||
std::process::exit(1)
|
std::process::exit(1)
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -23,11 +23,15 @@ enum Commands {
|
|||||||
Fasta(cmd::fasta::FastaArgs),
|
Fasta(cmd::fasta::FastaArgs),
|
||||||
/// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
|
/// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
|
||||||
Unitig(cmd::unitig::UnitigArgs),
|
Unitig(cmd::unitig::UnitigArgs),
|
||||||
|
/// Build de Bruijn longtigs for all partitions and write to longtig.fasta.gz
|
||||||
|
Longtig(cmd::longtig::LongtigArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
fmt()
|
fmt()
|
||||||
.with_env_filter(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
|
.with_env_filter(
|
||||||
|
EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
|
||||||
|
)
|
||||||
.with_writer(std::io::stderr)
|
.with_writer(std::io::stderr)
|
||||||
.init();
|
.init();
|
||||||
|
|
||||||
@@ -47,6 +51,7 @@ fn main() {
|
|||||||
Commands::Count(args) => cmd::count::run(args),
|
Commands::Count(args) => cmd::count::run(args),
|
||||||
Commands::Fasta(args) => cmd::fasta::run(args),
|
Commands::Fasta(args) => cmd::fasta::run(args),
|
||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
|
Commands::Longtig(args) => cmd::longtig::run(args),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "profiling")]
|
#[cfg(feature = "profiling")]
|
||||||
|
|||||||
@@ -234,6 +234,12 @@ impl CanonicalKmer {
|
|||||||
mix64(self.0.0)
|
mix64(self.0.0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
|
||||||
|
#[inline]
|
||||||
|
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||||
|
self.0.nucleotide(i)
|
||||||
|
}
|
||||||
|
|
||||||
/// Return the four left canonical neighbours (each already canonical).
|
/// Return the four left canonical neighbours (each already canonical).
|
||||||
/// Zero allocation — result lives on the stack.
|
/// Zero allocation — result lives on the stack.
|
||||||
pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
||||||
|
|||||||
Reference in New Issue
Block a user