refactor: centralize k-mer config and introduce packed sequences
Centralize k-mer and minimizer configuration using a thread-safe global module, and replace manual bit-packing with a memory-efficient `PackedSeq` type. Refactor core sequence and k-mer types to use compile-time length enforcement and centralized hashing. Introduce a new De Bruijn graph implementation with compact node encoding and traversal iterators. Update I/O, partitioning, and builder modules to align with the new architecture, and add the `xxhash-rust` dependency.
This commit is contained in:
@@ -15,6 +15,7 @@ use remove_dir_all::remove_dir_all;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obikseq::Sequence;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||
use rayon::prelude::*;
|
||||
@@ -124,8 +125,7 @@ impl KmerPartition {
|
||||
/// Route and write one super-kmer to its partition file.
|
||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition =
|
||||
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let partition = (rsk.minimizer().seq_hash() & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)
|
||||
}
|
||||
@@ -134,8 +134,7 @@ impl KmerPartition {
|
||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for rsk in rsks {
|
||||
let partition =
|
||||
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let partition = (rsk.minimizer().seq_hash() & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)?;
|
||||
}
|
||||
@@ -202,7 +201,6 @@ impl KmerPartition {
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let level = self.level;
|
||||
let k = self.kmer_size;
|
||||
let root = &self.root_path;
|
||||
let sys = System::new_all();
|
||||
// available_memory() can return 0 on macOS when the compressor page count exceeds
|
||||
@@ -223,7 +221,7 @@ impl KmerPartition {
|
||||
}
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
|
||||
dereplicate_partition(&dir, level, n_buckets, k)
|
||||
dereplicate_partition(&dir, level, n_buckets)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -328,8 +326,7 @@ impl KmerPartition {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer =
|
||||
SKFileWriter::create_with(file_path, self.kmer_size, Format::Zstd, self.level)?;
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
@@ -415,18 +412,18 @@ fn level_from_u32(n: u32) -> Level {
|
||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||
|
||||
/// Deduplicate one partition directory in place (two-phase split + merge).
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> SKResult<()> {
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
if !raw_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, k, Format::Zstd, level)?;
|
||||
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
|
||||
|
||||
if n_temp == 1 {
|
||||
// ── Direct path: partition fits in memory, no split needed ────────────
|
||||
let map = load_bucket(&raw_path, k)?;
|
||||
let map = load_bucket(&raw_path)?;
|
||||
remove_skmer_file(&raw_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
} else {
|
||||
@@ -439,10 +436,10 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S
|
||||
{
|
||||
let mut writers: Vec<SKFileWriter> = temp_paths
|
||||
.iter()
|
||||
.map(|p| SKFileWriter::create_with(p, k, Format::Zstd, level))
|
||||
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||
let mut reader = SKFileReader::open(&raw_path)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
let bucket = (sk.seq_hash() & temp_mask) as usize;
|
||||
writers[bucket].write(&sk)?;
|
||||
@@ -455,7 +452,7 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S
|
||||
|
||||
// ── Phase 2: merge each temp bucket into the output ───────────────────
|
||||
for temp_path in &temp_paths {
|
||||
let map = load_bucket(temp_path, k)?;
|
||||
let map = load_bucket(temp_path)?;
|
||||
remove_skmer_file(temp_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
}
|
||||
@@ -466,14 +463,14 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S
|
||||
}
|
||||
|
||||
/// Read a SuperKmer file into a deduplication map (already canonical).
|
||||
fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
fn load_bucket(path: &Path) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
let capacity = SKFileMeta::read(path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|m| m.instances as usize)
|
||||
.unwrap_or(0);
|
||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||
let mut reader = SKFileReader::open(path, k)?;
|
||||
let mut reader = SKFileReader::open(path)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
let count = sk.count() as u64;
|
||||
*map.entry(sk).or_insert(0) += count;
|
||||
@@ -512,10 +509,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let mut seen: HashSet<CanonicalKmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass1_superkmers += 1;
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
seen.insert(kmer);
|
||||
}
|
||||
}
|
||||
@@ -555,10 +552,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
{
|
||||
let counts =
|
||||
unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut u32, n_kmers) };
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass2_superkmers += 1;
|
||||
let seql = sk.len();
|
||||
let seql = sk.seql();
|
||||
let sk_count = sk.count();
|
||||
if pass2_superkmers <= 3 {
|
||||
debug!(
|
||||
@@ -570,7 +567,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
continue;
|
||||
}
|
||||
pass2_count_sum += sk_count as u64;
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
for kmer in sk.iter_canonical_kmers() {
|
||||
if let Some(idx) = mphf.get(&kmer) {
|
||||
counts[idx as usize] = counts[idx as usize].saturating_add(sk_count);
|
||||
pass2_kmer_hits += 1;
|
||||
|
||||
Reference in New Issue
Block a user