refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
@@ -15,6 +15,7 @@ use remove_dir_all::remove_dir_all;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -102,8 +103,8 @@ impl KmerPartition {
|
||||
.into());
|
||||
}
|
||||
let meta_path = root_path.join(META_FILENAME);
|
||||
let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?)
|
||||
.map_err(io::Error::other)?;
|
||||
let meta: PartitionMeta =
|
||||
serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
|
||||
|
||||
let level = level_from_u32(meta.level);
|
||||
let n_partitions = 1usize << meta.n_bits;
|
||||
@@ -120,19 +121,21 @@ impl KmerPartition {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write(&mut self, sk: &mut SuperKmer) -> SKResult<()> {
|
||||
/// Route and write one super-kmer to its partition file.
|
||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)
|
||||
}
|
||||
|
||||
pub fn write_batch(&mut self, sks: &mut [SuperKmer]) -> SKResult<()> {
|
||||
/// Route and write a batch of super-kmers.
|
||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)?;
|
||||
for rsk in rsks {
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -164,6 +167,18 @@ impl KmerPartition {
|
||||
&self.root_path
|
||||
}
|
||||
|
||||
pub fn kmer_size(&self) -> usize {
|
||||
self.kmer_size
|
||||
}
|
||||
|
||||
pub fn minimizer_size(&self) -> usize {
|
||||
self.minimizer_size
|
||||
}
|
||||
|
||||
pub fn n_partitions(&self) -> usize {
|
||||
self.n_partitions
|
||||
}
|
||||
|
||||
/// Deduplicate all `raw.{ext}` files in parallel, replacing each with a
|
||||
/// `dereplicated.{ext}` file where identical canonical sequences are merged
|
||||
/// and their counts summed.
|
||||
@@ -185,6 +200,7 @@ impl KmerPartition {
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let level = self.level;
|
||||
let k = self.kmer_size;
|
||||
let root = &self.root_path;
|
||||
let sys = System::new_all();
|
||||
// available_memory() can return 0 on macOS when the compressor page count exceeds
|
||||
@@ -205,7 +221,7 @@ impl KmerPartition {
|
||||
}
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
|
||||
dereplicate_partition(&dir, level, n_buckets)
|
||||
dereplicate_partition(&dir, level, n_buckets, k)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -270,8 +286,10 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
let global_spectrum_map: BTreeMap<String, u64> =
|
||||
global_spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let global_spectrum_map: BTreeMap<String, u64> = global_spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(root.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
|
||||
@@ -291,14 +309,6 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_of(&self, sk: &SuperKmer) -> SKResult<usize> {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, self.minimizer_size)
|
||||
.map_err(|e| io::Error::other(e))?
|
||||
.canonical(self.minimizer_size);
|
||||
Ok((minimizer.hash(self.minimizer_size) & self.partitions_mask) as usize)
|
||||
}
|
||||
|
||||
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
|
||||
let meta = PartitionMeta {
|
||||
n_bits,
|
||||
@@ -316,7 +326,8 @@ impl KmerPartition {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
let writer =
|
||||
SKFileWriter::create_with(file_path, self.kmer_size, Format::Zstd, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
@@ -373,33 +384,47 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
|
||||
|
||||
fn level_from_u32(n: u32) -> Level {
|
||||
match n {
|
||||
0 => Level::Zero, 1 => Level::One, 2 => Level::Two, 3 => Level::Three,
|
||||
4 => Level::Four, 5 => Level::Five, 6 => Level::Six, 7 => Level::Seven,
|
||||
8 => Level::Eight, 9 => Level::Nine, 10 => Level::Ten, 11 => Level::Eleven,
|
||||
12 => Level::Twelve, 13 => Level::Thirteen, 14 => Level::Fourteen,
|
||||
15 => Level::Fifteen, 16 => Level::Sixteen, 17 => Level::Seventeen,
|
||||
18 => Level::Eighteen, 19 => Level::Nineteen, 20 => Level::Twenty,
|
||||
0 => Level::Zero,
|
||||
1 => Level::One,
|
||||
2 => Level::Two,
|
||||
3 => Level::Three,
|
||||
4 => Level::Four,
|
||||
5 => Level::Five,
|
||||
6 => Level::Six,
|
||||
7 => Level::Seven,
|
||||
8 => Level::Eight,
|
||||
9 => Level::Nine,
|
||||
10 => Level::Ten,
|
||||
11 => Level::Eleven,
|
||||
12 => Level::Twelve,
|
||||
13 => Level::Thirteen,
|
||||
14 => Level::Fourteen,
|
||||
15 => Level::Fifteen,
|
||||
16 => Level::Sixteen,
|
||||
17 => Level::Seventeen,
|
||||
18 => Level::Eighteen,
|
||||
19 => Level::Nineteen,
|
||||
20 => Level::Twenty,
|
||||
_ => Level::TwentyOne,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||
|
||||
/// Deduplicate one partition directory in place (two-phase split + merge).
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
if !raw_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
|
||||
let mut writer = SKFileWriter::create_with(&out_path, k, Format::Zstd, level)?;
|
||||
|
||||
if n_temp == 1 {
|
||||
// ── Direct path: partition fits in memory, no split needed ────────────
|
||||
let map = load_bucket(&raw_path)?;
|
||||
let map = load_bucket(&raw_path, k)?;
|
||||
remove_skmer_file(&raw_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
} else {
|
||||
@@ -412,10 +437,10 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
{
|
||||
let mut writers: Vec<SKFileWriter> = temp_paths
|
||||
.iter()
|
||||
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
|
||||
.map(|p| SKFileWriter::create_with(p, k, Format::Zstd, level))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path)?;
|
||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let bucket = (sk.hash() & temp_mask) as usize;
|
||||
@@ -429,7 +454,7 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
|
||||
// ── Phase 2: merge each temp bucket into the output ───────────────────
|
||||
for temp_path in &temp_paths {
|
||||
let map = load_bucket(temp_path)?;
|
||||
let map = load_bucket(temp_path, k)?;
|
||||
remove_skmer_file(temp_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
}
|
||||
@@ -440,14 +465,14 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
}
|
||||
|
||||
/// Read a SuperKmer file into a deduplication map (already canonical).
|
||||
fn load_bucket(path: &Path) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
let capacity = SKFileMeta::read(path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|m| m.instances as usize)
|
||||
.unwrap_or(0);
|
||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||
let mut reader = SKFileReader::open(path)?;
|
||||
let mut reader = SKFileReader::open(path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let count = sk.count() as u64;
|
||||
@@ -487,7 +512,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass1_superkmers += 1;
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
@@ -497,7 +522,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
}
|
||||
let kmers: Vec<Kmer> = seen.into_iter().collect();
|
||||
let n_kmers = kmers.len();
|
||||
debug!("{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}", dir.display());
|
||||
debug!(
|
||||
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",
|
||||
dir.display()
|
||||
);
|
||||
|
||||
if n_kmers == 0 {
|
||||
return Ok(());
|
||||
@@ -527,13 +555,16 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
{
|
||||
let counts =
|
||||
unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut u32, n_kmers) };
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass2_superkmers += 1;
|
||||
let seql = sk.seql();
|
||||
let seql = sk.len();
|
||||
let sk_count = sk.count();
|
||||
if pass2_superkmers <= 3 {
|
||||
debug!("{}: sk#{pass2_superkmers} seql={seql} count={sk_count}", dir.display());
|
||||
debug!(
|
||||
"{}: sk#{pass2_superkmers} seql={seql} count={sk_count}",
|
||||
dir.display()
|
||||
);
|
||||
}
|
||||
if seql < k {
|
||||
continue;
|
||||
@@ -566,8 +597,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let f0 = n_kmers as u64;
|
||||
let f1: u64 = spectrum.iter().map(|(&c, &f)| c as u64 * f).sum();
|
||||
|
||||
let spectrum_map: BTreeMap<String, u64> =
|
||||
spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let spectrum_map: BTreeMap<String, u64> = spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(dir.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": f0, "f1": f1, "spectrum": &spectrum_map }),
|
||||
|
||||
Reference in New Issue
Block a user