feat: introduce layered MPHF indexing and partition metadata

Refactors obikindex and obikpartitionner to delegate index construction to a new layered MPHF implementation. Adds resume-safe building with abundance filtering and count persistence, while introducing a PartitionMeta struct for JSON configuration persistence. Updates OKIError to wrap layer-specific errors, replaces single-path extraction with full path collection and logging, and registers new internal dependencies across the workspace.
This commit is contained in:
Eric Coissac
2026-05-20 21:01:16 +02:00
parent 17c9e076bd
commit c5bcb7b8fa
10 changed files with 193 additions and 229 deletions
+4 -2
View File
@@ -13,8 +13,10 @@ obikrope = { path = "../obikrope" }
[dependencies]
niffler = "3.0.0"
remove_dir_all = "0.8"
obikseq = { path = "../obikseq" }
obiskio = { path = "../obiskio" }
obikseq = { path = "../obikseq" }
obiskio = { path = "../obiskio" }
obidebruinj = { path = "../obidebruinj" }
obilayeredmap = { path = "../obilayeredmap" }
rayon = "1"
sysinfo = "0.33"
serde = { version = "1", features = ["derive"] }
+135
View File
@@ -0,0 +1,135 @@
use std::fs;
use std::io;
use cacheline_ef::{CachelineEf, CachelineEfVec};
use epserde::prelude::*;
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
use obidebruinj::GraphDeBruijn;
use obilayeredmap::{OLMError, layer::Layer};
use obiskio::{SKError, SKFileMeta, SKFileReader};
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
use crate::partition::KmerPartition;
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
/// Translate a layered-map error into the super-kmer I/O error type.
///
/// I/O errors are forwarded unchanged; every other variant is flattened into
/// `SKError::InvalidData` carrying the original error's display text.
fn olm_to_sk(e: OLMError) -> SKError {
    match e {
        OLMError::Io(source) => SKError::Io(source),
        non_io => {
            let detail = non_io.to_string();
            SKError::InvalidData { context: "layer build", detail }
        }
    }
}
/// Best-effort file removal.
///
/// A missing file is not an error. Any other failure is reported as a
/// warning on stderr and otherwise ignored.
fn remove_if_exists(path: &std::path::Path) {
    match fs::remove_file(path) {
        Ok(()) => {}
        Err(e) if e.kind() == io::ErrorKind::NotFound => {}
        Err(e) => eprintln!("warning: could not remove {}: {e}", path.display()),
    }
}
impl KmerPartition {
/// Build the layered MPHF index for partition `i`.
///
/// Returns the number of canonical k-mers indexed, or 0 if the partition
/// has no data or its layer was already built (resume-safe).
///
/// Abundance filtering is applied when `min_ab > 1` or `max_ab.is_some()`,
/// using `mphf1.bin` + `counts1.bin` if they exist.
/// Count payload is stored iff `with_counts` is true.
pub fn build_index_layer(
&self,
i: usize,
min_ab: u32,
max_ab: Option<u32>,
with_counts: bool,
) -> Result<usize, SKError> {
let part_dir = self.part_dir(i);
let dedup_path = part_dir.join("dereplicated.skmer.zst");
if !dedup_path.exists() {
return Ok(0);
}
let layer_dir = part_dir.join("index").join("layer_0");
if layer_dir.join("mphf.bin").exists() {
return Ok(0);
}
let filter_active = min_ab > 1 || max_ab.is_some();
let need_counts = filter_active || with_counts;
let mphf1_opt: Option<Mphf> = if need_counts {
let p = part_dir.join("mphf1.bin");
p.exists().then(|| Mphf::load_full(&p).ok()).flatten()
} else {
None
};
let counts1_opt: Option<PersistentCompactIntVec> = if need_counts {
let p = part_dir.join("counts1.bin");
p.exists()
.then(|| PersistentCompactIntVec::open(&p).ok())
.flatten()
} else {
None
};
let mut g = GraphDeBruijn::new();
let mut reader = SKFileReader::open(&dedup_path)?;
for sk in reader.iter() {
for kmer in sk.iter_canonical_kmers() {
let accept = if filter_active {
match (&mphf1_opt, &counts1_opt) {
(Some(mphf), Some(counts)) => {
let ab = counts.get(mphf.index(&kmer.raw()));
ab >= min_ab && max_ab.map_or(true, |max| ab <= max)
}
_ => true,
}
} else {
true
};
if accept {
g.push(kmer);
}
}
}
let n_kmers = g.len();
g.compute_degrees();
fs::create_dir_all(&layer_dir)?;
let mut uw = Layer::<()>::unitig_writer(&layer_dir).map_err(olm_to_sk)?;
for unitig in g.iter_unitig() {
uw.write(&unitig)?;
}
uw.close()?;
if with_counts {
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
match (&mphf1_opt, &counts1_opt) {
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
_ => 1,
}
})
.map_err(olm_to_sk)?;
} else {
Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
}
Ok(n_kmers)
}
/// Remove intermediate build artifacts for partition `i`.
///
/// Deletes `dereplicated.skmer.zst` (+ sidecar), `mphf1.bin`, `counts1.bin`.
pub fn remove_build_artifacts(&self, i: usize) {
let part_dir = self.part_dir(i);
let dedup = part_dir.join("dereplicated.skmer.zst");
remove_if_exists(&SKFileMeta::sidecar_path(&dedup));
remove_if_exists(&dedup);
remove_if_exists(&part_dir.join("mphf1.bin"));
remove_if_exists(&part_dir.join("counts1.bin"));
}
}
+1
View File
@@ -1,3 +1,4 @@
mod index_layer;
mod kmer_sort;
mod partition;
+12 -61
View File
@@ -18,7 +18,6 @@ use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use rayon::prelude::*;
use remove_dir_all::remove_dir_all;
use serde::{Deserialize, Serialize};
use sysinfo::System;
use niffler::Level;
@@ -28,18 +27,9 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers};
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
const META_FILENAME: &str = "partition.meta";
const SK_EXT: &str = "skmer.zst";
pub const PARTITIONS_SUBDIR: &str = "partitions";
/// On-disk description of a partition set, (de)serialized as JSON in the
/// `partition.meta` file at the partition root.
#[derive(Serialize, Deserialize)]
struct PartitionMeta {
/// Number of partition bits; the partition count is `1 << n_bits`.
n_bits: usize,
/// K-mer size, mirrored from/to `KmerPartition::kmer_size`.
kmer_size: usize,
/// Minimizer size, mirrored from/to `KmerPartition::minimizer_size`.
minimizer_size: usize,
/// Compression level stored as its numeric `u32` form.
level: u32,
}
pub struct KmerPartition {
root_path: PathBuf,
n_partitions: usize,
@@ -98,11 +88,15 @@ impl KmerPartition {
level,
closed: false,
};
partition.write_meta(n_bits)?;
Ok(partition)
}
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
pub fn open_with_config<P: AsRef<Path>>(
path: P,
kmer_size: usize,
minimizer_size: usize,
n_bits: usize,
) -> SKResult<Self> {
let root_path = path.as_ref().to_owned();
if !root_path.exists() {
return Err(io::Error::new(
@@ -111,22 +105,17 @@ impl KmerPartition {
)
.into());
}
let meta_path = root_path.join(META_FILENAME);
let meta: PartitionMeta =
serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
let level = level_from_u32(meta.level);
let n_partitions = 1usize << meta.n_bits;
let n_partitions = 1usize << n_bits;
let writers = (0..n_partitions).map(|_| None).collect();
Ok(Self {
root_path,
n_partitions,
partitions_mask: (1u64 << meta.n_bits) - 1,
kmer_size: meta.kmer_size,
minimizer_size: meta.minimizer_size,
partitions_mask: (1u64 << n_bits) - 1,
kmer_size,
minimizer_size,
writers,
level,
closed: true, // read-only: writing is not allowed on an opened partition
level: Level::One,
closed: true,
})
}
@@ -339,18 +328,6 @@ impl KmerPartition {
}
}
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
let meta = PartitionMeta {
n_bits,
kmer_size: self.kmer_size,
minimizer_size: self.minimizer_size,
level: u32::from(self.level),
};
let f = fs::File::create(self.root_path.join(META_FILENAME))?;
serde_json::to_writer_pretty(f, &meta).map_err(|e| io::Error::other(e))?;
Ok(())
}
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
if self.writers[partition].is_none() {
let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
@@ -411,32 +388,6 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
n.next_power_of_two() as usize
}
fn level_from_u32(n: u32) -> Level {
match n {
0 => Level::Zero,
1 => Level::One,
2 => Level::Two,
3 => Level::Three,
4 => Level::Four,
5 => Level::Five,
6 => Level::Six,
7 => Level::Seven,
8 => Level::Eight,
9 => Level::Nine,
10 => Level::Ten,
11 => Level::Eleven,
12 => Level::Twelve,
13 => Level::Thirteen,
14 => Level::Fourteen,
15 => Level::Fifteen,
16 => Level::Sixteen,
17 => Level::Seventeen,
18 => Level::Eighteen,
19 => Level::Nineteen,
20 => Level::Twenty,
_ => Level::TwentyOne,
}
}
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header
/// (2^24 - 1 = 16_777_215).
const MAX_SK_COUNT: u64 = (1 << 24) - 1;