feat: add configurable block sizes and in-place reindex command
Propagate a configurable block size (`block_bits`, i.e. 2^`block_bits` chunks per `.idx` block) through index and layer construction to control unitig chunking and tune the memory/performance trade-off. Introduce an in-place `reindex` command and library method to convert indexes between exact and approximate evidence formats. Add validation to reject merging indexes with mismatched evidence types, and update parallel kmer counting to use `AtomicUsize` for thread-safe aggregation. Includes CLI argument parsing, metadata persistence, and updated tests.
This commit is contained in:
@@ -80,39 +80,33 @@ impl<D: LayerData> Layer<D> {
|
||||
|
||||
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
|
||||
/// `mphf.bin` already present in `layer_dir`.
|
||||
///
|
||||
/// See [`MphfLayer::build_exact_evidence`] for the full contract.
|
||||
pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult<usize> {
|
||||
MphfLayer::build_exact_evidence(layer_dir)
|
||||
/// `block_bits` controls the `.idx` block size (2^block_bits chunks/block).
|
||||
pub fn build_exact_evidence(layer_dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build_exact_evidence(layer_dir, block_bits)
|
||||
}
|
||||
|
||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
||||
/// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere
|
||||
/// consecutive k-mer parameter (≥1).
|
||||
///
|
||||
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
|
||||
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
||||
}
|
||||
|
||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||
/// `kind`.
|
||||
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
||||
MphfLayer::build_evidence(layer_dir, kind)
|
||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence`.
|
||||
/// `block_bits` is forwarded to exact evidence only.
|
||||
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build_evidence(layer_dir, kind, block_bits)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||
|
||||
impl Layer<()> {
|
||||
pub fn build(out_dir: &Path) -> OLMResult<usize> {
|
||||
MphfLayer::build(out_dir, &mut |_, _| Ok(()))
|
||||
pub fn build(out_dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build(out_dir, block_bits, &mut |_, _| Ok(()))
|
||||
}
|
||||
|
||||
/// Create a presence matrix for a set-membership layer (first merge).
|
||||
///
|
||||
/// All `n_kmers` slots are set to `true`: every kmer in this layer belongs
|
||||
/// to genome_0, so genome_0 is present at every slot.
|
||||
pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> {
|
||||
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
||||
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
||||
@@ -126,16 +120,20 @@ impl Layer<()> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
|
||||
// ── Mode 2 — count matrix ─────────────────────────────────────────────────────
|
||||
|
||||
impl Layer<PersistentCompactIntMatrix> {
|
||||
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
pub fn build(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||
) -> OLMResult<usize> {
|
||||
let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers();
|
||||
let counts_dir = out_dir.join(COUNTS_DIR);
|
||||
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
|
||||
.map_err(OLMError::Io)?;
|
||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
||||
let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| {
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| {
|
||||
col.set(slot, count_of(kmer));
|
||||
Ok(())
|
||||
})?;
|
||||
@@ -146,16 +144,16 @@ impl Layer<PersistentCompactIntMatrix> {
|
||||
|
||||
pub fn build_from_map(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
) -> OLMResult<usize> {
|
||||
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
Self::build(out_dir, block_bits, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 2 — count matrix column append ──────────────────────────────────────
|
||||
|
||||
impl Layer<PersistentCompactIntMatrix> {
|
||||
/// Append a genome column to an existing count matrix.
|
||||
pub fn append_genome_column(
|
||||
layer_dir: &Path,
|
||||
value_of: impl Fn(usize) -> u32,
|
||||
@@ -165,10 +163,9 @@ impl Layer<PersistentCompactIntMatrix> {
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
|
||||
// ── Mode 3 — presence/absence matrix ─────────────────────────────────────────
|
||||
|
||||
impl Layer<PersistentBitMatrix> {
|
||||
/// Append a genome column to an existing presence matrix.
|
||||
pub fn append_genome_column(
|
||||
layer_dir: &Path,
|
||||
value_of: impl Fn(usize) -> bool,
|
||||
@@ -179,6 +176,7 @@ impl Layer<PersistentBitMatrix> {
|
||||
|
||||
pub fn build_presence(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
n_genomes: usize,
|
||||
present_in: impl Fn(CanonicalKmer, usize) -> bool,
|
||||
) -> OLMResult<usize> {
|
||||
@@ -188,7 +186,7 @@ impl Layer<PersistentBitMatrix> {
|
||||
let mut cols: Vec<_> = (0..n_genomes)
|
||||
.map(|_| mb.add_col().map_err(OLMError::Io))
|
||||
.collect::<OLMResult<_>>()?;
|
||||
let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| {
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| {
|
||||
for (g, col) in cols.iter_mut().enumerate() {
|
||||
col.set(slot, present_in(kmer, g));
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::UnitigFileWriter;
|
||||
use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS};
|
||||
|
||||
use crate::error::OLMResult;
|
||||
use crate::layer::{Hit, Layer, LayerData};
|
||||
@@ -90,7 +90,7 @@ impl LayeredMap<()> {
|
||||
pub fn push_layer(&mut self) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<()>::build(&dir)?;
|
||||
Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
@@ -102,7 +102,7 @@ impl LayeredMap<PersistentCompactIntMatrix> {
|
||||
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<PersistentCompactIntMatrix>::build(&dir, count_of)?;
|
||||
Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, count_of)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::path::Path;
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS};
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx};
|
||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
@@ -110,25 +110,26 @@ impl MphfLayer {
|
||||
}
|
||||
|
||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||
/// `kind`.
|
||||
pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
||||
/// `kind`. `block_bits` is forwarded to exact evidence only.
|
||||
pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||
match kind {
|
||||
EvidenceKind::Exact => Self::build_exact_evidence(dir),
|
||||
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
||||
EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits),
|
||||
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
||||
///
|
||||
/// `block_bits` controls the `.idx` block size (2^block_bits chunks per block).
|
||||
/// Uses sequential iteration — no `.idx` required on entry.
|
||||
pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
|
||||
pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||
let unitig_path = dir.join(UNITIGS_FILE);
|
||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let n = unitigs.n_kmers();
|
||||
|
||||
if n == 0 {
|
||||
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
return Ok(0);
|
||||
}
|
||||
@@ -154,7 +155,7 @@ impl MphfLayer {
|
||||
}
|
||||
|
||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
Ok(n)
|
||||
}
|
||||
@@ -202,13 +203,14 @@ impl MphfLayer {
|
||||
/// population. Returns the number of kmers indexed.
|
||||
pub(crate) fn build(
|
||||
dir: &Path,
|
||||
block_bits: u8,
|
||||
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
||||
) -> OLMResult<usize> {
|
||||
use rayon::prelude::*;
|
||||
|
||||
let unitig_path = dir.join(UNITIGS_FILE);
|
||||
|
||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
|
||||
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
||||
let n = unitigs.n_kmers();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::*;
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||
use obiskio::DEFAULT_BLOCK_BITS;
|
||||
use tempfile::tempdir;
|
||||
|
||||
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
||||
@@ -23,7 +24,7 @@ fn build_and_query_all_kmers_found() {
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
let kmers = all_canonical_kmers(dir.path());
|
||||
Layer::<()>::build(dir.path()).unwrap();
|
||||
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap();
|
||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||
for kmer in kmers {
|
||||
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
||||
@@ -40,6 +41,7 @@ fn counts_are_stored_and_retrieved() {
|
||||
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
|
||||
Layer::<PersistentCompactIntMatrix>::build(
|
||||
dir.path(),
|
||||
DEFAULT_BLOCK_BITS,
|
||||
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
||||
).unwrap();
|
||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||
@@ -54,7 +56,7 @@ fn query_absent_returns_none() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
Layer::<()>::build(dir.path()).unwrap();
|
||||
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap();
|
||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
||||
assert!(layer.query(absent).is_none());
|
||||
@@ -65,7 +67,7 @@ fn open_after_build_is_consistent() {
|
||||
set_k(4);
|
||||
let dir = tempdir().unwrap();
|
||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), |_| 7).unwrap();
|
||||
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, |_| 7).unwrap();
|
||||
assert_eq!(n, 4);
|
||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
||||
|
||||
Reference in New Issue
Block a user