feat: add k-mer index rebuild and compaction feature

This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
This commit is contained in:
Eric Coissac
2026-05-21 12:11:55 +02:00
parent 3fa1dbf8cc
commit d9aa211b8f
9 changed files with 530 additions and 3 deletions
+205
View File
@@ -0,0 +1,205 @@
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use obicompactvec::{PersistentBitMatrixBuilder,
PersistentBitVecBuilder,
PersistentCompactIntMatrixBuilder,
PersistentCompactIntVecBuilder};
use obidebruinj::GraphDeBruijn;
use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{Layer, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::filter::KmerFilter;
use crate::merge_layer::{MergeMode, SrcLayerData};
use crate::partition::KmerPartition;
/// Name of the sub-directory, inside a partition directory, that holds the
/// layered index: the `layer_<n>` directories and the partition metadata.
const INDEX_SUBDIR: &str = "index";
/// Converts an [`OLMError`] into the [`SKError`] type returned by this module.
///
/// The I/O variant keeps its payload and becomes `SKError::Io`. Every other
/// variant is flattened into `SKError::InvalidData`, tagged with the
/// `"rebuild"` context and carrying the string rendering of the original error.
fn olm_to_sk(e: OLMError) -> SKError {
    match e {
        OLMError::Io(io_err) => SKError::Io(io_err),
        non_io => {
            let detail = non_io.to_string();
            SKError::InvalidData {
                context: "rebuild",
                detail,
            }
        }
    }
}
/// Returns the path of bit-vector column `col` inside `dir`.
///
/// The file is named `col_<NNNNNN>.pbiv`, where the column index is
/// zero-padded to at least six digits.
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pbiv", col);
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Returns the path of compact-integer column `col` inside `dir`.
///
/// The file is named `col_<NNNNNN>.pciv`, where the column index is
/// zero-padded to at least six digits.
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pciv", col);
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
/// Writes `meta.json` into `dir`, recording the matrix dimensions.
///
/// The file holds a single JSON object followed by a newline:
/// `{"n":<n>,"n_cols":<n_cols>}`. An existing `meta.json` is overwritten.
///
/// # Errors
///
/// Returns the `io::Error` reported by the underlying file write.
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
    let target = dir.join("meta.json");
    let payload = format!("{{\"n\":{},\"n_cols\":{}}}\n", n, n_cols);
    fs::write(target, payload)
}
// ── ColBuilder ────────────────────────────────────────────────────────────────
/// Writer for one output column of the rebuilt matrix.
///
/// `rebuild_partition` creates one `ColBuilder` per genome column: the `Bit`
/// variant in `MergeMode::Presence`, the `Int` variant in `MergeMode::Count`.
enum ColBuilder {
/// Bit-vector column; `set_val` stores `value > 0`.
Bit(PersistentBitVecBuilder),
/// Compact-integer column; `set_val` stores the value unchanged.
Int(PersistentCompactIntVecBuilder),
}
impl ColBuilder {
    /// Stores `value` at row `slot` of this column.
    ///
    /// A bit column records only whether `value` is non-zero; an integer
    /// column records `value` itself.
    fn set_val(&mut self, slot: usize, value: u32) {
        match self {
            Self::Bit(bits) => {
                let present = value != 0;
                bits.set(slot, present);
            }
            Self::Int(ints) => {
                ints.set(slot, value);
            }
        }
    }

    /// Consumes the builder and closes the underlying column builder.
    ///
    /// # Errors
    ///
    /// An error reported by the underlying `close` is wrapped in `SKError::Io`.
    fn close(self) -> SKResult<()> {
        let outcome = match self {
            Self::Bit(bits) => bits.close(),
            Self::Int(ints) => ints.close(),
        };
        outcome.map_err(SKError::Io)
    }
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/// Loads the [`PartitionMeta`] stored in `dir`, reconstructing it if absent.
///
/// When loading fails with an I/O error of kind `NotFound`, the number of
/// layers is recovered by counting consecutive `layer_0`, `layer_1`, …
/// entries in `dir`. The reconstructed metadata is then saved back into `dir`
/// before being returned. Note that `rebuild_partition` calls this on the
/// *source* index directory, so the fallback writes into the source.
///
/// # Errors
///
/// Any other load failure, or a failure to save the reconstructed metadata,
/// is converted with `olm_to_sk` and returned.
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
    let load_err = match PartitionMeta::load(dir) {
        Ok(meta) => return Ok(meta),
        Err(err) => err,
    };

    let meta_missing = matches!(
        &load_err,
        OLMError::Io(io_err) if io_err.kind() == std::io::ErrorKind::NotFound
    );
    if !meta_missing {
        return Err(olm_to_sk(load_err));
    }

    // Count layer directories until the first gap in the numbering.
    let n_layers = (0usize..)
        .take_while(|idx| dir.join(format!("layer_{idx}")).exists())
        .count();

    let rebuilt = PartitionMeta { n_layers };
    rebuilt.save(dir).map_err(olm_to_sk)?;
    Ok(rebuilt)
}
/// Returns `true` when `row` is accepted by every filter in `filters`.
///
/// Filters are evaluated in order and evaluation stops at the first
/// rejection. An empty filter list accepts every row.
fn passes_all(filters: &[Box<dyn KmerFilter>], row: &[u32], n_genomes: usize) -> bool {
    for filter in filters {
        if !filter.passes(row, n_genomes) {
            return false;
        }
    }
    true
}
// ── KmerPartition::rebuild_partition ─────────────────────────────────────────
impl KmerPartition {
    /// Rebuild partition `i` from `src` into `self` (an empty destination partition).
    ///
    /// Only k-mers whose per-genome row passes all `filters` are written.
    /// The output is a single-layer index (`layer_0`), regardless of how many
    /// layers the source has.
    ///
    /// `n_genomes` is the number of genome columns in the source (and destination).
    ///
    /// The rebuild runs in two passes over the source layers:
    ///
    /// 1. every k-mer that passes the filters is pushed into a de Bruijn
    ///    graph, whose unitigs are written to the destination `layer_0` and
    ///    used to build that layer;
    /// 2. the source is scanned again and each retained row is copied into the
    ///    destination columns at the slot the new MPHF assigns to its k-mer.
    ///
    /// Nothing is written, and `Ok(())` is returned, when the source partition
    /// has no index directory, has zero layers, or no k-mer passes the filters.
    ///
    /// # Errors
    ///
    /// Returns an error when reading the source or writing the destination
    /// fails. Also returns `SKError::InvalidData` when a retained k-mer cannot
    /// be found in the rebuilt MPHF, or when a source row holds more values
    /// than there are destination columns. Either case would otherwise yield
    /// a silently incomplete matrix or an out-of-bounds panic.
    pub fn rebuild_partition(
        &self,
        src: &KmerPartition,
        i: usize,
        filters: &[Box<dyn KmerFilter>],
        mode: MergeMode,
        n_genomes: usize,
    ) -> SKResult<()> {
        let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
        if !src_index_dir.exists() {
            return Ok(());
        }
        let src_meta = load_meta(&src_index_dir)?;
        if src_meta.n_layers == 0 {
            return Ok(());
        }

        // ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
        let mut g = GraphDeBruijn::new();
        for (src_layer_dir, unitigs_path) in
            layers_with_unitigs(&src_index_dir, src_meta.n_layers)
        {
            let reader = UnitigFileReader::open(&unitigs_path)?;
            let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
            for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                let row = src_data.lookup(kmer, n_genomes);
                if passes_all(filters, &row, n_genomes) {
                    g.push(kmer);
                }
            }
        }
        let n_new = g.len();
        if n_new == 0 {
            return Ok(());
        }
        g.compute_degrees();

        // ── Build MPHF in dst layer_0 ─────────────────────────────────────────
        let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
        let dst_layer_dir = dst_index_dir.join("layer_0");
        fs::create_dir_all(&dst_layer_dir)?;
        let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?;
        for unitig in g.iter_unitig() {
            uw.write(&unitig)?;
        }
        uw.close()?;
        // The graph is no longer needed once its unitigs are on disk; release
        // it before building the layer.
        drop(g);
        Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?;
        let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;

        // ── Prepare matrix builders (one column per genome) ───────────────────
        let data_dir = match mode {
            MergeMode::Presence => dst_layer_dir.join("presence"),
            MergeMode::Count => dst_layer_dir.join("counts"),
        };
        fs::create_dir_all(&data_dir)?;
        let mut builders = open_col_builders(mode, &data_dir, n_new, n_genomes)?;

        // ── Pass 2: fill builders ─────────────────────────────────────────────
        for (src_layer_dir, unitigs_path) in
            layers_with_unitigs(&src_index_dir, src_meta.n_layers)
        {
            let reader = UnitigFileReader::open(&unitigs_path)?;
            let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
            for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                let row = src_data.lookup(kmer, n_genomes);
                if !passes_all(filters, &row, n_genomes) {
                    continue;
                }
                // Every k-mer retained here was pushed into the graph during
                // pass 1, so the rebuilt MPHF must resolve it. A miss means
                // the destination layer is inconsistent: report it instead of
                // silently leaving the row unset.
                let slot = dst_mphf.find(kmer).ok_or_else(|| SKError::InvalidData {
                    context: "rebuild",
                    detail: format!(
                        "partition {i}: a retained k-mer from {} is missing from the rebuilt MPHF",
                        src_layer_dir.display()
                    ),
                })?;
                let values: &[u32] = &row;
                if values.len() > builders.len() {
                    return Err(SKError::InvalidData {
                        context: "rebuild",
                        detail: format!(
                            "partition {i}: source row from {} has {} values but only {} columns exist",
                            src_layer_dir.display(),
                            values.len(),
                            builders.len()
                        ),
                    });
                }
                for (builder, &value) in builders.iter_mut().zip(values) {
                    builder.set_val(slot, value);
                }
            }
        }

        // ── Close builders, write metadata ────────────────────────────────────
        for builder in builders {
            builder.close()?;
        }
        write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
        PartitionMeta { n_layers: 1 }
            .save(&dst_index_dir)
            .map_err(olm_to_sk)?;
        Ok(())
    }
}

/// Yields `(layer_dir, unitigs_path)` for each layer `0..n_layers` of
/// `index_dir` whose `unitigs.bin` file exists, in layer order.
///
/// Layers without a unitig file are skipped. Existence is checked lazily, when
/// the iterator reaches the layer.
fn layers_with_unitigs(
    index_dir: &Path,
    n_layers: usize,
) -> impl Iterator<Item = (PathBuf, PathBuf)> + '_ {
    (0..n_layers).filter_map(move |l| {
        let layer_dir = index_dir.join(format!("layer_{l}"));
        let unitigs_path = layer_dir.join("unitigs.bin");
        if unitigs_path.exists() {
            Some((layer_dir, unitigs_path))
        } else {
            None
        }
    })
}

/// Initialises the matrix in `data_dir` for `mode` and opens one column
/// builder per column, each sized for `n_rows` rows.
///
/// The matrix-level builder is created and closed immediately; the columns are
/// then written through individual per-column builders.
fn open_col_builders(
    mode: MergeMode,
    data_dir: &Path,
    n_rows: usize,
    n_cols: usize,
) -> SKResult<Vec<ColBuilder>> {
    match mode {
        MergeMode::Presence => {
            PersistentBitMatrixBuilder::new(n_rows, data_dir)
                .map_err(SKError::Io)?
                .close()
                .map_err(SKError::Io)?;
            (0..n_cols)
                .map(|col| -> SKResult<ColBuilder> {
                    let b = PersistentBitVecBuilder::new(n_rows, &col_path_bit(data_dir, col))?;
                    Ok(ColBuilder::Bit(b))
                })
                .collect()
        }
        MergeMode::Count => {
            PersistentCompactIntMatrixBuilder::new(n_rows, data_dir)
                .map_err(SKError::Io)?
                .close()
                .map_err(SKError::Io)?;
            (0..n_cols)
                .map(|col| -> SKResult<ColBuilder> {
                    let b =
                        PersistentCompactIntVecBuilder::new(n_rows, &col_path_int(data_dir, col))?;
                    Ok(ColBuilder::Int(b))
                })
                .collect()
        }
    }
}