feat: add k-mer index rebuild and compaction feature

This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
This commit is contained in:
Eric Coissac
2026-05-21 12:11:55 +02:00
parent 3fa1dbf8cc
commit d9aa211b8f
9 changed files with 530 additions and 3 deletions
+1
View File
@@ -5,6 +5,7 @@ mod distance;
mod dump; mod dump;
mod index; mod index;
mod merge; mod merge;
mod rebuild;
pub use error::{OKIError, OKIResult}; pub use error::{OKIError, OKIResult};
pub use distance::{DistanceMetric, DistanceOutput}; pub use distance::{DistanceMetric, DistanceOutput};
+116
View File
@@ -0,0 +1,116 @@
use std::fs;
use std::io;
use std::path::Path;
use std::time::Duration;
use indicatif::{ProgressBar, ProgressStyle};
use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::IndexMeta;
use crate::state::{IndexState, SENTINEL_INDEXED};
impl KmerIndex {
/// Rebuild `src` into a new compact single-layer index at `output`.
///
/// Only k-mers whose per-genome row passes every filter in `filters` are
/// written. If `filters` is empty every k-mer is kept (pure compaction).
///
/// `mode` controls whether the output stores counts or presence/absence.
/// A count source may be rebuilt in presence mode; a presence source
/// cannot be rebuilt in count mode.
pub fn rebuild<P: AsRef<Path>>(
output: P,
src: &KmerIndex,
filters: &[Box<dyn KmerFilter>],
mode: MergeMode,
force: bool,
rep: &mut Reporter,
) -> OKIResult<Self> {
let output = output.as_ref();
if src.state() != IndexState::Indexed {
return Err(OKIError::NotIndexed(src.root_path.clone()));
}
if mode == MergeMode::Count && !src.meta.config.with_counts {
return Err(OKIError::InvalidInput(
"cannot rebuild in count mode from a presence-only source index".into(),
));
}
if output.exists() {
if force {
fs::remove_dir_all(output)?;
} else {
return Err(OKIError::Io(io::Error::new(
io::ErrorKind::AlreadyExists,
format!("{}: output directory already exists", output.display()),
)));
}
}
// ── Create output directory + metadata ────────────────────────────────
fs::create_dir_all(output)?;
let mut meta = IndexMeta::new(src.meta.config.clone());
meta.config.with_counts = mode == MergeMode::Count;
meta.genomes = src.meta.genomes.clone();
meta.write(output)?;
let n_genomes = src.meta.genomes.len();
let n_partitions = src.partition.n_partitions();
// ── Create an empty destination KmerPartition ─────────────────────────
// Create the partitions/ subdirectory so KmerPartition::open_with_config works.
fs::create_dir_all(output.join(obikpartitionner::PARTITIONS_SUBDIR))?;
let dst_partition = KmerPartition::open_with_config(
output,
meta.config.kmer_size,
meta.config.minimizer_size,
meta.config.n_bits,
)?;
info!(
"rebuild: {} partition(s), {} genome(s), mode={:?}",
n_partitions, n_genomes, mode,
);
let t = Stage::start("rebuild");
let pb = ProgressBar::new(n_partitions as u64).with_style(
ProgressStyle::with_template("rebuild — [{bar:20}] {pos}/{len} | {msg}")
.unwrap()
.progress_chars("=> "),
);
pb.enable_steady_tick(Duration::from_millis(100));
let src_partition = &src.partition;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let result = dst_partition
.rebuild_partition(src_partition, i, filters, mode, n_genomes)
.err();
pb.inc(1);
result
})
.collect();
pb.finish_and_clear();
if let Some(e) = errors.into_iter().next() {
return Err(OKIError::Partition(e));
}
rep.push(t.stop());
// Write SENTINEL_INDEXED — output is ready to use.
fs::File::create(output.join(SENTINEL_INDEXED))?;
KmerIndex::open(output)
}
}
+1
View File
@@ -2,5 +2,6 @@ pub mod distance;
pub mod dump; pub mod dump;
pub mod index; pub mod index;
pub mod merge; pub mod merge;
pub mod rebuild;
pub mod superkmer; pub mod superkmer;
pub mod unitig; pub mod unitig;
+111
View File
@@ -0,0 +1,111 @@
use std::path::PathBuf;
use clap::Args;
use obikindex::{KmerIndex, MergeMode};
use obikpartitionner::filter::{
KmerFilter, MaxGenomeCount, MaxGenomeFraction, MaxTotalCount,
MinGenomeCount, MinGenomeFraction, MinTotalCount,
};
use obisys::Reporter;
use obikseq::{set_k, set_m};
use tracing::info;
#[derive(Args)]
pub struct RebuildArgs {
/// Source index directory
pub source: PathBuf,
/// Output index directory
#[arg(short, long)]
pub output: PathBuf,
/// Minimum fraction of genomes containing the k-mer [0.01.0]
#[arg(long)]
pub min_quorum_frac: Option<f64>,
/// Maximum fraction of genomes containing the k-mer [0.01.0]
#[arg(long)]
pub max_quorum_frac: Option<f64>,
/// Minimum number of genomes containing the k-mer
#[arg(long)]
pub min_quorum_count: Option<usize>,
/// Maximum number of genomes containing the k-mer
#[arg(long)]
pub max_quorum_count: Option<usize>,
/// Minimum total count across all genomes (count index only)
#[arg(long)]
pub min_total_count: Option<u32>,
/// Maximum total count across all genomes (count index only)
#[arg(long)]
pub max_total_count: Option<u32>,
/// Per-genome count threshold for presence in quorum filters (default 0)
#[arg(long, default_value = "0")]
pub presence_threshold: u32,
/// Output as presence/absence instead of counts
#[arg(long)]
pub presence: bool,
/// Overwrite existing output directory
#[arg(short, long)]
pub force: bool,
}
pub fn run(args: RebuildArgs) {
let src = KmerIndex::open(&args.source).unwrap_or_else(|e| {
eprintln!("error opening source index: {e}");
std::process::exit(1);
});
set_k(src.kmer_size());
set_m(src.minimizer_size());
let n_genomes = src.meta().genomes.len();
let mode = if args.presence || !src.meta().config.with_counts {
MergeMode::Presence
} else {
MergeMode::Count
};
info!(
"rebuild: {} genome(s), mode={:?}, source={}",
n_genomes, mode, args.source.display()
);
let pt = args.presence_threshold;
let mut filters: Vec<Box<dyn KmerFilter>> = Vec::new();
if let Some(v) = args.min_quorum_frac {
filters.push(Box::new(MinGenomeFraction { frac: v, threshold: pt }));
}
if let Some(v) = args.max_quorum_frac {
filters.push(Box::new(MaxGenomeFraction { frac: v, threshold: pt }));
}
if let Some(v) = args.min_quorum_count {
filters.push(Box::new(MinGenomeCount { count: v, threshold: pt }));
}
if let Some(v) = args.max_quorum_count {
filters.push(Box::new(MaxGenomeCount { count: v, threshold: pt }));
}
if let Some(v) = args.min_total_count {
filters.push(Box::new(MinTotalCount { total: v }));
}
if let Some(v) = args.max_total_count {
filters.push(Box::new(MaxTotalCount { total: v }));
}
let mut rep = Reporter::new();
KmerIndex::rebuild(&args.output, &src, &filters, mode, args.force, &mut rep)
.unwrap_or_else(|e| {
eprintln!("error rebuilding index: {e}");
std::process::exit(1);
});
rep.print();
info!("rebuilt index → {}", args.output.display());
}
+3
View File
@@ -20,6 +20,8 @@ enum Commands {
Index(cmd::index::IndexArgs), Index(cmd::index::IndexArgs),
/// Merge multiple built indexes into one /// Merge multiple built indexes into one
Merge(cmd::merge::MergeArgs), Merge(cmd::merge::MergeArgs),
/// Filter and compact an existing index into a new single-layer index
Rebuild(cmd::rebuild::RebuildArgs),
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
Dump(cmd::dump::DumpArgs), Dump(cmd::dump::DumpArgs),
/// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees /// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
@@ -51,6 +53,7 @@ fn main() {
Commands::Index(args) => cmd::index::run(args), Commands::Index(args) => cmd::index::run(args),
Commands::Merge(args) => cmd::merge::run(args), Commands::Merge(args) => cmd::merge::run(args),
Commands::Dump(args) => cmd::dump::run(args), Commands::Dump(args) => cmd::dump::run(args),
Commands::Rebuild(args) => cmd::rebuild::run(args),
Commands::Distance(args) => cmd::distance::run(args), Commands::Distance(args) => cmd::distance::run(args),
Commands::Unitig(args) => cmd::unitig::run(args), Commands::Unitig(args) => cmd::unitig::run(args),
} }
+87
View File
@@ -0,0 +1,87 @@
/// Trait for kmer row filters used in the rebuild command.
///
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
/// `n_genomes` equals `row.len()`.
pub trait KmerFilter: Send + Sync {
fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
}
// ── Quorum filters ─────────────────────────────────────────────────────────────
fn present_count(row: &[u32], threshold: u32) -> usize {
row.iter().filter(|&&v| v > threshold).count()
}
/// At least `frac` fraction of genomes contain this kmer (count > `threshold`).
pub struct MinGenomeFraction {
pub frac: f64,
pub threshold: u32,
}
impl KmerFilter for MinGenomeFraction {
fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
let p = present_count(row, self.threshold);
p as f64 / n_genomes as f64 >= self.frac
}
}
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
pub struct MaxGenomeFraction {
pub frac: f64,
pub threshold: u32,
}
impl KmerFilter for MaxGenomeFraction {
fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
let p = present_count(row, self.threshold);
p as f64 / n_genomes as f64 <= self.frac
}
}
/// At least `count` genomes contain this kmer (count > `threshold`).
pub struct MinGenomeCount {
pub count: usize,
pub threshold: u32,
}
impl KmerFilter for MinGenomeCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
present_count(row, self.threshold) >= self.count
}
}
/// At most `count` genomes contain this kmer (count > `threshold`).
pub struct MaxGenomeCount {
pub count: usize,
pub threshold: u32,
}
impl KmerFilter for MaxGenomeCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
present_count(row, self.threshold) <= self.count
}
}
// ── Total-count filters (count indexes only) ───────────────────────────────────
/// Sum of counts across all genomes >= `total`.
pub struct MinTotalCount {
pub total: u32,
}
impl KmerFilter for MinTotalCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
row.iter().sum::<u32>() >= self.total
}
}
/// Sum of counts across all genomes <= `total`.
pub struct MaxTotalCount {
pub total: u32,
}
impl KmerFilter for MaxTotalCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
row.iter().sum::<u32>() <= self.total
}
}
+3
View File
@@ -1,9 +1,12 @@
pub mod filter;
mod distance; mod distance;
mod dump_layer; mod dump_layer;
mod index_layer; mod index_layer;
mod kmer_sort; mod kmer_sort;
mod merge_layer; mod merge_layer;
mod partition; mod partition;
mod rebuild_layer;
pub use filter::KmerFilter;
pub use merge_layer::MergeMode; pub use merge_layer::MergeMode;
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR}; pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
+3 -3
View File
@@ -44,7 +44,7 @@ impl ColBuilder {
// ── SrcLayerData — opened source matrix for pass-2 lookup ───────────────────── // ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
enum SrcLayerData { pub(crate) enum SrcLayerData {
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes. /// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
SetMembership, SetMembership,
Presence(MphfLayer, PersistentBitMatrix), Presence(MphfLayer, PersistentBitMatrix),
@@ -52,7 +52,7 @@ enum SrcLayerData {
} }
impl SrcLayerData { impl SrcLayerData {
fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> { pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
let presence_dir = layer_dir.join("presence"); let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts"); let counts_dir = layer_dir.join("counts");
match mode { match mode {
@@ -83,7 +83,7 @@ impl SrcLayerData {
} }
/// Return one value per source genome for `kmer`. /// Return one value per source genome for `kmer`.
fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> { pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
match self { match self {
SrcLayerData::SetMembership => vec![1u32; n_genomes], SrcLayerData::SetMembership => vec![1u32; n_genomes],
SrcLayerData::Presence(mphf, mat) => { SrcLayerData::Presence(mphf, mat) => {
+205
View File
@@ -0,0 +1,205 @@
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use obicompactvec::{PersistentBitMatrixBuilder,
PersistentBitVecBuilder,
PersistentCompactIntMatrixBuilder,
PersistentCompactIntVecBuilder};
use obidebruinj::GraphDeBruijn;
use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{Layer, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::filter::KmerFilter;
use crate::merge_layer::{MergeMode, SrcLayerData};
use crate::partition::KmerPartition;
const INDEX_SUBDIR: &str = "index";
fn olm_to_sk(e: OLMError) -> SKError {
match e {
OLMError::Io(e) => SKError::Io(e),
other => SKError::InvalidData { context: "rebuild", detail: other.to_string() },
}
}
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
dir.join(format!("col_{col:06}.pbiv"))
}
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
dir.join(format!("col_{col:06}.pciv"))
}
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"))
}
// ── ColBuilder ────────────────────────────────────────────────────────────────
enum ColBuilder {
Bit(PersistentBitVecBuilder),
Int(PersistentCompactIntVecBuilder),
}
impl ColBuilder {
fn set_val(&mut self, slot: usize, value: u32) {
match self {
ColBuilder::Bit(b) => b.set(slot, value > 0),
ColBuilder::Int(b) => b.set(slot, value),
}
}
fn close(self) -> SKResult<()> {
match self {
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
}
}
}
// ── Helpers ───────────────────────────────────────────────────────────────────
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
match PartitionMeta::load(dir) {
Ok(m) => Ok(m),
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
let mut n = 0usize;
while dir.join(format!("layer_{n}")).exists() { n += 1; }
let m = PartitionMeta { n_layers: n };
m.save(dir).map_err(olm_to_sk)?;
Ok(m)
}
Err(e) => Err(olm_to_sk(e)),
}
}
fn passes_all(filters: &[Box<dyn KmerFilter>], row: &[u32], n_genomes: usize) -> bool {
filters.iter().all(|f| f.passes(row, n_genomes))
}
// ── KmerPartition::rebuild_partition ─────────────────────────────────────────
impl KmerPartition {
/// Rebuild partition `i` from `src` into `self` (an empty destination partition).
///
/// Only k-mers whose per-genome row passes all `filters` are written.
/// The output is a single-layer index — regardless of how many layers the
/// source has.
///
/// `n_genomes` is the number of genome columns in the source (and destination).
pub fn rebuild_partition(
&self,
src: &KmerPartition,
i: usize,
filters: &[Box<dyn KmerFilter>],
mode: MergeMode,
n_genomes: usize,
) -> SKResult<()> {
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
if !src_index_dir.exists() {
return Ok(());
}
let src_meta = load_meta(&src_index_dir)?;
if src_meta.n_layers == 0 {
return Ok(());
}
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
let mut g = GraphDeBruijn::new();
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes);
if passes_all(filters, &row, n_genomes) {
g.push(kmer);
}
}
}
if g.len() == 0 {
return Ok(());
}
let n_new = g.len();
g.compute_degrees();
// ── Build MPHF in dst layer_0 ─────────────────────────────────────────
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
let dst_layer_dir = dst_index_dir.join("layer_0");
fs::create_dir_all(&dst_layer_dir)?;
let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?;
for unitig in g.iter_unitig() {
uw.write(&unitig)?;
}
uw.close()?;
drop(g);
Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?;
let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
// ── Prepare matrix builders (one column per genome) ───────────────────
let data_dir = match mode {
MergeMode::Presence => dst_layer_dir.join("presence"),
MergeMode::Count => dst_layer_dir.join("counts"),
};
fs::create_dir_all(&data_dir)?;
let mut builders: Vec<ColBuilder> = match mode {
MergeMode::Presence => {
PersistentBitMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
Ok(ColBuilder::Bit(b))
}).collect::<SKResult<_>>()?
}
MergeMode::Count => {
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
let b = PersistentCompactIntVecBuilder::new(n_new, &col_path_int(&data_dir, g))?;
Ok(ColBuilder::Int(b))
}).collect::<SKResult<_>>()?
}
};
// ── Pass 2: fill builders ─────────────────────────────────────────────
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes);
if !passes_all(filters, &row, n_genomes) { continue; }
if let Some(slot) = dst_mphf.find(kmer) {
for (col, &value) in row.iter().enumerate() {
builders[col].set_val(slot, value);
}
}
}
}
// ── Close builders, write metadata ────────────────────────────────────
for b in builders { b.close()?; }
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
PartitionMeta { n_layers: 1 }.save(&dst_index_dir).map_err(olm_to_sk)?;
Ok(())
}
}