feat: add k-mer index rebuild and compaction feature
This commit introduces a new `rebuild` CLI subcommand that reconstructs an existing multi-layer k-mer index into a compact, single-layer index. It implements a configurable filtering pipeline supporting min/max genome fraction/count and total count thresholds, parallel partition processing via `rayon`, and CLI progress tracking. The change also restructures module declarations across `obikindex` and `obikpartitionner` to integrate the new rebuild and layer-handling logic.
This commit is contained in:
@@ -0,0 +1,87 @@
|
||||
/// Trait for kmer row filters used in the rebuild command.
|
||||
///
|
||||
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
|
||||
/// `n_genomes` equals `row.len()`.
|
||||
pub trait KmerFilter: Send + Sync {
|
||||
fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
|
||||
}
|
||||
|
||||
// ── Quorum filters ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn present_count(row: &[u32], threshold: u32) -> usize {
|
||||
row.iter().filter(|&&v| v > threshold).count()
|
||||
}
|
||||
|
||||
/// At least `frac` fraction of genomes contain this kmer (count > `threshold`).
|
||||
pub struct MinGenomeFraction {
|
||||
pub frac: f64,
|
||||
pub threshold: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MinGenomeFraction {
|
||||
fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
|
||||
let p = present_count(row, self.threshold);
|
||||
p as f64 / n_genomes as f64 >= self.frac
|
||||
}
|
||||
}
|
||||
|
||||
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
|
||||
pub struct MaxGenomeFraction {
|
||||
pub frac: f64,
|
||||
pub threshold: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MaxGenomeFraction {
|
||||
fn passes(&self, row: &[u32], n_genomes: usize) -> bool {
|
||||
let p = present_count(row, self.threshold);
|
||||
p as f64 / n_genomes as f64 <= self.frac
|
||||
}
|
||||
}
|
||||
|
||||
/// At least `count` genomes contain this kmer (count > `threshold`).
|
||||
pub struct MinGenomeCount {
|
||||
pub count: usize,
|
||||
pub threshold: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MinGenomeCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
present_count(row, self.threshold) >= self.count
|
||||
}
|
||||
}
|
||||
|
||||
/// At most `count` genomes contain this kmer (count > `threshold`).
|
||||
pub struct MaxGenomeCount {
|
||||
pub count: usize,
|
||||
pub threshold: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MaxGenomeCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
present_count(row, self.threshold) <= self.count
|
||||
}
|
||||
}
|
||||
|
||||
// ── Total-count filters (count indexes only) ───────────────────────────────────
|
||||
|
||||
/// Sum of counts across all genomes >= `total`.
|
||||
pub struct MinTotalCount {
|
||||
pub total: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MinTotalCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
row.iter().sum::<u32>() >= self.total
|
||||
}
|
||||
}
|
||||
|
||||
/// Sum of counts across all genomes <= `total`.
|
||||
pub struct MaxTotalCount {
|
||||
pub total: u32,
|
||||
}
|
||||
|
||||
impl KmerFilter for MaxTotalCount {
|
||||
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
|
||||
row.iter().sum::<u32>() <= self.total
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,12 @@
|
||||
pub mod filter;
|
||||
mod distance;
|
||||
mod dump_layer;
|
||||
mod index_layer;
|
||||
mod kmer_sort;
|
||||
mod merge_layer;
|
||||
mod partition;
|
||||
mod rebuild_layer;
|
||||
|
||||
pub use filter::KmerFilter;
|
||||
pub use merge_layer::MergeMode;
|
||||
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
||||
|
||||
@@ -44,7 +44,7 @@ impl ColBuilder {
|
||||
|
||||
// ── SrcLayerData — opened source matrix for pass-2 lookup ─────────────────────
|
||||
|
||||
enum SrcLayerData {
|
||||
pub(crate) enum SrcLayerData {
|
||||
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
||||
SetMembership,
|
||||
Presence(MphfLayer, PersistentBitMatrix),
|
||||
@@ -52,7 +52,7 @@ enum SrcLayerData {
|
||||
}
|
||||
|
||||
impl SrcLayerData {
|
||||
fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
|
||||
pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> {
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
match mode {
|
||||
@@ -83,7 +83,7 @@ impl SrcLayerData {
|
||||
}
|
||||
|
||||
/// Return one value per source genome for `kmer`.
|
||||
fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
||||
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
||||
match self {
|
||||
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
||||
SrcLayerData::Presence(mphf, mat) => {
|
||||
|
||||
@@ -0,0 +1,205 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::{PersistentBitMatrixBuilder,
|
||||
PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrixBuilder,
|
||||
PersistentCompactIntVecBuilder};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::{Layer, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::filter::KmerFilter;
|
||||
use crate::merge_layer::{MergeMode, SrcLayerData};
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
match e {
|
||||
OLMError::Io(e) => SKError::Io(e),
|
||||
other => SKError::InvalidData { context: "rebuild", detail: other.to_string() },
|
||||
}
|
||||
}
|
||||
|
||||
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pbiv"))
|
||||
}
|
||||
|
||||
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
}
|
||||
|
||||
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
|
||||
fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"))
|
||||
}
|
||||
|
||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
||||
|
||||
enum ColBuilder {
|
||||
Bit(PersistentBitVecBuilder),
|
||||
Int(PersistentCompactIntVecBuilder),
|
||||
}
|
||||
|
||||
impl ColBuilder {
|
||||
fn set_val(&mut self, slot: usize, value: u32) {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
||||
ColBuilder::Int(b) => b.set(slot, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn close(self) -> SKResult<()> {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
||||
match PartitionMeta::load(dir) {
|
||||
Ok(m) => Ok(m),
|
||||
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
|
||||
let mut n = 0usize;
|
||||
while dir.join(format!("layer_{n}")).exists() { n += 1; }
|
||||
let m = PartitionMeta { n_layers: n };
|
||||
m.save(dir).map_err(olm_to_sk)?;
|
||||
Ok(m)
|
||||
}
|
||||
Err(e) => Err(olm_to_sk(e)),
|
||||
}
|
||||
}
|
||||
|
||||
fn passes_all(filters: &[Box<dyn KmerFilter>], row: &[u32], n_genomes: usize) -> bool {
|
||||
filters.iter().all(|f| f.passes(row, n_genomes))
|
||||
}
|
||||
|
||||
// ── KmerPartition::rebuild_partition ─────────────────────────────────────────
|
||||
|
||||
impl KmerPartition {
|
||||
/// Rebuild partition `i` from `src` into `self` (an empty destination partition).
|
||||
///
|
||||
/// Only k-mers whose per-genome row passes all `filters` are written.
|
||||
/// The output is a single-layer index — regardless of how many layers the
|
||||
/// source has.
|
||||
///
|
||||
/// `n_genomes` is the number of genome columns in the source (and destination).
|
||||
pub fn rebuild_partition(
|
||||
&self,
|
||||
src: &KmerPartition,
|
||||
i: usize,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mode: MergeMode,
|
||||
n_genomes: usize,
|
||||
) -> SKResult<()> {
|
||||
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
|
||||
if !src_index_dir.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let src_meta = load_meta(&src_index_dir)?;
|
||||
if src_meta.n_layers == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
|
||||
let mut g = GraphDeBruijn::new();
|
||||
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if g.len() == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let n_new = g.len();
|
||||
g.compute_degrees();
|
||||
|
||||
// ── Build MPHF in dst layer_0 ─────────────────────────────────────────
|
||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||
let dst_layer_dir = dst_index_dir.join("layer_0");
|
||||
fs::create_dir_all(&dst_layer_dir)?;
|
||||
|
||||
let mut uw = Layer::<()>::unitig_writer(&dst_layer_dir).map_err(olm_to_sk)?;
|
||||
for unitig in g.iter_unitig() {
|
||||
uw.write(&unitig)?;
|
||||
}
|
||||
uw.close()?;
|
||||
drop(g);
|
||||
|
||||
Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?;
|
||||
let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
|
||||
|
||||
// ── Prepare matrix builders (one column per genome) ───────────────────
|
||||
let data_dir = match mode {
|
||||
MergeMode::Presence => dst_layer_dir.join("presence"),
|
||||
MergeMode::Count => dst_layer_dir.join("counts"),
|
||||
};
|
||||
fs::create_dir_all(&data_dir)?;
|
||||
|
||||
let mut builders: Vec<ColBuilder> = match mode {
|
||||
MergeMode::Presence => {
|
||||
PersistentBitMatrixBuilder::new(n_new, &data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
|
||||
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
|
||||
Ok(ColBuilder::Bit(b))
|
||||
}).collect::<SKResult<_>>()?
|
||||
}
|
||||
MergeMode::Count => {
|
||||
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
|
||||
let b = PersistentCompactIntVecBuilder::new(n_new, &col_path_int(&data_dir, g))?;
|
||||
Ok(ColBuilder::Int(b))
|
||||
}).collect::<SKResult<_>>()?
|
||||
}
|
||||
};
|
||||
|
||||
// ── Pass 2: fill builders ─────────────────────────────────────────────
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
if !passes_all(filters, &row, n_genomes) { continue; }
|
||||
if let Some(slot) = dst_mphf.find(kmer) {
|
||||
for (col, &value) in row.iter().enumerate() {
|
||||
builders[col].set_val(slot, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Close builders, write metadata ────────────────────────────────────
|
||||
for b in builders { b.close()?; }
|
||||
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
|
||||
|
||||
PartitionMeta { n_layers: 1 }.save(&dst_index_dir).map_err(olm_to_sk)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user