feat: add select command for genome column projection and aggregation

Introduces the `select` CLI command to project and aggregate genome-level k-mer data by column. Adds `filter` as an alias for `rebuild`. The implementation uses parallel partition processing, supports metadata-driven grouping with configurable aggregation operators, and performs atomic in-place rewrites or filtered exports. Updates documentation and navigation accordingly.
This commit is contained in:
Eric Coissac
2026-06-09 15:05:08 +02:00
parent b0dab452f6
commit e66adef23d
11 changed files with 958 additions and 1 deletions
+1
View File
@@ -7,6 +7,7 @@ mod index;
mod merge;
mod rebuild;
mod reindex;
mod select;
mod stats;
pub use error::{OKIError, OKIResult};
+166
View File
@@ -0,0 +1,166 @@
use std::fs;
use std::io;
use std::path::Path;
use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
use obisys::{Stage, progress_bar};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::{GenomeInfo, IndexMeta};
use crate::state::{IndexState, SENTINEL_INDEXED};
impl KmerIndex {
/// Create a new index at `output` by projecting/aggregating the genome columns
/// of `src` according to `specs`.
///
/// `output_presence` — if true, output uses bit matrices (0/1), regardless of
/// whether the source stores counts. The caller is responsible for ensuring all
/// specs use logical operators when `output_presence=true` on a count source.
pub fn select<P: AsRef<Path>>(
output: P,
src: &KmerIndex,
specs: &[OutputCol],
threshold: u32,
output_presence: bool,
force: bool,
) -> OKIResult<Self> {
let output = output.as_ref();
if src.state() != IndexState::Indexed {
return Err(OKIError::NotIndexed(src.root_path.clone()));
}
if output.exists() {
if force {
fs::remove_dir_all(output)?;
} else {
return Err(OKIError::Io(io::Error::new(
io::ErrorKind::AlreadyExists,
format!("{}: output directory already exists", output.display()),
)));
}
}
fs::create_dir_all(output)?;
let mut meta = IndexMeta::new(src.meta.config.clone());
meta.config.with_counts = !output_presence;
meta.genomes = specs.iter()
.map(|s| GenomeInfo::new(s.label.clone()))
.collect();
meta.write(output)?;
let n_src_genomes = src.meta.genomes.len();
let n_partitions = src.partition.n_partitions();
fs::create_dir_all(output.join(PARTITIONS_SUBDIR))?;
let dst_partition = KmerPartition::open_with_config(
output,
meta.config.kmer_size,
meta.config.minimizer_size,
meta.config.n_bits,
)?;
info!(
"select: {} partition(s), {} source genome(s) → {} output column(s)",
n_partitions, n_src_genomes, specs.len(),
);
let t = Stage::start("select");
let pb = progress_bar("select", n_partitions as u64, "partitions");
let src_partition = &src.partition;
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let result = dst_partition.select_partition(
src_partition, i, specs,
n_src_genomes, threshold, output_presence,
false,
);
pb.inc(1);
result.err()
})
.collect();
pb.finish_and_clear();
if let Some(e) = errors.into_iter().next() {
return Err(OKIError::Partition(e));
}
let _ = t.stop();
fs::File::create(output.join(SENTINEL_INDEXED))?;
let idx = KmerIndex::open(output)?;
idx.pack_matrices()?;
Ok(idx)
}
/// Rewrite the genome columns of this index in-place according to `specs`.
///
/// The MPHF and unitig files are unchanged; only data matrices are rewritten.
pub fn select_in_place(
&mut self,
specs: &[OutputCol],
threshold: u32,
output_presence: bool,
) -> OKIResult<()> {
if self.state() != IndexState::Indexed {
return Err(OKIError::NotIndexed(self.root_path.clone()));
}
let n_src_genomes = self.meta.genomes.len();
let n_partitions = self.partition.n_partitions();
// Open a second handle to the same path so we can borrow src and dst simultaneously.
let src_partition = KmerPartition::open_with_config(
&self.root_path,
self.meta.config.kmer_size,
self.meta.config.minimizer_size,
self.meta.config.n_bits,
)?;
info!(
"select (in-place): {} partition(s), {} source genome(s) → {} output column(s)",
n_partitions, n_src_genomes, specs.len(),
);
let t = Stage::start("select");
let pb = progress_bar("select", n_partitions as u64, "partitions");
let errors: Vec<obiskio::SKError> = (0..n_partitions)
.into_par_iter()
.filter_map(|i| {
let result = self.partition.select_partition(
&src_partition, i, specs,
n_src_genomes, threshold, output_presence,
true,
);
pb.inc(1);
result.err()
})
.collect();
pb.finish_and_clear();
if let Some(e) = errors.into_iter().next() {
return Err(OKIError::Partition(e));
}
let _ = t.stop();
// Update index.meta with new genome list and with_counts flag.
self.meta.config.with_counts = !output_presence;
self.meta.genomes = specs.iter()
.map(|s| GenomeInfo::new(s.label.clone()))
.collect();
self.meta.write(&self.root_path)?;
self.pack_matrices()?;
Ok(())
}
}