feat: add select command for genome column projection and aggregation
Introduces the `select` CLI command to project and aggregate genome-level k-mer data by column. Adds `filter` as an alias for `rebuild`. The implementation uses parallel partition processing, supports metadata-driven grouping with configurable aggregation operators, and performs atomic in-place rewrites or filtered exports. Updates documentation and navigation accordingly.
This commit is contained in:
@@ -7,6 +7,7 @@ mod index;
|
||||
mod merge;
|
||||
mod rebuild;
|
||||
mod reindex;
|
||||
mod select;
|
||||
mod stats;
|
||||
|
||||
pub use error::{OKIError, OKIResult};
|
||||
|
||||
@@ -0,0 +1,166 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
|
||||
use obisys::{Stage, progress_bar};
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use crate::meta::{GenomeInfo, IndexMeta};
|
||||
use crate::state::{IndexState, SENTINEL_INDEXED};
|
||||
|
||||
impl KmerIndex {
|
||||
/// Create a new index at `output` by projecting/aggregating the genome columns
|
||||
/// of `src` according to `specs`.
|
||||
///
|
||||
/// `output_presence` — if true, output uses bit matrices (0/1), regardless of
|
||||
/// whether the source stores counts. The caller is responsible for ensuring all
|
||||
/// specs use logical operators when `output_presence=true` on a count source.
|
||||
pub fn select<P: AsRef<Path>>(
|
||||
output: P,
|
||||
src: &KmerIndex,
|
||||
specs: &[OutputCol],
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
force: bool,
|
||||
) -> OKIResult<Self> {
|
||||
let output = output.as_ref();
|
||||
|
||||
if src.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(src.root_path.clone()));
|
||||
}
|
||||
|
||||
if output.exists() {
|
||||
if force {
|
||||
fs::remove_dir_all(output)?;
|
||||
} else {
|
||||
return Err(OKIError::Io(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("{}: output directory already exists", output.display()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
fs::create_dir_all(output)?;
|
||||
let mut meta = IndexMeta::new(src.meta.config.clone());
|
||||
meta.config.with_counts = !output_presence;
|
||||
meta.genomes = specs.iter()
|
||||
.map(|s| GenomeInfo::new(s.label.clone()))
|
||||
.collect();
|
||||
meta.write(output)?;
|
||||
|
||||
let n_src_genomes = src.meta.genomes.len();
|
||||
let n_partitions = src.partition.n_partitions();
|
||||
|
||||
fs::create_dir_all(output.join(PARTITIONS_SUBDIR))?;
|
||||
let dst_partition = KmerPartition::open_with_config(
|
||||
output,
|
||||
meta.config.kmer_size,
|
||||
meta.config.minimizer_size,
|
||||
meta.config.n_bits,
|
||||
)?;
|
||||
|
||||
info!(
|
||||
"select: {} partition(s), {} source genome(s) → {} output column(s)",
|
||||
n_partitions, n_src_genomes, specs.len(),
|
||||
);
|
||||
|
||||
let t = Stage::start("select");
|
||||
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
||||
let src_partition = &src.partition;
|
||||
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let result = dst_partition.select_partition(
|
||||
src_partition, i, specs,
|
||||
n_src_genomes, threshold, output_presence,
|
||||
false,
|
||||
);
|
||||
pb.inc(1);
|
||||
result.err()
|
||||
})
|
||||
.collect();
|
||||
|
||||
pb.finish_and_clear();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() {
|
||||
return Err(OKIError::Partition(e));
|
||||
}
|
||||
|
||||
let _ = t.stop();
|
||||
|
||||
fs::File::create(output.join(SENTINEL_INDEXED))?;
|
||||
|
||||
let idx = KmerIndex::open(output)?;
|
||||
idx.pack_matrices()?;
|
||||
Ok(idx)
|
||||
}
|
||||
|
||||
/// Rewrite the genome columns of this index in-place according to `specs`.
|
||||
///
|
||||
/// The MPHF and unitig files are unchanged; only data matrices are rewritten.
|
||||
pub fn select_in_place(
|
||||
&mut self,
|
||||
specs: &[OutputCol],
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
) -> OKIResult<()> {
|
||||
if self.state() != IndexState::Indexed {
|
||||
return Err(OKIError::NotIndexed(self.root_path.clone()));
|
||||
}
|
||||
|
||||
let n_src_genomes = self.meta.genomes.len();
|
||||
let n_partitions = self.partition.n_partitions();
|
||||
|
||||
// Open a second handle to the same path so we can borrow src and dst simultaneously.
|
||||
let src_partition = KmerPartition::open_with_config(
|
||||
&self.root_path,
|
||||
self.meta.config.kmer_size,
|
||||
self.meta.config.minimizer_size,
|
||||
self.meta.config.n_bits,
|
||||
)?;
|
||||
|
||||
info!(
|
||||
"select (in-place): {} partition(s), {} source genome(s) → {} output column(s)",
|
||||
n_partitions, n_src_genomes, specs.len(),
|
||||
);
|
||||
|
||||
let t = Stage::start("select");
|
||||
let pb = progress_bar("select", n_partitions as u64, "partitions");
|
||||
|
||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||
.into_par_iter()
|
||||
.filter_map(|i| {
|
||||
let result = self.partition.select_partition(
|
||||
&src_partition, i, specs,
|
||||
n_src_genomes, threshold, output_presence,
|
||||
true,
|
||||
);
|
||||
pb.inc(1);
|
||||
result.err()
|
||||
})
|
||||
.collect();
|
||||
|
||||
pb.finish_and_clear();
|
||||
|
||||
if let Some(e) = errors.into_iter().next() {
|
||||
return Err(OKIError::Partition(e));
|
||||
}
|
||||
|
||||
let _ = t.stop();
|
||||
|
||||
// Update index.meta with new genome list and with_counts flag.
|
||||
self.meta.config.with_counts = !output_presence;
|
||||
self.meta.genomes = specs.iter()
|
||||
.map(|s| GenomeInfo::new(s.label.clone()))
|
||||
.collect();
|
||||
self.meta.write(&self.root_path)?;
|
||||
|
||||
self.pack_matrices()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user