feat: add select command for genome column projection and aggregation

Introduces the `select` CLI command to project and aggregate genome-level k-mer data by column. Adds `filter` as an alias for `rebuild`. The implementation uses parallel partition processing, supports metadata-driven grouping with configurable aggregation operators, and performs atomic in-place rewrites or filtered exports. Updates documentation and navigation accordingly.
This commit is contained in:
Eric Coissac
2026-06-09 15:05:08 +02:00
parent b0dab452f6
commit e66adef23d
11 changed files with 958 additions and 1 deletions
+2
View File
@@ -7,7 +7,9 @@ mod merge_layer;
mod partition;
mod query_layer;
mod rebuild_layer;
mod select_layer;
pub use filter::{GroupQuorumFilter, KmerFilter, passes_all};
pub use merge_layer::MergeMode;
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
pub use select_layer::{AggOp, OutputCol};
+287
View File
@@ -0,0 +1,287 @@
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use obicompactvec::{
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
};
use obilayeredmap::meta::PartitionMeta;
use obilayeredmap::OLMError;
use obiskio::{SKError, SKResult};
use crate::partition::KmerPartition;
/// Name of the subdirectory, joined onto a partition directory, that holds the
/// partition metadata and the `layer_<l>` directories read and written below.
const INDEX_SUBDIR: &str = "index";
// ── AggOp ─────────────────────────────────────────────────────────────────────
/// Operator used to collapse a group of source columns into one output value.
///
/// `Any`, `All` and `None` are logical operators producing 0 or 1; `Sum`, `Min`
/// and `Max` operate on the raw source values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggOp {
    /// 1 when at least one selected value is strictly above the threshold.
    Any,
    /// 1 when the group is non-empty and every selected value is above the threshold.
    All,
    /// 1 when no selected value is above the threshold.
    None,
    /// Saturating sum of the selected values.
    Sum,
    /// Smallest selected value (0 for an empty group).
    Min,
    /// Largest selected value (0 for an empty group).
    Max,
}

impl AggOp {
    /// Returns `true` for the logical operators (`Any`, `All`, `None`) and
    /// `false` for the arithmetic ones (`Sum`, `Min`, `Max`).
    pub fn is_logical(self) -> bool {
        match self {
            Self::Any | Self::All | Self::None => true,
            Self::Sum | Self::Min | Self::Max => false,
        }
    }
}
// ── OutputCol ─────────────────────────────────────────────────────────────────
pub struct OutputCol {
pub label: String,
pub indices: Vec<usize>,
pub op: AggOp,
}
// ── Aggregation ───────────────────────────────────────────────────────────────
/// Reduce the values of `src_row` found at `indices` to a single number using `op`.
///
/// For the logical operators a value counts as "present" when it is strictly
/// greater than `threshold`, and the result is 0 or 1:
/// * `Any`  — 1 if some selected value is present (0 for an empty group);
/// * `All`  — 1 if the group is non-empty and every selected value is present;
/// * `None` — 1 if no selected value is present (1 for an empty group).
///
/// `Sum` adds the selected values with saturation; `Min` / `Max` return the
/// extreme value, or 0 when `indices` is empty. `threshold` is ignored by the
/// arithmetic operators.
///
/// Panics if an index that gets visited is out of bounds for `src_row`.
#[inline]
fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
    // Lazy view over the selected values; each arm consumes it at most once.
    let mut values = indices.iter().map(|&idx| src_row[idx]);
    match op {
        AggOp::Any => u32::from(values.any(|v| v > threshold)),
        AggOp::All => u32::from(!indices.is_empty() && values.all(|v| v > threshold)),
        AggOp::None => u32::from(!values.any(|v| v > threshold)),
        AggOp::Sum => values.fold(0u32, u32::saturating_add),
        AggOp::Min => values.min().unwrap_or(0),
        AggOp::Max => values.max().unwrap_or(0),
    }
}
// ── ColBuilder ────────────────────────────────────────────────────────────────
/// Writer for a single output column, in either bit or integer representation.
enum ColBuilder {
    /// Stores one boolean per slot.
    Bit(PersistentBitVecBuilder),
    /// Stores one `u32` value per slot.
    Int(PersistentCompactIntVecBuilder),
}

impl ColBuilder {
    /// Store `value` at `slot`; the bit variant records whether `value` is non-zero.
    fn set_val(&mut self, slot: usize, value: u32) {
        match self {
            Self::Bit(bits) => bits.set(slot, value != 0),
            Self::Int(ints) => ints.set(slot, value),
        }
    }

    /// Close the underlying builder, mapping its I/O error into `SKError::Io`.
    fn close(self) -> SKResult<()> {
        let closed = match self {
            Self::Bit(bits) => bits.close(),
            Self::Int(ints) => ints.close(),
        };
        closed.map_err(SKError::Io)
    }
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/// Convert an `OLMError` into an `SKError`.
///
/// I/O errors keep their payload; every other variant becomes
/// `SKError::InvalidData` with the `"select"` context and the error's text.
fn olm_to_sk(e: OLMError) -> SKError {
    match e {
        OLMError::Io(io_err) => SKError::Io(io_err),
        non_io => {
            let detail = non_io.to_string();
            SKError::InvalidData { context: "select", detail }
        }
    }
}
/// Path of the bit-column file for column `col` inside `dir`
/// (`col_<6-digit zero-padded index>.pbiv`).
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pbiv", col);
    dir.join(file_name)
}
/// Path of the integer-column file for column `col` inside `dir`
/// (`col_<6-digit zero-padded index>.pciv`).
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pciv", col);
    dir.join(file_name)
}
/// Write `meta.json` in `dir`, recording the slot count `n` and the column
/// count `n_cols` as a one-line JSON object followed by a newline.
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
    let meta_path = dir.join("meta.json");
    let payload = format!("{{\"n\":{},\"n_cols\":{}}}\n", n, n_cols);
    fs::write(meta_path, payload)
}
/// Copy every regular file located directly in `src_dir` into `dst_dir`,
/// keeping file names; subdirectories are skipped, not recursed into.
///
/// Stops at, and returns, the first I/O error encountered.
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
    fs::read_dir(src_dir)?.try_for_each(|item| -> io::Result<()> {
        let item = item?;
        let source = item.path();
        if !source.is_file() {
            return Ok(());
        }
        fs::copy(&source, dst_dir.join(item.file_name())).map(|_| ())
    })
}
// ── fill_builders ─────────────────────────────────────────────────────────────
/// Fill every output column builder from the source matrix of one layer.
///
/// For each slot in `0..n`, one source row is read into a reusable buffer of
/// `n_src` values, then each spec is aggregated and stored in the builder at
/// the same position as the spec.
///
/// * `builders` — one builder per entry of `specs`, in the same order.
/// * `src_layer_dir` — layer directory handed to the matrix `open` call.
/// * `src_is_count` — read a `PersistentCompactIntMatrix` when true, a
///   `PersistentBitMatrix` otherwise.
/// * `threshold` — forwarded to `aggregate`.
///
/// Returns an error when the source matrix cannot be opened.
fn fill_builders(
    builders: &mut [ColBuilder],
    specs: &[OutputCol],
    n: usize,
    n_src: usize,
    src_layer_dir: &Path,
    src_is_count: bool,
    threshold: u32,
) -> SKResult<()> {
    let mut row_buf = vec![0u32; n_src];

    // Shared per-row work: aggregate each spec and store it in its builder.
    let mut store_row = |slot: usize, row: &[u32]| {
        for (col, spec) in specs.iter().enumerate() {
            builders[col].set_val(slot, aggregate(spec.op, &spec.indices, row, threshold));
        }
    };

    if src_is_count {
        let counts = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
        for slot in 0..n {
            counts.fill_row(slot, &mut row_buf);
            store_row(slot, &row_buf);
        }
    } else {
        let presence = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
        for slot in 0..n {
            presence.fill_row(slot, &mut row_buf);
            store_row(slot, &row_buf);
        }
    }
    Ok(())
}
// ── KmerPartition::select_partition ──────────────────────────────────────────
impl KmerPartition {
/// Rewrite the data matrices of partition `i` in `src` into `self`.
///
/// `specs` defines the output columns (projection/aggregation).
/// `output_presence` — if true, all output builders use bit (0/1) format.
/// `in_place` — `self` and `src` share the same root; write to temp dirs then swap.
pub fn select_partition(
&self,
src: &KmerPartition,
i: usize,
specs: &[OutputCol],
n_src_genomes: usize,
threshold: u32,
output_presence: bool,
in_place: bool,
) -> SKResult<()> {
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
if !src_index_dir.exists() {
return Ok(());
}
let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
if src_meta.n_layers == 0 {
return Ok(());
}
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
if !in_place {
fs::create_dir_all(&dst_index_dir)?;
}
let n_out = specs.len();
let data_subdir = if output_presence { "presence" } else { "counts" };
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
if !src_layer_dir.exists() { continue; }
let dst_layer_dir = dst_index_dir.join(format!("layer_{l}"));
let counts_dir = src_layer_dir.join("counts");
let presence_dir = src_layer_dir.join("presence");
let src_is_count = counts_dir.exists() && !presence_dir.exists();
// Determine number of slots from the source matrix.
let n = if counts_dir.exists() {
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
} else if presence_dir.exists() {
PersistentBitMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
} else {
// Implicit single-genome layer: no data matrix needed in output either.
if !in_place {
fs::create_dir_all(&dst_layer_dir)?;
copy_layer_files(&src_layer_dir, &dst_layer_dir)?;
}
continue;
};
// Choose the output data directory (temp name for in-place).
let (dst_data_dir, final_data_dir) = if in_place {
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
let perm = dst_layer_dir.join(data_subdir);
(tmp, perm)
} else {
let perm = dst_layer_dir.join(data_subdir);
(perm.clone(), perm)
};
if !in_place {
fs::create_dir_all(&dst_layer_dir)?;
copy_layer_files(&src_layer_dir, &dst_layer_dir)?;
}
fs::create_dir_all(&dst_data_dir)?;
// Initialise packed-format skeleton.
if output_presence {
PersistentBitMatrixBuilder::new(n, &dst_data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
} else {
PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
}
// Create column builders.
let mut builders: Vec<ColBuilder> = (0..n_out)
.map(|col| -> SKResult<ColBuilder> {
if output_presence {
Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
n, &col_path_bit(&dst_data_dir, col),
)?))
} else {
Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
n, &col_path_int(&dst_data_dir, col),
)?))
}
})
.collect::<SKResult<_>>()?;
fill_builders(
&mut builders, specs, n, n_src_genomes,
&src_layer_dir, src_is_count, threshold,
)?;
for b in builders { b.close()?; }
write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
// In-place: swap old data dir for new.
if in_place {
let old_data_dir = if src_is_count {
dst_layer_dir.join("counts")
} else {
dst_layer_dir.join("presence")
};
if old_data_dir.exists() {
fs::remove_dir_all(&old_data_dir)?;
}
fs::rename(&dst_data_dir, &final_data_dir)?;
}
}
if !in_place {
PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?
.save(&dst_index_dir).map_err(olm_to_sk)?;
}
Ok(())
}
}