feat: add select command for genome column projection and aggregation
Introduces the `select` CLI command to project and aggregate genome-level k-mer data by column. Adds `filter` as an alias for `rebuild`. The implementation uses parallel partition processing, supports metadata-driven grouping with configurable aggregation operators, and performs atomic in-place rewrites or filtered exports. Updates documentation and navigation accordingly.
This commit is contained in:
@@ -7,7 +7,9 @@ mod merge_layer;
|
||||
mod partition;
|
||||
mod query_layer;
|
||||
mod rebuild_layer;
|
||||
mod select_layer;
|
||||
|
||||
pub use filter::{GroupQuorumFilter, KmerFilter, passes_all};
|
||||
pub use merge_layer::MergeMode;
|
||||
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
||||
pub use select_layer::{AggOp, OutputCol};
|
||||
|
||||
@@ -0,0 +1,287 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use obicompactvec::{
|
||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||
};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use obilayeredmap::OLMError;
|
||||
use obiskio::{SKError, SKResult};
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
/// Name of the sub-directory, inside a partition directory, that holds the
/// layered index (`PartitionMeta` plus the `layer_<l>` directories).
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
// ── AggOp ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Operator used to collapse a group of source genome columns into a single
/// output value for one k-mer slot.
///
/// The first three operators are logical (they yield `0` or `1`); the last
/// three are numeric and work on the raw source values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggOp {
    /// `1` when at least one selected column is strictly above the threshold.
    Any,
    /// `1` when the selection is non-empty and every selected column is
    /// strictly above the threshold.
    All,
    /// `1` when no selected column is strictly above the threshold.
    None,
    /// Saturating sum of the selected columns.
    Sum,
    /// Smallest selected value (`0` for an empty selection).
    Min,
    /// Largest selected value (`0` for an empty selection).
    Max,
}
|
||||
|
||||
impl AggOp {
|
||||
pub fn is_logical(self) -> bool {
|
||||
matches!(self, AggOp::Any | AggOp::All | AggOp::None)
|
||||
}
|
||||
}
|
||||
|
||||
// ── OutputCol ─────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct OutputCol {
|
||||
pub label: String,
|
||||
pub indices: Vec<usize>,
|
||||
pub op: AggOp,
|
||||
}
|
||||
|
||||
// ── Aggregation ───────────────────────────────────────────────────────────────
|
||||
|
||||
#[inline]
|
||||
fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 {
|
||||
match op {
|
||||
AggOp::Any => {
|
||||
if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::All => {
|
||||
if indices.is_empty() { return 0; }
|
||||
if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::None => {
|
||||
if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 }
|
||||
}
|
||||
AggOp::Sum => {
|
||||
indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b))
|
||||
}
|
||||
AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0),
|
||||
AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0),
|
||||
}
|
||||
}
|
||||
|
||||
// ── ColBuilder ────────────────────────────────────────────────────────────────
|
||||
|
||||
enum ColBuilder {
|
||||
Bit(PersistentBitVecBuilder),
|
||||
Int(PersistentCompactIntVecBuilder),
|
||||
}
|
||||
|
||||
impl ColBuilder {
|
||||
fn set_val(&mut self, slot: usize, value: u32) {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.set(slot, value > 0),
|
||||
ColBuilder::Int(b) => b.set(slot, value),
|
||||
}
|
||||
}
|
||||
|
||||
fn close(self) -> SKResult<()> {
|
||||
match self {
|
||||
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
|
||||
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
match e {
|
||||
OLMError::Io(e) => SKError::Io(e),
|
||||
other => SKError::InvalidData { context: "select", detail: other.to_string() },
|
||||
}
|
||||
}
|
||||
|
||||
/// Path of the bit-vector file for column `col` inside `dir`
/// (`col_<6-digit zero-padded index>.pbiv`).
fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pbiv", col);
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
|
||||
|
||||
/// Path of the compact-integer-vector file for column `col` inside `dir`
/// (`col_<6-digit zero-padded index>.pciv`).
fn col_path_int(dir: &Path, col: usize) -> PathBuf {
    let file_name = format!("col_{:06}.pciv", col);
    let mut path = dir.to_path_buf();
    path.push(file_name);
    path
}
|
||||
|
||||
/// Write `meta.json` into `dir`, recording the number of slots (`n`) and the
/// number of columns (`n_cols`) as a one-line JSON object.
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
    let meta_file = dir.join("meta.json");
    let payload = format!("{{\"n\":{},\"n_cols\":{}}}\n", n, n_cols);
    fs::write(meta_file, payload)
}
|
||||
|
||||
/// Copy every regular file found directly in `src_dir` into `dst_dir`,
/// keeping file names. Sub-directories (and their contents) are skipped.
///
/// `dst_dir` must already exist. The first I/O error aborts the copy.
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> {
    fs::read_dir(src_dir)?.try_for_each(|entry| -> io::Result<()> {
        let entry = entry?;
        let source = entry.path();
        if source.is_file() {
            let target = dst_dir.join(entry.file_name());
            fs::copy(&source, target)?;
        }
        Ok(())
    })
}
|
||||
|
||||
// ── fill_builders ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn fill_builders(
|
||||
builders: &mut [ColBuilder],
|
||||
specs: &[OutputCol],
|
||||
n: usize,
|
||||
n_src: usize,
|
||||
src_layer_dir: &Path,
|
||||
src_is_count: bool,
|
||||
threshold: u32,
|
||||
) -> SKResult<()> {
|
||||
let mut src_buf = vec![0u32; n_src];
|
||||
|
||||
if src_is_count {
|
||||
let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?;
|
||||
for slot in 0..n {
|
||||
mat.fill_row(slot, &mut src_buf);
|
||||
for (col, spec) in specs.iter().enumerate() {
|
||||
builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── KmerPartition::select_partition ──────────────────────────────────────────
|
||||
|
||||
impl KmerPartition {
|
||||
/// Rewrite the data matrices of partition `i` in `src` into `self`.
|
||||
///
|
||||
/// `specs` defines the output columns (projection/aggregation).
|
||||
/// `output_presence` — if true, all output builders use bit (0/1) format.
|
||||
/// `in_place` — `self` and `src` share the same root; write to temp dirs then swap.
|
||||
pub fn select_partition(
|
||||
&self,
|
||||
src: &KmerPartition,
|
||||
i: usize,
|
||||
specs: &[OutputCol],
|
||||
n_src_genomes: usize,
|
||||
threshold: u32,
|
||||
output_presence: bool,
|
||||
in_place: bool,
|
||||
) -> SKResult<()> {
|
||||
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
|
||||
if !src_index_dir.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let src_meta = PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?;
|
||||
if src_meta.n_layers == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||
if !in_place {
|
||||
fs::create_dir_all(&dst_index_dir)?;
|
||||
}
|
||||
|
||||
let n_out = specs.len();
|
||||
let data_subdir = if output_presence { "presence" } else { "counts" };
|
||||
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
if !src_layer_dir.exists() { continue; }
|
||||
|
||||
let dst_layer_dir = dst_index_dir.join(format!("layer_{l}"));
|
||||
|
||||
let counts_dir = src_layer_dir.join("counts");
|
||||
let presence_dir = src_layer_dir.join("presence");
|
||||
let src_is_count = counts_dir.exists() && !presence_dir.exists();
|
||||
|
||||
// Determine number of slots from the source matrix.
|
||||
let n = if counts_dir.exists() {
|
||||
PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
||||
} else if presence_dir.exists() {
|
||||
PersistentBitMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n()
|
||||
} else {
|
||||
// Implicit single-genome layer: no data matrix needed in output either.
|
||||
if !in_place {
|
||||
fs::create_dir_all(&dst_layer_dir)?;
|
||||
copy_layer_files(&src_layer_dir, &dst_layer_dir)?;
|
||||
}
|
||||
continue;
|
||||
};
|
||||
|
||||
// Choose the output data directory (temp name for in-place).
|
||||
let (dst_data_dir, final_data_dir) = if in_place {
|
||||
let tmp = dst_layer_dir.join(format!("{data_subdir}_new"));
|
||||
let perm = dst_layer_dir.join(data_subdir);
|
||||
(tmp, perm)
|
||||
} else {
|
||||
let perm = dst_layer_dir.join(data_subdir);
|
||||
(perm.clone(), perm)
|
||||
};
|
||||
|
||||
if !in_place {
|
||||
fs::create_dir_all(&dst_layer_dir)?;
|
||||
copy_layer_files(&src_layer_dir, &dst_layer_dir)?;
|
||||
}
|
||||
fs::create_dir_all(&dst_data_dir)?;
|
||||
|
||||
// Initialise packed-format skeleton.
|
||||
if output_presence {
|
||||
PersistentBitMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
} else {
|
||||
PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir)
|
||||
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
|
||||
}
|
||||
|
||||
// Create column builders.
|
||||
let mut builders: Vec<ColBuilder> = (0..n_out)
|
||||
.map(|col| -> SKResult<ColBuilder> {
|
||||
if output_presence {
|
||||
Ok(ColBuilder::Bit(PersistentBitVecBuilder::new(
|
||||
n, &col_path_bit(&dst_data_dir, col),
|
||||
)?))
|
||||
} else {
|
||||
Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new(
|
||||
n, &col_path_int(&dst_data_dir, col),
|
||||
)?))
|
||||
}
|
||||
})
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
fill_builders(
|
||||
&mut builders, specs, n, n_src_genomes,
|
||||
&src_layer_dir, src_is_count, threshold,
|
||||
)?;
|
||||
|
||||
for b in builders { b.close()?; }
|
||||
write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?;
|
||||
|
||||
// In-place: swap old data dir for new.
|
||||
if in_place {
|
||||
let old_data_dir = if src_is_count {
|
||||
dst_layer_dir.join("counts")
|
||||
} else {
|
||||
dst_layer_dir.join("presence")
|
||||
};
|
||||
if old_data_dir.exists() {
|
||||
fs::remove_dir_all(&old_data_dir)?;
|
||||
}
|
||||
fs::rename(&dst_data_dir, &final_data_dir)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !in_place {
|
||||
PartitionMeta::load(&src_index_dir).map_err(olm_to_sk)?
|
||||
.save(&dst_index_dir).map_err(olm_to_sk)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user