feat: add kmer filtering and refactor layer iteration
Introduce a `passes_all` utility to validate kmer rows against multiple filters using short-circuit logic. Integrate a `filters` parameter into the iteration functions to conditionally emit kmers based on filter results. Extract repetitive layer traversal and filtering into an `iter_src_layers` helper, refactoring Pass 1 and Pass 2 to eliminate duplication. Additionally, add a debug conditional to the dump output to include partition and layer metadata alongside kmer sequences.
This commit is contained in:
@@ -7,11 +7,12 @@ use obicompactvec::{PersistentBitMatrixBuilder,
|
||||
PersistentCompactIntMatrixBuilder,
|
||||
PersistentCompactIntVecBuilder};
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
|
||||
use crate::filter::KmerFilter;
|
||||
use crate::filter::{KmerFilter, passes_all};
|
||||
use crate::merge_layer::{MergeMode, SrcLayerData};
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
@@ -75,8 +76,34 @@ fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
|
||||
}
|
||||
}
|
||||
|
||||
fn passes_all(filters: &[Box<dyn KmerFilter>], row: &[u32], n_genomes: usize) -> bool {
|
||||
filters.iter().all(|f| f.passes(row, n_genomes))
|
||||
/// Visit every kmer stored under `src_index_dir` and hand each one that passes
/// `filters` to `cb` as a `(kmer, row)` pair.
|
||||
///
|
||||
/// Uses [`SrcLayerData`] semantics: counts take priority over presence when
|
||||
/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`.
///
/// The number of layers comes from the metadata returned by `load_meta`; the
/// `layer_{l}` subdirectories are visited in ascending order of `l`, and a layer
/// that has no `unitigs.bin` file is skipped without raising an error.
///
/// `cb` receives ownership of the row as a boxed slice and returns nothing, so
/// it cannot abort the traversal or report an error through this function.
///
/// # Errors
///
/// Propagates any error returned by `load_meta`,
/// `UnitigFileReader::open_sequential` or `SrcLayerData::open`.
|
||||
fn iter_src_layers(
|
||||
src_index_dir: &Path,
|
||||
mode: MergeMode,
|
||||
n_genomes: usize,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
||||
) -> SKResult<()> {
|
||||
let src_meta = load_meta(src_index_dir)?;
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
// A layer directory without `unitigs.bin` is skipped rather than reported as an error.
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
|
||||
// Only the kmer is used; the two other tuple fields yielded by the reader are ignored.
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
// `passes_all` is imported from `crate::filter`; presumably it accepts the row only
// when every filter does — confirm against that module.
if passes_all(filters, &row, n_genomes) {
|
||||
cb(kmer, row.into_boxed_slice());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ── KmerPartition::rebuild_partition ─────────────────────────────────────────
|
||||
@@ -110,22 +137,9 @@ impl KmerPartition {
|
||||
|
||||
// ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
|
||||
let mut g = GraphDeBruijn::new();
|
||||
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| {
|
||||
g.push(kmer);
|
||||
})?;
|
||||
|
||||
if g.len() == 0 {
|
||||
return Ok(());
|
||||
@@ -176,24 +190,13 @@ impl KmerPartition {
|
||||
};
|
||||
|
||||
// ── Pass 2: fill builders ─────────────────────────────────────────────
|
||||
for l in 0..src_meta.n_layers {
|
||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||
let unitigs_path = src_layer_dir.join("unitigs.bin");
|
||||
if !unitigs_path.exists() { continue; }
|
||||
|
||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
let row = src_data.lookup(kmer, n_genomes);
|
||||
if !passes_all(filters, &row, n_genomes) { continue; }
|
||||
if let Some(slot) = dst_mphf.find(kmer) {
|
||||
for (col, &value) in row.iter().enumerate() {
|
||||
builders[col].set_val(slot, value);
|
||||
}
|
||||
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
|
||||
if let Some(slot) = dst_mphf.find(kmer) {
|
||||
for (col, &value) in row.iter().enumerate() {
|
||||
builders[col].set_val(slot, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
})?;
|
||||
|
||||
// ── Close builders, write metadata ────────────────────────────────────
|
||||
for b in builders { b.close()?; }
|
||||
|
||||
Reference in New Issue
Block a user