Push mtzqmmrlmzzx #34

Merged
coissac merged 25 commits from push-mtzqmmrlmzzx into main 2026-06-22 08:47:24 +00:00
7 changed files with 462 additions and 65 deletions
Showing only changes of commit 7c1efa9cbb - Show all commits
+67 -1
View File
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
use memmap2::{Mmap, MmapMut}; use memmap2::{Mmap, MmapMut};
use crate::reader::PersistentCompactIntVec; use crate::reader::PersistentCompactIntVec;
use crate::views::{BitSliceView, BitSliceIter}; use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
const MAGIC: [u8; 4] = *b"PBIV"; const MAGIC: [u8; 4] = *b"PBIV";
@@ -241,6 +241,72 @@ impl PersistentBitVecBuilder {
} }
} }
/// OR in bits at slots where `pred(col[slot])` is true.
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
let n = self.n;
let primary = col.primary_bytes();
let words = self.data_words_mut();
let nw = n_words(n);
for wi in 0..nw {
let base = wi * 64;
let limit = (base + 64).min(n);
let mut mask = 0u64;
for bit in 0..(limit - base) {
let b = primary[base + bit];
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
}
words[wi] |= mask;
}
for (slot, val) in col.overflow_entries() {
if pred(val) { words[slot >> 6] |= 1u64 << (slot & 63); }
}
}
/// Clear bits at slots where `pred(col[slot])` is false.
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
let n = self.n;
let primary = col.primary_bytes();
let words = self.data_words_mut();
let nw = n_words(n);
for wi in 0..nw {
let base = wi * 64;
let limit = (base + 64).min(n);
let mut mask = 0u64;
for bit in 0..(limit - base) {
let b = primary[base + bit];
if b < 255 && !pred(b as u32) { mask |= 1u64 << bit; }
}
words[wi] &= !mask;
}
for (slot, val) in col.overflow_entries() {
if !pred(val) { words[slot >> 6] &= !(1u64 << (slot & 63)); }
}
}
/// Toggle bits at slots where `pred(col[slot])` is true.
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
let n = self.n;
let primary = col.primary_bytes();
let words = self.data_words_mut();
let nw = n_words(n);
for wi in 0..nw {
let base = wi * 64;
let limit = (base + 64).min(n);
let mut mask = 0u64;
for bit in 0..(limit - base) {
let b = primary[base + bit];
if b < 255 && pred(b as u32) { mask |= 1u64 << bit; }
}
words[wi] ^= mask;
}
for (slot, val) in col.overflow_entries() {
if pred(val) { words[slot >> 6] ^= 1u64 << (slot & 63); }
}
}
pub fn iter(&self) -> BitSliceIter<'_> { pub fn iter(&self) -> BitSliceIter<'_> {
self.view().iter() self.view().iter()
} }
+70
View File
@@ -70,3 +70,73 @@ pub trait MatrixGroupOps {
b.freeze() b.freeze()
} }
} }
// ── FilterMask — expression tree for column-based slot filters ────────────────
/// A composable filter expression that can be evaluated against a matrix
/// using only column operations (no MPHF lookup per kmer).
///
/// `threshold` semantics follow [`MatrixGroupOps::partial_group_presence_count`]:
/// a slot contributes to the count when its value is **≥ threshold**.
/// To match the row-level filter (`value > t`), callers should pass `t + 1`.
#[derive(Debug, Clone)]
pub enum FilterMask {
/// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≥ `min_count`.
PresenceGeq { indices: Vec<usize>, threshold: u32, min_count: usize },
/// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≤ `max_count`.
PresenceLeq { indices: Vec<usize>, threshold: u32, max_count: usize },
/// Slot passes if sum of values across `indices` columns is ≥ `min_sum`.
SumGeq { indices: Vec<usize>, min_sum: u32 },
/// Slot passes if sum of values across `indices` columns is ≤ `max_sum`.
SumLeq { indices: Vec<usize>, max_sum: u32 },
/// Slot passes if it passes all sub-expressions. Empty `And` is always true.
And(Vec<FilterMask>),
}
/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
/// where bit=1 means the slot passes the filter.
pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
match expr {
FilterMask::PresenceGeq { indices, threshold, min_count } => {
let g = ColGroup::new("", indices.clone());
let counts = mat.partial_group_presence_count(&g, *threshold)?;
let mut b = TempBitVecBuilder::new(n)?;
let mc = *min_count as u32;
b.or_where(counts.view(), |v| v >= mc);
b.freeze()
}
FilterMask::PresenceLeq { indices, threshold, max_count } => {
let g = ColGroup::new("", indices.clone());
let counts = mat.partial_group_presence_count(&g, *threshold)?;
let mut b = TempBitVecBuilder::new(n)?;
let mc = *max_count as u32;
b.or_where(counts.view(), |v| v <= mc);
b.freeze()
}
FilterMask::SumGeq { indices, min_sum } => {
let g = ColGroup::new("", indices.clone());
let sums = mat.partial_group_sum(&g)?;
let mut b = TempBitVecBuilder::new(n)?;
let ms = *min_sum;
b.or_where(sums.view(), |v| v >= ms);
b.freeze()
}
FilterMask::SumLeq { indices, max_sum } => {
let g = ColGroup::new("", indices.clone());
let sums = mat.partial_group_sum(&g)?;
let mut b = TempBitVecBuilder::new(n)?;
let ms = *max_sum;
b.or_where(sums.view(), |v| v <= ms);
b.freeze()
}
FilterMask::And(parts) => {
let mut b = TempBitVecBuilder::new(n)?;
b.not(); // initialise à tout-1 (tout passe)
for part in parts {
let m = eval_filter_mask(part, mat, n)?;
b.and(m.view());
}
b.freeze()
}
}
}
+1 -1
View File
@@ -15,7 +15,7 @@ pub mod traits;
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix}; pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
pub use builder::PersistentCompactIntVecBuilder; pub use builder::PersistentCompactIntVecBuilder;
pub use colgroup::{ColGroup, MatrixGroupOps}; pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
pub use layer_meta::LayerMeta; pub use layer_meta::LayerMeta;
pub use reader::PersistentCompactIntVec; pub use reader::PersistentCompactIntVec;
+25 -12
View File
@@ -73,18 +73,31 @@ impl TempBitVecBuilder {
self.builder.or(other); self.builder.or(other);
} }
/// Set self[slot] where pred(col[slot]) is true. Two-pass: primary then overflow. pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
self.builder.and(other);
}
pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
self.builder.xor(other);
}
pub(crate) fn not(&mut self) {
self.builder.not();
}
pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
self.builder.copy_from(src);
}
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
for slot in 0..col.len() { self.builder.or_where(col, pred);
let b = col.primary_bytes()[slot]; }
if b < 255 && pred(b as u32) {
self.builder.set(slot, true); pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
} self.builder.and_where(col, pred);
} }
for (slot, val) in col.overflow_entries() {
if pred(val) { pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
self.builder.set(slot, true); self.builder.xor_where(col, pred);
}
}
} }
} }
+120
View File
@@ -1,9 +1,24 @@
use obicompactvec::FilterMask;
/// Trait for kmer row filters. /// Trait for kmer row filters.
/// ///
/// `row` contains raw per-genome counts (or 0/1 for presence/absence data). /// `row` contains raw per-genome counts (or 0/1 for presence/absence data).
/// `n_genomes` equals `row.len()`. /// `n_genomes` equals `row.len()`.
pub trait KmerFilter: Send + Sync { pub trait KmerFilter: Send + Sync {
fn passes(&self, row: &[u32], n_genomes: usize) -> bool; fn passes(&self, row: &[u32], n_genomes: usize) -> bool;
/// Express this filter as a [`FilterMask`] column-operation expression.
///
/// Returns `Some(expr)` if the filter can be evaluated solely from matrix
/// column aggregates (no per-kmer row scan needed). Returns `None` if the
/// filter requires row-level inspection.
///
/// `threshold` semantics in the returned mask use `>= threshold`, matching
/// [`obicompactvec::MatrixGroupOps`]. Implementations must add 1 to any
/// row-level threshold that uses strict `>` comparison.
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
None
}
} }
/// True when `row` passes every filter in `filters`. /// True when `row` passes every filter in `filters`.
@@ -29,6 +44,16 @@ impl KmerFilter for MinGenomeFraction {
let p = present_count(row, self.threshold); let p = present_count(row, self.threshold);
p as f64 / n_genomes as f64 >= self.frac p as f64 / n_genomes as f64 >= self.frac
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
let t = self.threshold.checked_add(1)?;
let min_count = (self.frac * n_genomes as f64).ceil() as usize;
Some(FilterMask::PresenceGeq {
indices: (0..n_genomes).collect(),
threshold: t,
min_count,
})
}
} }
/// At most `frac` fraction of genomes contain this kmer (count > `threshold`). /// At most `frac` fraction of genomes contain this kmer (count > `threshold`).
@@ -42,6 +67,16 @@ impl KmerFilter for MaxGenomeFraction {
let p = present_count(row, self.threshold); let p = present_count(row, self.threshold);
p as f64 / n_genomes as f64 <= self.frac p as f64 / n_genomes as f64 <= self.frac
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
let t = self.threshold.checked_add(1)?;
let max_count = (self.frac * n_genomes as f64).floor() as usize;
Some(FilterMask::PresenceLeq {
indices: (0..n_genomes).collect(),
threshold: t,
max_count,
})
}
} }
/// At least `count` genomes contain this kmer (count > `threshold`). /// At least `count` genomes contain this kmer (count > `threshold`).
@@ -54,6 +89,15 @@ impl KmerFilter for MinGenomeCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
present_count(row, self.threshold) >= self.count present_count(row, self.threshold) >= self.count
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
let t = self.threshold.checked_add(1)?;
Some(FilterMask::PresenceGeq {
indices: (0..n_genomes).collect(),
threshold: t,
min_count: self.count,
})
}
} }
/// At most `count` genomes contain this kmer (count > `threshold`). /// At most `count` genomes contain this kmer (count > `threshold`).
@@ -66,6 +110,15 @@ impl KmerFilter for MaxGenomeCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
present_count(row, self.threshold) <= self.count present_count(row, self.threshold) <= self.count
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
let t = self.threshold.checked_add(1)?;
Some(FilterMask::PresenceLeq {
indices: (0..n_genomes).collect(),
threshold: t,
max_count: self.count,
})
}
} }
// ── Total-count filters (count indexes only) ─────────────────────────────────── // ── Total-count filters (count indexes only) ───────────────────────────────────
@@ -79,6 +132,13 @@ impl KmerFilter for MinTotalCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
row.iter().sum::<u32>() >= self.total row.iter().sum::<u32>() >= self.total
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
Some(FilterMask::SumGeq {
indices: (0..n_genomes).collect(),
min_sum: self.total,
})
}
} }
/// Sum of counts across all genomes <= `total`. /// Sum of counts across all genomes <= `total`.
@@ -90,6 +150,13 @@ impl KmerFilter for MaxTotalCount {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
row.iter().sum::<u32>() <= self.total row.iter().sum::<u32>() <= self.total
} }
fn column_mask_expr(&self, n_genomes: usize) -> Option<FilterMask> {
Some(FilterMask::SumLeq {
indices: (0..n_genomes).collect(),
max_sum: self.total,
})
}
} }
// ── Group-based quorum filter ───────────────────────────────────────────────── // ── Group-based quorum filter ─────────────────────────────────────────────────
@@ -113,6 +180,37 @@ pub struct GroupQuorumFilter {
pub max_outgroup_frac: f64, pub max_outgroup_frac: f64,
} }
impl GroupQuorumFilter {
// Build PresenceGeq/PresenceLeq constraints for one group (ingroup or outgroup).
fn group_mask_parts(
indices: &[usize],
threshold: u32,
min_count: usize,
max_count: usize,
min_frac: f64,
max_frac: f64,
parts: &mut Vec<FilterMask>,
) {
let n = indices.len();
let geq = min_count.max((min_frac * n as f64).ceil() as usize);
if geq > 0 {
parts.push(FilterMask::PresenceGeq {
indices: indices.to_vec(),
threshold,
min_count: geq,
});
}
let leq = max_count.min((max_frac * n as f64).floor() as usize);
if leq < n {
parts.push(FilterMask::PresenceLeq {
indices: indices.to_vec(),
threshold,
max_count: leq,
});
}
}
}
impl KmerFilter for GroupQuorumFilter { impl KmerFilter for GroupQuorumFilter {
fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool {
if !self.ingroup_idx.is_empty() { if !self.ingroup_idx.is_empty() {
@@ -139,4 +237,26 @@ impl KmerFilter for GroupQuorumFilter {
} }
true true
} }
fn column_mask_expr(&self, _n_genomes: usize) -> Option<FilterMask> {
let t = self.threshold.checked_add(1)?;
let mut parts: Vec<FilterMask> = Vec::new();
if !self.ingroup_idx.is_empty() {
Self::group_mask_parts(
&self.ingroup_idx, t,
self.min_count, self.max_count,
self.min_frac, self.max_frac,
&mut parts,
);
}
if !self.outgroup_idx.is_empty() {
Self::group_mask_parts(
&self.outgroup_idx, t,
self.min_outgroup_count, self.max_outgroup_count,
self.min_outgroup_frac, self.max_outgroup_frac,
&mut parts,
);
}
Some(FilterMask::And(parts))
}
} }
+36
View File
@@ -10,6 +10,7 @@ use obipipeline::{
}; };
use obicompactvec::{ use obicompactvec::{
MatrixGroupOps,
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
}; };
@@ -78,6 +79,41 @@ impl SrcLayerData {
} }
buf buf
} }
pub(crate) fn n_slots(&self) -> usize {
match self {
SrcLayerData::Presence(_, mat) => mat.n(),
SrcLayerData::Count(_, mat) => mat.n(),
}
}
/// MPHF lookup: returns the slot index for `kmer` (kmer must be in the domain).
#[inline]
pub(crate) fn slot(&self, kmer: CanonicalKmer) -> usize {
match self {
SrcLayerData::Presence(mphf, _) => mphf.index(kmer),
SrcLayerData::Count(mphf, _) => mphf.index(kmer),
}
}
/// Row lookup by slot index, bypassing the MPHF.
#[inline]
pub(crate) fn fill_row_by_slot(&self, slot: usize, n_genomes: usize) -> Vec<u32> {
let mut buf = vec![0u32; n_genomes];
match self {
SrcLayerData::Presence(_, mat) => mat.fill_row(slot, &mut buf),
SrcLayerData::Count(_, mat) => mat.fill_row(slot, &mut buf),
}
buf
}
/// Call `f` with a reference to the underlying matrix as `&dyn MatrixGroupOps`.
pub(crate) fn with_matrix<R>(&self, f: impl FnOnce(&dyn MatrixGroupOps) -> R) -> R {
match self {
SrcLayerData::Presence(_, mat) => f(mat),
SrcLayerData::Count(_, mat) => f(mat),
}
}
} }
// ── helpers ─────────────────────────────────────────────────────────────────── // ── helpers ───────────────────────────────────────────────────────────────────
+143 -51
View File
@@ -1,8 +1,9 @@
use std::path::Path; use std::path::Path;
use obicompactvec::{ use obicompactvec::{
PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder, FilterMask, eval_filter_mask,
PersistentCompactIntVecBuilder, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
}; };
use obidebruinj::GraphDeBruijn; use obidebruinj::GraphDeBruijn;
use obikseq::CanonicalKmer; use obikseq::CanonicalKmer;
@@ -10,18 +11,135 @@ use obilayeredmap::meta::PartitionMeta;
use obilayeredmap::{IndexMode, MphfLayer}; use obilayeredmap::{IndexMode, MphfLayer};
use obiskio::{SKError, SKResult, UnitigFileReader}; use obiskio::{SKError, SKResult, UnitigFileReader};
use crate::common::{ColBuilder, col_path_bit, col_path_int, load_meta, olm_to_sk, write_matrix_meta}; use crate::common::{load_meta, olm_to_sk};
use crate::filter::{KmerFilter, passes_all}; use crate::filter::KmerFilter;
use crate::graph_pipeline::materialize_layer; use crate::graph_pipeline::materialize_layer;
use crate::merge_layer::{MergeMode, SrcLayerData}; use crate::merge_layer::{MergeMode, SrcLayerData};
use crate::partition::KmerPartition; use crate::partition::KmerPartition;
const INDEX_SUBDIR: &str = "index"; const INDEX_SUBDIR: &str = "index";
/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`. // ── Builders — pair matrix builder + column builders for one mode ─────────────
enum Builders {
Presence(PersistentBitMatrixBuilder, Vec<PersistentBitVecBuilder>),
Count(PersistentCompactIntMatrixBuilder, Vec<PersistentCompactIntVecBuilder>),
}
impl Builders {
fn new(mode: MergeMode, n: usize, dir: &Path, n_genomes: usize) -> SKResult<Self> {
match mode {
MergeMode::Presence => {
let mut mat = PersistentBitMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
let mut cols = Vec::with_capacity(n_genomes);
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
Ok(Builders::Presence(mat, cols))
}
MergeMode::Count => {
let mut mat = PersistentCompactIntMatrixBuilder::new(n, dir).map_err(SKError::Io)?;
let mut cols = Vec::with_capacity(n_genomes);
for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); }
Ok(Builders::Count(mat, cols))
}
}
}
fn set_val(&mut self, col: usize, slot: usize, value: u32) {
match self {
Builders::Presence(_, cols) => cols[col].set(slot, value > 0),
Builders::Count(_, cols) => cols[col].set(slot, value),
}
}
fn close(self) -> SKResult<()> {
match self {
Builders::Presence(mat, cols) => {
for b in cols { b.close().map_err(SKError::Io)?; }
mat.close().map_err(SKError::Io)
}
Builders::Count(mat, cols) => {
for b in cols { b.close().map_err(SKError::Io)?; }
mat.close().map_err(SKError::Io)
}
}
}
}
// ── try_compute_combined_mask ─────────────────────────────────────────────────
/// Build a per-slot `TempBitVec` mask from `filters` using column operations
/// on the source matrix — no per-kmer MPHF lookup or row read needed.
/// ///
/// Uses [`SrcLayerData`] semantics: counts take priority over presence when /// Returns `Some(mask)` when every filter in `filters` can express itself as
/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`. /// a [`FilterMask`] expression. Returns `None` when any filter requires
/// row-level inspection (fall back to `passes_all`).
fn try_compute_combined_mask(
filters: &[Box<dyn KmerFilter>],
src_data: &SrcLayerData,
n_genomes: usize,
) -> SKResult<Option<obicompactvec::TempBitVec>> {
if filters.is_empty() {
return Ok(None);
}
let mut exprs: Vec<FilterMask> = Vec::with_capacity(filters.len());
for f in filters {
match f.column_mask_expr(n_genomes) {
Some(expr) => exprs.push(expr),
None => return Ok(None),
}
}
let combined = FilterMask::And(exprs);
let n = src_data.n_slots();
let mask = src_data
.with_matrix(|mat| eval_filter_mask(&combined, mat, n))
.map_err(SKError::Io)?;
Ok(Some(mask))
}
// ── iter_src_kmers_masked (pass 1) ────────────────────────────────────────────
/// Iterate all passing kmers in `src_index_dir`, yielding only the kmer value.
///
/// When all filters can be expressed as column operations, a per-slot mask is
/// computed once per layer and used for O(1) slot-check per kmer instead of a
/// full row read. Falls back to row-level `passes_all` otherwise.
fn iter_src_kmers_masked(
src_index_dir: &Path,
mode: MergeMode,
n_genomes: usize,
filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(CanonicalKmer),
) -> SKResult<()> {
let src_meta = load_meta(src_index_dir, "rebuild")?;
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() { continue; }
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let slot = src_data.slot(kmer);
let passes = match &mask {
Some(m) => m.get(slot),
None => {
let row = src_data.fill_row_by_slot(slot, n_genomes);
filters.iter().all(|f| f.passes(&row, n_genomes))
}
};
if passes { cb(kmer); }
}
}
Ok(())
}
// ── iter_src_layers (pass 2) ──────────────────────────────────────────────────
/// Iterate all passing kmers in `src_index_dir`, yielding `(kmer, row)`.
///
/// When the slot mask is available, skips the row read for filtered-out slots.
fn iter_src_layers( fn iter_src_layers(
src_index_dir: &Path, src_index_dir: &Path,
mode: MergeMode, mode: MergeMode,
@@ -33,17 +151,23 @@ fn iter_src_layers(
for l in 0..src_meta.n_layers { for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let unitigs_path = src_layer_dir.join("unitigs.bin"); let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() { if !unitigs_path.exists() { continue; }
continue;
}
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?; let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?;
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes); let slot = src_data.slot(kmer);
if passes_all(filters, &row, n_genomes) { if let Some(ref m) = mask {
if !m.get(slot) { continue; }
let row = src_data.fill_row_by_slot(slot, n_genomes);
cb(kmer, row.into_boxed_slice()); cb(kmer, row.into_boxed_slice());
} else {
let row = src_data.fill_row_by_slot(slot, n_genomes);
if filters.iter().all(|f| f.passes(&row, n_genomes)) {
cb(kmer, row.into_boxed_slice());
}
} }
} }
} }
@@ -81,7 +205,7 @@ impl KmerPartition {
// ── Pass 1: collect filtered kmers into de Bruijn graph ─────────────── // ── Pass 1: collect filtered kmers into de Bruijn graph ───────────────
let mut g = GraphDeBruijn::new(); let mut g = GraphDeBruijn::new();
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| { iter_src_kmers_masked(&src_index_dir, mode, n_genomes, filters, |kmer| {
g.push(kmer); g.push(kmer);
})?; })?;
@@ -100,54 +224,22 @@ impl KmerPartition {
// ── Prepare matrix builders (one column per genome) ─────────────────── // ── Prepare matrix builders (one column per genome) ───────────────────
let data_dir = match mode { let data_dir = match mode {
MergeMode::Presence => dst_layer_dir.join("presence"), MergeMode::Presence => dst_layer_dir.join("presence"),
MergeMode::Count => dst_layer_dir.join("counts"), MergeMode::Count => dst_layer_dir.join("counts"),
}; };
std::fs::create_dir_all(&data_dir)?; std::fs::create_dir_all(&data_dir)?;
let mut builders = Builders::new(mode, n_new, &data_dir, n_genomes)?;
let mut builders: Vec<ColBuilder> = match mode {
MergeMode::Presence => {
PersistentBitMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?
.close()
.map_err(SKError::Io)?;
(0..n_genomes)
.map(|g| -> SKResult<ColBuilder> {
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
Ok(ColBuilder::Bit(b))
})
.collect::<SKResult<_>>()?
}
MergeMode::Count => {
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?
.close()
.map_err(SKError::Io)?;
(0..n_genomes)
.map(|g| -> SKResult<ColBuilder> {
let b = PersistentCompactIntVecBuilder::new(
n_new,
&col_path_int(&data_dir, g),
)?;
Ok(ColBuilder::Int(b))
})
.collect::<SKResult<_>>()?
}
};
// ── Pass 2: fill builders ───────────────────────────────────────────── // ── Pass 2: fill builders ─────────────────────────────────────────────
iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| { iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| {
if let Some(slot) = dst_mphf.find(kmer) { if let Some(slot) = dst_mphf.find(kmer) {
for (col, &value) in row.iter().enumerate() { for (col, &value) in row.iter().enumerate() {
builders[col].set_val(slot, value); builders.set_val(col, slot, value);
} }
} }
})?; })?;
// ── Close builders, write metadata ──────────────────────────────────── // ── Close builders and write metadata ─────────────────────────────────
for b in builders { builders.close()?;
b.close()?;
}
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
PartitionMeta { PartitionMeta {
n_layers: 1, n_layers: 1,