feat: add selective k-mer filtering to dump and rebuild commands

Add the `obidebruinj` dependency and introduce `FilterArgs` CLI arguments for ingroup/outgroup predicates and count/fraction thresholds. Extend `GroupFilterParams` to support outgroup filtering, and integrate the filter collection into the `KmerIndex::dump` method and the `rebuild` command. This enables selective k-mer filtering during index operations and CSV exports.
This commit is contained in:
Eric Coissac
2026-06-04 20:54:31 +02:00
parent a1499e6153
commit 3e62ffe010
6 changed files with 112 additions and 92 deletions
+12 -10
View File
@@ -2,6 +2,7 @@ use std::io::Write;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use obikpartitionner::KmerFilter;
impl KmerIndex {
/// Write a CSV table of all indexed kmers to `out`.
@@ -14,8 +15,13 @@ impl KmerIndex {
///
/// The caller must have set the global kmer length (`obikseq::set_k`) before
/// calling this method.
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
pub fn dump<W: Write>(
&self,
out: &mut W,
force_presence: bool,
debug: bool,
filters: &[Box<dyn KmerFilter>],
) -> OKIResult<()> {
let genomes = &self.meta.genomes;
let use_counts = self.meta.config.with_counts && !force_presence;
let n_genomes = genomes.len().max(1);
@@ -36,25 +42,21 @@ impl KmerIndex {
for i in 0..n {
if debug {
self.partition
.iter_partition_kmers_located(i, use_counts, n_genomes, &[], |part, layer, kmer, row| {
.iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{part},{layer},{seq}");
for &v in row.iter() {
let _ = write!(out, ",{v}");
}
for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out);
})
.map_err(OKIError::Partition)?;
} else {
self.partition
.iter_partition_kmers(i, use_counts, n_genomes, &[], |kmer, row| {
.iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{seq}");
for &v in row.iter() {
let _ = write!(out, ",{v}");
}
for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out);
})
.map_err(OKIError::Partition)?;