diff --git a/src/Cargo.lock b/src/Cargo.lock index fe3f9a1..e946c77 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1525,6 +1525,7 @@ dependencies = [ "csv", "indicatif", "kodama", + "obidebruinj", "obifastwrite", "obikindex", "obikpartitionner", diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs index cb28652..6e8481a 100644 --- a/src/obikindex/src/dump.rs +++ b/src/obikindex/src/dump.rs @@ -2,6 +2,7 @@ use std::io::Write; use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; +use obikpartitionner::KmerFilter; impl KmerIndex { /// Write a CSV table of all indexed kmers to `out`. @@ -14,8 +15,13 @@ impl KmerIndex { /// /// The caller must have set the global kmer length (`obikseq::set_k`) before /// calling this method. - pub fn dump(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> { - + pub fn dump( + &self, + out: &mut W, + force_presence: bool, + debug: bool, + filters: &[Box], + ) -> OKIResult<()> { let genomes = &self.meta.genomes; let use_counts = self.meta.config.with_counts && !force_presence; let n_genomes = genomes.len().max(1); @@ -36,25 +42,21 @@ impl KmerIndex { for i in 0..n { if debug { self.partition - .iter_partition_kmers_located(i, use_counts, n_genomes, &[], |part, layer, kmer, row| { + .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| { let seq = String::from_utf8(kmer.to_ascii()) .unwrap_or_else(|_| "?".repeat(kmer_size)); let _ = write!(out, "{part},{layer},{seq}"); - for &v in row.iter() { - let _ = write!(out, ",{v}"); - } + for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); }) .map_err(OKIError::Partition)?; } else { self.partition - .iter_partition_kmers(i, use_counts, n_genomes, &[], |kmer, row| { + .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| { let seq = String::from_utf8(kmer.to_ascii()) .unwrap_or_else(|_| "?".repeat(kmer_size)); let _ = write!(out, "{seq}"); - for &v in row.iter() { - let _ = write!(out, ",{v}"); - } + for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); }) .map_err(OKIError::Partition)?; diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index cb7a7a2..2dcfb91 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -12,6 +12,7 @@ obikseq = { path = "../obikseq" } obiread = { path = "../obiread" } obiskbuilder = { path = "../obiskbuilder" } obifastwrite = { path = "../obifastwrite" } +obidebruinj = { path = "../obidebruinj" } obipipeline = { path = "../obipipeline" } obikrope = { path = "../obikrope" } obikpartitionner = { path = "../obikpartitionner" } diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs index c1d56ed..7ec5dc7 100644 --- a/src/obikmer/src/cmd/dump.rs +++ b/src/obikmer/src/cmd/dump.rs @@ -5,6 +5,8 @@ use clap::Args; use obikindex::KmerIndex; use tracing::info; +use super::predicate::FilterArgs; + #[derive(Args)] pub struct DumpArgs { /// Index directory to dump @@ -17,6 +19,9 @@ pub struct DumpArgs { /// Prepend partition and layer columns to each row #[arg(long, default_value_t = false)] pub debug: bool, + + #[command(flatten)] + pub filter: FilterArgs, } pub fn run(args: DumpArgs) { @@ -28,13 +33,15 @@ pub fn run(args: DumpArgs) { info!( "dumping {} partitions, {} genome(s)", idx.n_partitions(), - idx.meta().genomes.len() + &idx.meta().genomes.len() ); + let filters = args.filter.build_filters(&idx.meta().genomes); + let stdout = io::stdout(); let mut out = BufWriter::new(stdout.lock()); - idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| { + idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| { eprintln!("dump error: {e}"); std::process::exit(1); }); diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs index 3faee82..e568b58 100644 --- a/src/obikmer/src/cmd/predicate.rs +++ b/src/obikmer/src/cmd/predicate.rs @@ -1,7 +1,8 @@ use std::collections::HashMap; +use clap::Args; use obikindex::GenomeInfo; -use obikpartitionner::GroupQuorumFilter; +use obikpartitionner::{GroupQuorumFilter, KmerFilter}; // ── Operator ────────────────────────────────────────────────────────────────── @@ -141,6 +142,88 @@ fn classify( /// - `ingroup` predicates only: outgroup indices are empty. /// - `outgroup` predicates only: ingroup indices are empty. /// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored. +/// CLI args for ingroup/outgroup filtering — embeddable in any command via `#[command(flatten)]`. +#[derive(Args)] +pub struct FilterArgs { + /// Ingroup predicate (repeatable; AND). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub ingroup: Vec, + + /// Outgroup predicate (repeatable; OR). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all` + #[arg(long, value_name = "PRED")] + pub outgroup: Vec, + + /// Minimum number of ingroup genomes containing the k-mer + #[arg(long)] + pub min_count: Option, + + /// Maximum number of ingroup genomes containing the k-mer + #[arg(long)] + pub max_count: Option, + + /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_frac: Option, + + /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_frac: Option, + + /// Minimum number of outgroup genomes containing the k-mer + #[arg(long)] + pub min_outgroup_count: Option, + + /// Maximum number of outgroup genomes containing the k-mer + #[arg(long)] + pub max_outgroup_count: Option, + + /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub min_outgroup_frac: Option, + + /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0] + #[arg(long)] + pub max_outgroup_frac: Option, + + /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0) + #[arg(long, default_value = "0")] + pub presence_threshold: u32, +} + +impl FilterArgs { + /// Parse predicates and build a filter list ready to pass to `iter_partition_kmers`. + pub fn build_filters(&self, genomes: &[GenomeInfo]) -> Vec> { + let ingroup_preds: Vec = self.ingroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --ingroup: {e}"); + std::process::exit(1); + })) + .collect(); + let outgroup_preds: Vec = self.outgroup.iter() + .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { + eprintln!("error in --outgroup: {e}"); + std::process::exit(1); + })) + .collect(); + vec![Box::new(build_group_filter( + genomes, + &ingroup_preds, + &outgroup_preds, + GroupFilterParams { + threshold: self.presence_threshold, + min_count: self.min_count, + max_count: self.max_count, + min_frac: self.min_frac, + max_frac: self.max_frac, + min_outgroup_count: self.min_outgroup_count, + max_outgroup_count: self.max_outgroup_count, + min_outgroup_frac: self.min_outgroup_frac, + max_outgroup_frac: self.max_outgroup_frac, + }, + ))] + } +} + pub struct GroupFilterParams { pub threshold: u32, pub min_count: Option, diff --git a/src/obikmer/src/cmd/rebuild.rs b/src/obikmer/src/cmd/rebuild.rs index ca78694..594d5f4 100644 --- a/src/obikmer/src/cmd/rebuild.rs +++ b/src/obikmer/src/cmd/rebuild.rs @@ -6,7 +6,7 @@ use obikpartitionner::filter::{MaxTotalCount, MinTotalCount}; use obisys::Reporter; use tracing::info; -use super::predicate::{GroupFilterParams, MetaPred, build_group_filter}; +use super::predicate::FilterArgs; #[derive(Args)] pub struct RebuildArgs { @@ -17,47 +17,8 @@ pub struct RebuildArgs { #[arg(short, long)] pub output: PathBuf, - /// Ingroup predicate (repeatable; AND between flags). - /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all` - #[arg(long, value_name = "PRED")] - pub ingroup: Vec, - - /// Outgroup predicate (repeatable; OR between flags). - /// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all` - #[arg(long, value_name = "PRED")] - pub outgroup: Vec, - - /// Minimum number of ingroup genomes containing the k-mer - #[arg(long)] - pub min_count: Option, - - /// Maximum number of ingroup genomes containing the k-mer - #[arg(long)] - pub max_count: Option, - - /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub min_frac: Option, - - /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub max_frac: Option, - - /// Minimum number of outgroup genomes containing the k-mer - #[arg(long)] - pub min_outgroup_count: Option, - - /// Maximum number of outgroup genomes containing the k-mer - #[arg(long)] - pub max_outgroup_count: Option, - - /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub min_outgroup_frac: Option, - - /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0] - #[arg(long)] - pub max_outgroup_frac: Option, + #[command(flatten)] + pub filter: FilterArgs, /// Minimum total count across all genomes (count index only) #[arg(long)] @@ -67,10 +28,6 @@ pub struct RebuildArgs { #[arg(long)] pub max_total_count: Option, - /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0) - #[arg(long, default_value = "0")] - pub presence_threshold: u32, - /// Output as presence/absence instead of counts #[arg(long)] pub presence: bool, @@ -92,43 +49,12 @@ pub fn run(args: RebuildArgs) { MergeMode::Count }; - let ingroup_preds: Vec = args.ingroup.iter() - .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { - eprintln!("error in --ingroup: {e}"); - std::process::exit(1); - })) - .collect(); - - let outgroup_preds: Vec = args.outgroup.iter() - .map(|s| MetaPred::parse(s).unwrap_or_else(|e| { - eprintln!("error in --outgroup: {e}"); - std::process::exit(1); - })) - .collect(); - info!( "rebuild: {} genome(s), mode={:?}, source={}", - src.meta().genomes.len(), mode, args.source.display() + &src.meta().genomes.len(), mode, args.source.display() ); - let mut filters: Vec> = Vec::new(); - - filters.push(Box::new(build_group_filter( - &src.meta().genomes, - &ingroup_preds, - &outgroup_preds, - GroupFilterParams { - threshold: args.presence_threshold, - min_count: args.min_count, - max_count: args.max_count, - min_frac: args.min_frac, - max_frac: args.max_frac, - min_outgroup_count: args.min_outgroup_count, - max_outgroup_count: args.max_outgroup_count, - min_outgroup_frac: args.min_outgroup_frac, - max_outgroup_frac: args.max_outgroup_frac, - }, - ))); + let mut filters = args.filter.build_filters(&src.meta().genomes); if let Some(v) = args.min_total_count { filters.push(Box::new(MinTotalCount { total: v }));