Push ruqusmkoyvwm #16

Merged
coissac merged 10 commits from push-ruqusmkoyvwm into main 2026-06-05 08:41:08 +00:00
6 changed files with 112 additions and 92 deletions
Showing only changes of commit 3e62ffe010 - Show all commits
+1
View File
@@ -1525,6 +1525,7 @@ dependencies = [
"csv", "csv",
"indicatif", "indicatif",
"kodama", "kodama",
"obidebruinj",
"obifastwrite", "obifastwrite",
"obikindex", "obikindex",
"obikpartitionner", "obikpartitionner",
+12 -10
View File
@@ -2,6 +2,7 @@ use std::io::Write;
use crate::error::{OKIError, OKIResult}; use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex; use crate::index::KmerIndex;
use obikpartitionner::KmerFilter;
impl KmerIndex { impl KmerIndex {
/// Write a CSV table of all indexed kmers to `out`. /// Write a CSV table of all indexed kmers to `out`.
@@ -14,8 +15,13 @@ impl KmerIndex {
/// ///
/// The caller must have set the global kmer length (`obikseq::set_k`) before /// The caller must have set the global kmer length (`obikseq::set_k`) before
/// calling this method. /// calling this method.
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> { pub fn dump<W: Write>(
&self,
out: &mut W,
force_presence: bool,
debug: bool,
filters: &[Box<dyn KmerFilter>],
) -> OKIResult<()> {
let genomes = &self.meta.genomes; let genomes = &self.meta.genomes;
let use_counts = self.meta.config.with_counts && !force_presence; let use_counts = self.meta.config.with_counts && !force_presence;
let n_genomes = genomes.len().max(1); let n_genomes = genomes.len().max(1);
@@ -36,25 +42,21 @@ impl KmerIndex {
for i in 0..n { for i in 0..n {
if debug { if debug {
self.partition self.partition
.iter_partition_kmers_located(i, use_counts, n_genomes, &[], |part, layer, kmer, row| { .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
let seq = String::from_utf8(kmer.to_ascii()) let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size)); .unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{part},{layer},{seq}"); let _ = write!(out, "{part},{layer},{seq}");
for &v in row.iter() { for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = write!(out, ",{v}");
}
let _ = writeln!(out); let _ = writeln!(out);
}) })
.map_err(OKIError::Partition)?; .map_err(OKIError::Partition)?;
} else { } else {
self.partition self.partition
.iter_partition_kmers(i, use_counts, n_genomes, &[], |kmer, row| { .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
let seq = String::from_utf8(kmer.to_ascii()) let seq = String::from_utf8(kmer.to_ascii())
.unwrap_or_else(|_| "?".repeat(kmer_size)); .unwrap_or_else(|_| "?".repeat(kmer_size));
let _ = write!(out, "{seq}"); let _ = write!(out, "{seq}");
for &v in row.iter() { for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = write!(out, ",{v}");
}
let _ = writeln!(out); let _ = writeln!(out);
}) })
.map_err(OKIError::Partition)?; .map_err(OKIError::Partition)?;
+1
View File
@@ -12,6 +12,7 @@ obikseq = { path = "../obikseq" }
obiread = { path = "../obiread" } obiread = { path = "../obiread" }
obiskbuilder = { path = "../obiskbuilder" } obiskbuilder = { path = "../obiskbuilder" }
obifastwrite = { path = "../obifastwrite" } obifastwrite = { path = "../obifastwrite" }
obidebruinj = { path = "../obidebruinj" }
obipipeline = { path = "../obipipeline" } obipipeline = { path = "../obipipeline" }
obikrope = { path = "../obikrope" } obikrope = { path = "../obikrope" }
obikpartitionner = { path = "../obikpartitionner" } obikpartitionner = { path = "../obikpartitionner" }
+9 -2
View File
@@ -5,6 +5,8 @@ use clap::Args;
use obikindex::KmerIndex; use obikindex::KmerIndex;
use tracing::info; use tracing::info;
use super::predicate::FilterArgs;
#[derive(Args)] #[derive(Args)]
pub struct DumpArgs { pub struct DumpArgs {
/// Index directory to dump /// Index directory to dump
@@ -17,6 +19,9 @@ pub struct DumpArgs {
/// Prepend partition and layer columns to each row /// Prepend partition and layer columns to each row
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
pub debug: bool, pub debug: bool,
#[command(flatten)]
pub filter: FilterArgs,
} }
pub fn run(args: DumpArgs) { pub fn run(args: DumpArgs) {
@@ -28,13 +33,15 @@ pub fn run(args: DumpArgs) {
info!( info!(
"dumping {} partitions, {} genome(s)", "dumping {} partitions, {} genome(s)",
idx.n_partitions(), idx.n_partitions(),
idx.meta().genomes.len() &idx.meta().genomes.len()
); );
let filters = args.filter.build_filters(&idx.meta().genomes);
let stdout = io::stdout(); let stdout = io::stdout();
let mut out = BufWriter::new(stdout.lock()); let mut out = BufWriter::new(stdout.lock());
idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| { idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| {
eprintln!("dump error: {e}"); eprintln!("dump error: {e}");
std::process::exit(1); std::process::exit(1);
}); });
+84 -1
View File
@@ -1,7 +1,8 @@
use std::collections::HashMap; use std::collections::HashMap;
use clap::Args;
use obikindex::GenomeInfo; use obikindex::GenomeInfo;
use obikpartitionner::GroupQuorumFilter; use obikpartitionner::{GroupQuorumFilter, KmerFilter};
// ── Operator ────────────────────────────────────────────────────────────────── // ── Operator ──────────────────────────────────────────────────────────────────
@@ -141,6 +142,88 @@ fn classify(
/// - `ingroup` predicates only: outgroup indices are empty. /// - `ingroup` predicates only: outgroup indices are empty.
/// - `outgroup` predicates only: ingroup indices are empty. /// - `outgroup` predicates only: ingroup indices are empty.
/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored. /// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored.
/// CLI args for ingroup/outgroup filtering — embeddable in any command via `#[command(flatten)]`.
// NOTE: with `#[derive(Args)]` the `///` line on each field is what clap shows
// as that option's help text, so each one is kept to a single line.
#[derive(Args)]
pub struct FilterArgs {
    /// Ingroup predicate (repeatable; AND). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
    #[arg(long, value_name = "PRED")]
    pub ingroup: Vec<String>,

    /// Outgroup predicate (repeatable; OR). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
    #[arg(long, value_name = "PRED")]
    pub outgroup: Vec<String>,

    /// Minimum number of ingroup genomes containing the k-mer
    #[arg(long)]
    pub min_count: Option<usize>,

    /// Maximum number of ingroup genomes containing the k-mer
    #[arg(long)]
    pub max_count: Option<usize>,

    /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
    #[arg(long)]
    pub min_frac: Option<f64>,

    /// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
    #[arg(long)]
    pub max_frac: Option<f64>,

    /// Minimum number of outgroup genomes containing the k-mer
    #[arg(long)]
    pub min_outgroup_count: Option<usize>,

    /// Maximum number of outgroup genomes containing the k-mer
    #[arg(long)]
    pub max_outgroup_count: Option<usize>,

    /// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
    #[arg(long)]
    pub min_outgroup_frac: Option<f64>,

    /// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
    #[arg(long)]
    pub max_outgroup_frac: Option<f64>,

    /// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
    // Typed default, matching the `default_value_t` style used by `DumpArgs`.
    #[arg(long, default_value_t = 0)]
    pub presence_threshold: u32,
}
impl FilterArgs {
    /// Convert the parsed CLI options into a filter list for `iter_partition_kmers`.
    ///
    /// Every `--ingroup` string is parsed with `MetaPred::parse`, then every
    /// `--outgroup` string. On the first string that fails to parse, the error
    /// is printed to stderr and the process exits with status 1.
    ///
    /// The returned vector contains exactly one boxed filter: the value produced
    /// by `build_group_filter` from `genomes`, the two predicate lists and the
    /// count/fraction thresholds carried by `self`.
    pub fn build_filters(&self, genomes: &[GenomeInfo]) -> Vec<Box<dyn KmerFilter>> {
        // One parser for both predicate lists; `flag` is the option name
        // without its leading dashes and only appears in the error message.
        fn parse_all(flag: &str, raw: &[String]) -> Vec<MetaPred> {
            let mut parsed = Vec::with_capacity(raw.len());
            for text in raw {
                match MetaPred::parse(text) {
                    Ok(pred) => parsed.push(pred),
                    Err(e) => {
                        eprintln!("error in --{flag}: {e}");
                        std::process::exit(1);
                    }
                }
            }
            parsed
        }

        let inside = parse_all("ingroup", &self.ingroup);
        let outside = parse_all("outgroup", &self.outgroup);

        let params = GroupFilterParams {
            threshold: self.presence_threshold,
            min_count: self.min_count,
            max_count: self.max_count,
            min_frac: self.min_frac,
            max_frac: self.max_frac,
            min_outgroup_count: self.min_outgroup_count,
            max_outgroup_count: self.max_outgroup_count,
            min_outgroup_frac: self.min_outgroup_frac,
            max_outgroup_frac: self.max_outgroup_frac,
        };

        let group_filter: Box<dyn KmerFilter> =
            Box::new(build_group_filter(genomes, &inside, &outside, params));
        vec![group_filter]
    }
}
pub struct GroupFilterParams { pub struct GroupFilterParams {
pub threshold: u32, pub threshold: u32,
pub min_count: Option<usize>, pub min_count: Option<usize>,
+5 -79
View File
@@ -6,7 +6,7 @@ use obikpartitionner::filter::{MaxTotalCount, MinTotalCount};
use obisys::Reporter; use obisys::Reporter;
use tracing::info; use tracing::info;
use super::predicate::{GroupFilterParams, MetaPred, build_group_filter}; use super::predicate::FilterArgs;
#[derive(Args)] #[derive(Args)]
pub struct RebuildArgs { pub struct RebuildArgs {
@@ -17,47 +17,8 @@ pub struct RebuildArgs {
#[arg(short, long)] #[arg(short, long)]
pub output: PathBuf, pub output: PathBuf,
/// Ingroup predicate (repeatable; AND between flags). #[command(flatten)]
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all` pub filter: FilterArgs,
#[arg(long, value_name = "PRED")]
pub ingroup: Vec<String>,
/// Outgroup predicate (repeatable; OR between flags).
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
#[arg(long, value_name = "PRED")]
pub outgroup: Vec<String>,
/// Minimum number of ingroup genomes containing the k-mer
#[arg(long)]
pub min_count: Option<usize>,
/// Maximum number of ingroup genomes containing the k-mer
#[arg(long)]
pub max_count: Option<usize>,
/// Minimum fraction of ingroup genomes containing the k-mer [0.01.0]
#[arg(long)]
pub min_frac: Option<f64>,
/// Maximum fraction of ingroup genomes containing the k-mer [0.01.0]
#[arg(long)]
pub max_frac: Option<f64>,
/// Minimum number of outgroup genomes containing the k-mer
#[arg(long)]
pub min_outgroup_count: Option<usize>,
/// Maximum number of outgroup genomes containing the k-mer
#[arg(long)]
pub max_outgroup_count: Option<usize>,
/// Minimum fraction of outgroup genomes containing the k-mer [0.01.0]
#[arg(long)]
pub min_outgroup_frac: Option<f64>,
/// Maximum fraction of outgroup genomes containing the k-mer [0.01.0]
#[arg(long)]
pub max_outgroup_frac: Option<f64>,
/// Minimum total count across all genomes (count index only) /// Minimum total count across all genomes (count index only)
#[arg(long)] #[arg(long)]
@@ -67,10 +28,6 @@ pub struct RebuildArgs {
#[arg(long)] #[arg(long)]
pub max_total_count: Option<u32>, pub max_total_count: Option<u32>,
/// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
#[arg(long, default_value = "0")]
pub presence_threshold: u32,
/// Output as presence/absence instead of counts /// Output as presence/absence instead of counts
#[arg(long)] #[arg(long)]
pub presence: bool, pub presence: bool,
@@ -92,43 +49,12 @@ pub fn run(args: RebuildArgs) {
MergeMode::Count MergeMode::Count
}; };
let ingroup_preds: Vec<MetaPred> = args.ingroup.iter()
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
eprintln!("error in --ingroup: {e}");
std::process::exit(1);
}))
.collect();
let outgroup_preds: Vec<MetaPred> = args.outgroup.iter()
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
eprintln!("error in --outgroup: {e}");
std::process::exit(1);
}))
.collect();
info!( info!(
"rebuild: {} genome(s), mode={:?}, source={}", "rebuild: {} genome(s), mode={:?}, source={}",
src.meta().genomes.len(), mode, args.source.display() &src.meta().genomes.len(), mode, args.source.display()
); );
let mut filters: Vec<Box<dyn obikpartitionner::KmerFilter>> = Vec::new(); let mut filters = args.filter.build_filters(&src.meta().genomes);
filters.push(Box::new(build_group_filter(
&src.meta().genomes,
&ingroup_preds,
&outgroup_preds,
GroupFilterParams {
threshold: args.presence_threshold,
min_count: args.min_count,
max_count: args.max_count,
min_frac: args.min_frac,
max_frac: args.max_frac,
min_outgroup_count: args.min_outgroup_count,
max_outgroup_count: args.max_outgroup_count,
min_outgroup_frac: args.min_outgroup_frac,
max_outgroup_frac: args.max_outgroup_frac,
},
)));
if let Some(v) = args.min_total_count { if let Some(v) = args.min_total_count {
filters.push(Box::new(MinTotalCount { total: v })); filters.push(Box::new(MinTotalCount { total: v }));