Push ruqusmkoyvwm #16
Generated
+1
@@ -1525,6 +1525,7 @@ dependencies = [
|
||||
"csv",
|
||||
"indicatif",
|
||||
"kodama",
|
||||
"obidebruinj",
|
||||
"obifastwrite",
|
||||
"obikindex",
|
||||
"obikpartitionner",
|
||||
|
||||
+12
-10
@@ -2,6 +2,7 @@ use std::io::Write;
|
||||
|
||||
use crate::error::{OKIError, OKIResult};
|
||||
use crate::index::KmerIndex;
|
||||
use obikpartitionner::KmerFilter;
|
||||
|
||||
impl KmerIndex {
|
||||
/// Write a CSV table of all indexed kmers to `out`.
|
||||
@@ -14,8 +15,13 @@ impl KmerIndex {
|
||||
///
|
||||
/// The caller must have set the global kmer length (`obikseq::set_k`) before
|
||||
/// calling this method.
|
||||
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
|
||||
|
||||
pub fn dump<W: Write>(
|
||||
&self,
|
||||
out: &mut W,
|
||||
force_presence: bool,
|
||||
debug: bool,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
) -> OKIResult<()> {
|
||||
let genomes = &self.meta.genomes;
|
||||
let use_counts = self.meta.config.with_counts && !force_presence;
|
||||
let n_genomes = genomes.len().max(1);
|
||||
@@ -36,25 +42,21 @@ impl KmerIndex {
|
||||
for i in 0..n {
|
||||
if debug {
|
||||
self.partition
|
||||
.iter_partition_kmers_located(i, use_counts, n_genomes, &[], |part, layer, kmer, row| {
|
||||
.iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
|
||||
let seq = String::from_utf8(kmer.to_ascii())
|
||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||
let _ = write!(out, "{part},{layer},{seq}");
|
||||
for &v in row.iter() {
|
||||
let _ = write!(out, ",{v}");
|
||||
}
|
||||
for &v in row.iter() { let _ = write!(out, ",{v}"); }
|
||||
let _ = writeln!(out);
|
||||
})
|
||||
.map_err(OKIError::Partition)?;
|
||||
} else {
|
||||
self.partition
|
||||
.iter_partition_kmers(i, use_counts, n_genomes, &[], |kmer, row| {
|
||||
.iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
|
||||
let seq = String::from_utf8(kmer.to_ascii())
|
||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||
let _ = write!(out, "{seq}");
|
||||
for &v in row.iter() {
|
||||
let _ = write!(out, ",{v}");
|
||||
}
|
||||
for &v in row.iter() { let _ = write!(out, ",{v}"); }
|
||||
let _ = writeln!(out);
|
||||
})
|
||||
.map_err(OKIError::Partition)?;
|
||||
|
||||
@@ -12,6 +12,7 @@ obikseq = { path = "../obikseq" }
|
||||
obiread = { path = "../obiread" }
|
||||
obiskbuilder = { path = "../obiskbuilder" }
|
||||
obifastwrite = { path = "../obifastwrite" }
|
||||
obidebruinj = { path = "../obidebruinj" }
|
||||
obipipeline = { path = "../obipipeline" }
|
||||
obikrope = { path = "../obikrope" }
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
|
||||
@@ -5,6 +5,8 @@ use clap::Args;
|
||||
use obikindex::KmerIndex;
|
||||
use tracing::info;
|
||||
|
||||
use super::predicate::FilterArgs;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct DumpArgs {
|
||||
/// Index directory to dump
|
||||
@@ -17,6 +19,9 @@ pub struct DumpArgs {
|
||||
/// Prepend partition and layer columns to each row
|
||||
#[arg(long, default_value_t = false)]
|
||||
pub debug: bool,
|
||||
|
||||
#[command(flatten)]
|
||||
pub filter: FilterArgs,
|
||||
}
|
||||
|
||||
pub fn run(args: DumpArgs) {
|
||||
@@ -28,13 +33,15 @@ pub fn run(args: DumpArgs) {
|
||||
info!(
|
||||
"dumping {} partitions, {} genome(s)",
|
||||
idx.n_partitions(),
|
||||
idx.meta().genomes.len()
|
||||
&idx.meta().genomes.len()
|
||||
);
|
||||
|
||||
let filters = args.filter.build_filters(&idx.meta().genomes);
|
||||
|
||||
let stdout = io::stdout();
|
||||
let mut out = BufWriter::new(stdout.lock());
|
||||
|
||||
idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| {
|
||||
idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| {
|
||||
eprintln!("dump error: {e}");
|
||||
std::process::exit(1);
|
||||
});
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use clap::Args;
|
||||
use obikindex::GenomeInfo;
|
||||
use obikpartitionner::GroupQuorumFilter;
|
||||
use obikpartitionner::{GroupQuorumFilter, KmerFilter};
|
||||
|
||||
// ── Operator ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -141,6 +142,88 @@ fn classify(
|
||||
/// - `ingroup` predicates only: outgroup indices are empty.
|
||||
/// - `outgroup` predicates only: ingroup indices are empty.
|
||||
/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored.
|
||||
/// CLI args for ingroup/outgroup filtering — embeddable in any command via `#[command(flatten)]`.
///
/// The `ingroup` / `outgroup` predicates are stored as raw strings; every numeric
/// bound is optional, and `presence_threshold` falls back to `0` when not given.
// NOTE(review): the `///` bullet lines directly above this comment (the
// ingroup-only / outgroup-only / both-defined rules) sit right before this struct
// with no item in between, so rustdoc attaches them to `FilterArgs` as well.
// Presumably they belong to a different item — confirm and move them.
|
||||
#[derive(Args)]
|
||||
pub struct FilterArgs {
|
||||
/// Ingroup predicate (repeatable; AND). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
|
||||
#[arg(long, value_name = "PRED")]
|
||||
pub ingroup: Vec<String>,
|
||||
|
||||
/// Outgroup predicate (repeatable; OR). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
|
||||
#[arg(long, value_name = "PRED")]
|
||||
pub outgroup: Vec<String>,
|
||||
|
||||
/// Minimum number of ingroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub min_count: Option<usize>,
|
||||
|
||||
/// Maximum number of ingroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub max_count: Option<usize>,
|
||||
|
||||
/// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||
// NOTE(review): the [0.0–1.0] range on the four `*_frac` options is documentation
// only — nothing in these `#[arg(long)]` attributes rejects out-of-range values.
// Confirm the range is validated where the filter is built.
#[arg(long)]
|
||||
pub min_frac: Option<f64>,
|
||||
|
||||
/// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub max_frac: Option<f64>,
|
||||
|
||||
/// Minimum number of outgroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub min_outgroup_count: Option<usize>,
|
||||
|
||||
/// Maximum number of outgroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub max_outgroup_count: Option<usize>,
|
||||
|
||||
/// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub min_outgroup_frac: Option<f64>,
|
||||
|
||||
/// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub max_outgroup_frac: Option<f64>,
|
||||
|
||||
/// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
|
||||
// NOTE(review): whether the comparison against this threshold is strict or
// inclusive is decided by the filter implementation, which is not visible here.
#[arg(long, default_value = "0")]
|
||||
pub presence_threshold: u32,
|
||||
}
|
||||
|
||||
impl FilterArgs {
|
||||
/// Parse predicates and build a filter list ready to pass to `iter_partition_kmers`.
///
/// `genomes` is forwarded unchanged to `build_group_filter` together with the
/// parsed ingroup/outgroup predicates and the numeric bounds copied from `self`.
///
/// The returned vector always contains exactly one element — the filter produced
/// by `build_group_filter` — including when no filtering option was supplied.
///
/// # Exits
///
/// This does not return an error: if any `--ingroup` or `--outgroup` string fails
/// `MetaPred::parse`, the message is printed to stderr and the process terminates
/// with exit status 1.
|
||||
pub fn build_filters(&self, genomes: &[GenomeInfo]) -> Vec<Box<dyn KmerFilter>> {
|
||||
// Parse every --ingroup predicate; the first invalid one aborts the process.
let ingroup_preds: Vec<MetaPred> = self.ingroup.iter()
|
||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||
eprintln!("error in --ingroup: {e}");
|
||||
std::process::exit(1);
|
||||
}))
|
||||
.collect();
|
||||
// Same handling for --outgroup; only the flag name in the message differs.
let outgroup_preds: Vec<MetaPred> = self.outgroup.iter()
|
||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||
eprintln!("error in --outgroup: {e}");
|
||||
std::process::exit(1);
|
||||
}))
|
||||
.collect();
|
||||
// Wrap the single group filter in a Vec so callers can push further filters onto it.
vec![Box::new(build_group_filter(
|
||||
genomes,
|
||||
&ingroup_preds,
|
||||
&outgroup_preds,
|
||||
// Field-by-field copy of the CLI bounds; note that `presence_threshold`
// is passed under the name `threshold`.
GroupFilterParams {
|
||||
threshold: self.presence_threshold,
|
||||
min_count: self.min_count,
|
||||
max_count: self.max_count,
|
||||
min_frac: self.min_frac,
|
||||
max_frac: self.max_frac,
|
||||
min_outgroup_count: self.min_outgroup_count,
|
||||
max_outgroup_count: self.max_outgroup_count,
|
||||
min_outgroup_frac: self.min_outgroup_frac,
|
||||
max_outgroup_frac: self.max_outgroup_frac,
|
||||
},
|
||||
))]
|
||||
}
|
||||
}
|
||||
|
||||
pub struct GroupFilterParams {
|
||||
pub threshold: u32,
|
||||
pub min_count: Option<usize>,
|
||||
|
||||
@@ -6,7 +6,7 @@ use obikpartitionner::filter::{MaxTotalCount, MinTotalCount};
|
||||
use obisys::Reporter;
|
||||
use tracing::info;
|
||||
|
||||
use super::predicate::{GroupFilterParams, MetaPred, build_group_filter};
|
||||
use super::predicate::FilterArgs;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct RebuildArgs {
|
||||
@@ -17,47 +17,8 @@ pub struct RebuildArgs {
|
||||
#[arg(short, long)]
|
||||
pub output: PathBuf,
|
||||
|
||||
/// Ingroup predicate (repeatable; AND between flags).
|
||||
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
|
||||
#[arg(long, value_name = "PRED")]
|
||||
pub ingroup: Vec<String>,
|
||||
|
||||
/// Outgroup predicate (repeatable; OR between flags).
|
||||
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
|
||||
#[arg(long, value_name = "PRED")]
|
||||
pub outgroup: Vec<String>,
|
||||
|
||||
/// Minimum number of ingroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub min_count: Option<usize>,
|
||||
|
||||
/// Maximum number of ingroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub max_count: Option<usize>,
|
||||
|
||||
/// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub min_frac: Option<f64>,
|
||||
|
||||
/// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub max_frac: Option<f64>,
|
||||
|
||||
/// Minimum number of outgroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub min_outgroup_count: Option<usize>,
|
||||
|
||||
/// Maximum number of outgroup genomes containing the k-mer
|
||||
#[arg(long)]
|
||||
pub max_outgroup_count: Option<usize>,
|
||||
|
||||
/// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub min_outgroup_frac: Option<f64>,
|
||||
|
||||
/// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||
#[arg(long)]
|
||||
pub max_outgroup_frac: Option<f64>,
|
||||
#[command(flatten)]
|
||||
pub filter: FilterArgs,
|
||||
|
||||
/// Minimum total count across all genomes (count index only)
|
||||
#[arg(long)]
|
||||
@@ -67,10 +28,6 @@ pub struct RebuildArgs {
|
||||
#[arg(long)]
|
||||
pub max_total_count: Option<u32>,
|
||||
|
||||
/// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
|
||||
#[arg(long, default_value = "0")]
|
||||
pub presence_threshold: u32,
|
||||
|
||||
/// Output as presence/absence instead of counts
|
||||
#[arg(long)]
|
||||
pub presence: bool,
|
||||
@@ -92,43 +49,12 @@ pub fn run(args: RebuildArgs) {
|
||||
MergeMode::Count
|
||||
};
|
||||
|
||||
let ingroup_preds: Vec<MetaPred> = args.ingroup.iter()
|
||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||
eprintln!("error in --ingroup: {e}");
|
||||
std::process::exit(1);
|
||||
}))
|
||||
.collect();
|
||||
|
||||
let outgroup_preds: Vec<MetaPred> = args.outgroup.iter()
|
||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||
eprintln!("error in --outgroup: {e}");
|
||||
std::process::exit(1);
|
||||
}))
|
||||
.collect();
|
||||
|
||||
info!(
|
||||
"rebuild: {} genome(s), mode={:?}, source={}",
|
||||
src.meta().genomes.len(), mode, args.source.display()
|
||||
&src.meta().genomes.len(), mode, args.source.display()
|
||||
);
|
||||
|
||||
let mut filters: Vec<Box<dyn obikpartitionner::KmerFilter>> = Vec::new();
|
||||
|
||||
filters.push(Box::new(build_group_filter(
|
||||
&src.meta().genomes,
|
||||
&ingroup_preds,
|
||||
&outgroup_preds,
|
||||
GroupFilterParams {
|
||||
threshold: args.presence_threshold,
|
||||
min_count: args.min_count,
|
||||
max_count: args.max_count,
|
||||
min_frac: args.min_frac,
|
||||
max_frac: args.max_frac,
|
||||
min_outgroup_count: args.min_outgroup_count,
|
||||
max_outgroup_count: args.max_outgroup_count,
|
||||
min_outgroup_frac: args.min_outgroup_frac,
|
||||
max_outgroup_frac: args.max_outgroup_frac,
|
||||
},
|
||||
)));
|
||||
let mut filters = args.filter.build_filters(&src.meta().genomes);
|
||||
|
||||
if let Some(v) = args.min_total_count {
|
||||
filters.push(Box::new(MinTotalCount { total: v }));
|
||||
|
||||
Reference in New Issue
Block a user