Push ruqusmkoyvwm #16
Generated
+1
@@ -1525,6 +1525,7 @@ dependencies = [
|
|||||||
"csv",
|
"csv",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"kodama",
|
"kodama",
|
||||||
|
"obidebruinj",
|
||||||
"obifastwrite",
|
"obifastwrite",
|
||||||
"obikindex",
|
"obikindex",
|
||||||
"obikpartitionner",
|
"obikpartitionner",
|
||||||
|
|||||||
+12
-10
@@ -2,6 +2,7 @@ use std::io::Write;
|
|||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::index::KmerIndex;
|
use crate::index::KmerIndex;
|
||||||
|
use obikpartitionner::KmerFilter;
|
||||||
|
|
||||||
impl KmerIndex {
|
impl KmerIndex {
|
||||||
/// Write a CSV table of all indexed kmers to `out`.
|
/// Write a CSV table of all indexed kmers to `out`.
|
||||||
@@ -14,8 +15,13 @@ impl KmerIndex {
|
|||||||
///
|
///
|
||||||
/// The caller must have set the global kmer length (`obikseq::set_k`) before
|
/// The caller must have set the global kmer length (`obikseq::set_k`) before
|
||||||
/// calling this method.
|
/// calling this method.
|
||||||
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool, debug: bool) -> OKIResult<()> {
|
pub fn dump<W: Write>(
|
||||||
|
&self,
|
||||||
|
out: &mut W,
|
||||||
|
force_presence: bool,
|
||||||
|
debug: bool,
|
||||||
|
filters: &[Box<dyn KmerFilter>],
|
||||||
|
) -> OKIResult<()> {
|
||||||
let genomes = &self.meta.genomes;
|
let genomes = &self.meta.genomes;
|
||||||
let use_counts = self.meta.config.with_counts && !force_presence;
|
let use_counts = self.meta.config.with_counts && !force_presence;
|
||||||
let n_genomes = genomes.len().max(1);
|
let n_genomes = genomes.len().max(1);
|
||||||
@@ -36,25 +42,21 @@ impl KmerIndex {
|
|||||||
for i in 0..n {
|
for i in 0..n {
|
||||||
if debug {
|
if debug {
|
||||||
self.partition
|
self.partition
|
||||||
.iter_partition_kmers_located(i, use_counts, n_genomes, &[], |part, layer, kmer, row| {
|
.iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
|
||||||
let seq = String::from_utf8(kmer.to_ascii())
|
let seq = String::from_utf8(kmer.to_ascii())
|
||||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||||
let _ = write!(out, "{part},{layer},{seq}");
|
let _ = write!(out, "{part},{layer},{seq}");
|
||||||
for &v in row.iter() {
|
for &v in row.iter() { let _ = write!(out, ",{v}"); }
|
||||||
let _ = write!(out, ",{v}");
|
|
||||||
}
|
|
||||||
let _ = writeln!(out);
|
let _ = writeln!(out);
|
||||||
})
|
})
|
||||||
.map_err(OKIError::Partition)?;
|
.map_err(OKIError::Partition)?;
|
||||||
} else {
|
} else {
|
||||||
self.partition
|
self.partition
|
||||||
.iter_partition_kmers(i, use_counts, n_genomes, &[], |kmer, row| {
|
.iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
|
||||||
let seq = String::from_utf8(kmer.to_ascii())
|
let seq = String::from_utf8(kmer.to_ascii())
|
||||||
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
.unwrap_or_else(|_| "?".repeat(kmer_size));
|
||||||
let _ = write!(out, "{seq}");
|
let _ = write!(out, "{seq}");
|
||||||
for &v in row.iter() {
|
for &v in row.iter() { let _ = write!(out, ",{v}"); }
|
||||||
let _ = write!(out, ",{v}");
|
|
||||||
}
|
|
||||||
let _ = writeln!(out);
|
let _ = writeln!(out);
|
||||||
})
|
})
|
||||||
.map_err(OKIError::Partition)?;
|
.map_err(OKIError::Partition)?;
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ obikseq = { path = "../obikseq" }
|
|||||||
obiread = { path = "../obiread" }
|
obiread = { path = "../obiread" }
|
||||||
obiskbuilder = { path = "../obiskbuilder" }
|
obiskbuilder = { path = "../obiskbuilder" }
|
||||||
obifastwrite = { path = "../obifastwrite" }
|
obifastwrite = { path = "../obifastwrite" }
|
||||||
|
obidebruinj = { path = "../obidebruinj" }
|
||||||
obipipeline = { path = "../obipipeline" }
|
obipipeline = { path = "../obipipeline" }
|
||||||
obikrope = { path = "../obikrope" }
|
obikrope = { path = "../obikrope" }
|
||||||
obikpartitionner = { path = "../obikpartitionner" }
|
obikpartitionner = { path = "../obikpartitionner" }
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ use clap::Args;
|
|||||||
use obikindex::KmerIndex;
|
use obikindex::KmerIndex;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
|
use super::predicate::FilterArgs;
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
pub struct DumpArgs {
|
pub struct DumpArgs {
|
||||||
/// Index directory to dump
|
/// Index directory to dump
|
||||||
@@ -17,6 +19,9 @@ pub struct DumpArgs {
|
|||||||
/// Prepend partition and layer columns to each row
|
/// Prepend partition and layer columns to each row
|
||||||
#[arg(long, default_value_t = false)]
|
#[arg(long, default_value_t = false)]
|
||||||
pub debug: bool,
|
pub debug: bool,
|
||||||
|
|
||||||
|
#[command(flatten)]
|
||||||
|
pub filter: FilterArgs,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(args: DumpArgs) {
|
pub fn run(args: DumpArgs) {
|
||||||
@@ -28,13 +33,15 @@ pub fn run(args: DumpArgs) {
|
|||||||
info!(
|
info!(
|
||||||
"dumping {} partitions, {} genome(s)",
|
"dumping {} partitions, {} genome(s)",
|
||||||
idx.n_partitions(),
|
idx.n_partitions(),
|
||||||
idx.meta().genomes.len()
|
&idx.meta().genomes.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let filters = args.filter.build_filters(&idx.meta().genomes);
|
||||||
|
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
let mut out = BufWriter::new(stdout.lock());
|
let mut out = BufWriter::new(stdout.lock());
|
||||||
|
|
||||||
idx.dump(&mut out, args.force_presence, args.debug).unwrap_or_else(|e| {
|
idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| {
|
||||||
eprintln!("dump error: {e}");
|
eprintln!("dump error: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
use obikindex::GenomeInfo;
|
use obikindex::GenomeInfo;
|
||||||
use obikpartitionner::GroupQuorumFilter;
|
use obikpartitionner::{GroupQuorumFilter, KmerFilter};
|
||||||
|
|
||||||
// ── Operator ──────────────────────────────────────────────────────────────────
|
// ── Operator ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -141,6 +142,88 @@ fn classify(
|
|||||||
/// - `ingroup` predicates only: outgroup indices are empty.
|
/// - `ingroup` predicates only: outgroup indices are empty.
|
||||||
/// - `outgroup` predicates only: ingroup indices are empty.
|
/// - `outgroup` predicates only: ingroup indices are empty.
|
||||||
/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored.
|
/// - Both defined: ingroup wins on overlap; uncategorized genomes are ignored.
|
||||||
|
/// CLI args for ingroup/outgroup filtering — embeddable in any command via `#[command(flatten)]`.
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct FilterArgs {
|
||||||
|
/// Ingroup predicate (repeatable; AND). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
|
||||||
|
#[arg(long, value_name = "PRED")]
|
||||||
|
pub ingroup: Vec<String>,
|
||||||
|
|
||||||
|
/// Outgroup predicate (repeatable; OR). Forms: `key=v1|v2`, `key!=v`, `key~path`, `key!~path`, `*`/`all`
|
||||||
|
#[arg(long, value_name = "PRED")]
|
||||||
|
pub outgroup: Vec<String>,
|
||||||
|
|
||||||
|
/// Minimum number of ingroup genomes containing the k-mer
|
||||||
|
#[arg(long)]
|
||||||
|
pub min_count: Option<usize>,
|
||||||
|
|
||||||
|
/// Maximum number of ingroup genomes containing the k-mer
|
||||||
|
#[arg(long)]
|
||||||
|
pub max_count: Option<usize>,
|
||||||
|
|
||||||
|
/// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||||
|
#[arg(long)]
|
||||||
|
pub min_frac: Option<f64>,
|
||||||
|
|
||||||
|
/// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
||||||
|
#[arg(long)]
|
||||||
|
pub max_frac: Option<f64>,
|
||||||
|
|
||||||
|
/// Minimum number of outgroup genomes containing the k-mer
|
||||||
|
#[arg(long)]
|
||||||
|
pub min_outgroup_count: Option<usize>,
|
||||||
|
|
||||||
|
/// Maximum number of outgroup genomes containing the k-mer
|
||||||
|
#[arg(long)]
|
||||||
|
pub max_outgroup_count: Option<usize>,
|
||||||
|
|
||||||
|
/// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||||
|
#[arg(long)]
|
||||||
|
pub min_outgroup_frac: Option<f64>,
|
||||||
|
|
||||||
|
/// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
||||||
|
#[arg(long)]
|
||||||
|
pub max_outgroup_frac: Option<f64>,
|
||||||
|
|
||||||
|
/// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
|
||||||
|
#[arg(long, default_value = "0")]
|
||||||
|
pub presence_threshold: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FilterArgs {
|
||||||
|
/// Parse predicates and build a filter list ready to pass to `iter_partition_kmers`.
|
||||||
|
pub fn build_filters(&self, genomes: &[GenomeInfo]) -> Vec<Box<dyn KmerFilter>> {
|
||||||
|
let ingroup_preds: Vec<MetaPred> = self.ingroup.iter()
|
||||||
|
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error in --ingroup: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
}))
|
||||||
|
.collect();
|
||||||
|
let outgroup_preds: Vec<MetaPred> = self.outgroup.iter()
|
||||||
|
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error in --outgroup: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
}))
|
||||||
|
.collect();
|
||||||
|
vec![Box::new(build_group_filter(
|
||||||
|
genomes,
|
||||||
|
&ingroup_preds,
|
||||||
|
&outgroup_preds,
|
||||||
|
GroupFilterParams {
|
||||||
|
threshold: self.presence_threshold,
|
||||||
|
min_count: self.min_count,
|
||||||
|
max_count: self.max_count,
|
||||||
|
min_frac: self.min_frac,
|
||||||
|
max_frac: self.max_frac,
|
||||||
|
min_outgroup_count: self.min_outgroup_count,
|
||||||
|
max_outgroup_count: self.max_outgroup_count,
|
||||||
|
min_outgroup_frac: self.min_outgroup_frac,
|
||||||
|
max_outgroup_frac: self.max_outgroup_frac,
|
||||||
|
},
|
||||||
|
))]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct GroupFilterParams {
|
pub struct GroupFilterParams {
|
||||||
pub threshold: u32,
|
pub threshold: u32,
|
||||||
pub min_count: Option<usize>,
|
pub min_count: Option<usize>,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use obikpartitionner::filter::{MaxTotalCount, MinTotalCount};
|
|||||||
use obisys::Reporter;
|
use obisys::Reporter;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use super::predicate::{GroupFilterParams, MetaPred, build_group_filter};
|
use super::predicate::FilterArgs;
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
pub struct RebuildArgs {
|
pub struct RebuildArgs {
|
||||||
@@ -17,47 +17,8 @@ pub struct RebuildArgs {
|
|||||||
#[arg(short, long)]
|
#[arg(short, long)]
|
||||||
pub output: PathBuf,
|
pub output: PathBuf,
|
||||||
|
|
||||||
/// Ingroup predicate (repeatable; AND between flags).
|
#[command(flatten)]
|
||||||
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
|
pub filter: FilterArgs,
|
||||||
#[arg(long, value_name = "PRED")]
|
|
||||||
pub ingroup: Vec<String>,
|
|
||||||
|
|
||||||
/// Outgroup predicate (repeatable; OR between flags).
|
|
||||||
/// Forms: `key=val1|val2`, `key!=val`, `key~path`, `key!~path`, `*`/`all`
|
|
||||||
#[arg(long, value_name = "PRED")]
|
|
||||||
pub outgroup: Vec<String>,
|
|
||||||
|
|
||||||
/// Minimum number of ingroup genomes containing the k-mer
|
|
||||||
#[arg(long)]
|
|
||||||
pub min_count: Option<usize>,
|
|
||||||
|
|
||||||
/// Maximum number of ingroup genomes containing the k-mer
|
|
||||||
#[arg(long)]
|
|
||||||
pub max_count: Option<usize>,
|
|
||||||
|
|
||||||
/// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
|
||||||
#[arg(long)]
|
|
||||||
pub min_frac: Option<f64>,
|
|
||||||
|
|
||||||
/// Maximum fraction of ingroup genomes containing the k-mer [0.0–1.0]
|
|
||||||
#[arg(long)]
|
|
||||||
pub max_frac: Option<f64>,
|
|
||||||
|
|
||||||
/// Minimum number of outgroup genomes containing the k-mer
|
|
||||||
#[arg(long)]
|
|
||||||
pub min_outgroup_count: Option<usize>,
|
|
||||||
|
|
||||||
/// Maximum number of outgroup genomes containing the k-mer
|
|
||||||
#[arg(long)]
|
|
||||||
pub max_outgroup_count: Option<usize>,
|
|
||||||
|
|
||||||
/// Minimum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
|
||||||
#[arg(long)]
|
|
||||||
pub min_outgroup_frac: Option<f64>,
|
|
||||||
|
|
||||||
/// Maximum fraction of outgroup genomes containing the k-mer [0.0–1.0]
|
|
||||||
#[arg(long)]
|
|
||||||
pub max_outgroup_frac: Option<f64>,
|
|
||||||
|
|
||||||
/// Minimum total count across all genomes (count index only)
|
/// Minimum total count across all genomes (count index only)
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
@@ -67,10 +28,6 @@ pub struct RebuildArgs {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub max_total_count: Option<u32>,
|
pub max_total_count: Option<u32>,
|
||||||
|
|
||||||
/// Per-genome count threshold to consider a genome as "containing" the k-mer (default 0)
|
|
||||||
#[arg(long, default_value = "0")]
|
|
||||||
pub presence_threshold: u32,
|
|
||||||
|
|
||||||
/// Output as presence/absence instead of counts
|
/// Output as presence/absence instead of counts
|
||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub presence: bool,
|
pub presence: bool,
|
||||||
@@ -92,43 +49,12 @@ pub fn run(args: RebuildArgs) {
|
|||||||
MergeMode::Count
|
MergeMode::Count
|
||||||
};
|
};
|
||||||
|
|
||||||
let ingroup_preds: Vec<MetaPred> = args.ingroup.iter()
|
|
||||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error in --ingroup: {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
}))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let outgroup_preds: Vec<MetaPred> = args.outgroup.iter()
|
|
||||||
.map(|s| MetaPred::parse(s).unwrap_or_else(|e| {
|
|
||||||
eprintln!("error in --outgroup: {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
}))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"rebuild: {} genome(s), mode={:?}, source={}",
|
"rebuild: {} genome(s), mode={:?}, source={}",
|
||||||
src.meta().genomes.len(), mode, args.source.display()
|
&src.meta().genomes.len(), mode, args.source.display()
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut filters: Vec<Box<dyn obikpartitionner::KmerFilter>> = Vec::new();
|
let mut filters = args.filter.build_filters(&src.meta().genomes);
|
||||||
|
|
||||||
filters.push(Box::new(build_group_filter(
|
|
||||||
&src.meta().genomes,
|
|
||||||
&ingroup_preds,
|
|
||||||
&outgroup_preds,
|
|
||||||
GroupFilterParams {
|
|
||||||
threshold: args.presence_threshold,
|
|
||||||
min_count: args.min_count,
|
|
||||||
max_count: args.max_count,
|
|
||||||
min_frac: args.min_frac,
|
|
||||||
max_frac: args.max_frac,
|
|
||||||
min_outgroup_count: args.min_outgroup_count,
|
|
||||||
max_outgroup_count: args.max_outgroup_count,
|
|
||||||
min_outgroup_frac: args.min_outgroup_frac,
|
|
||||||
max_outgroup_frac: args.max_outgroup_frac,
|
|
||||||
},
|
|
||||||
)));
|
|
||||||
|
|
||||||
if let Some(v) = args.min_total_count {
|
if let Some(v) = args.min_total_count {
|
||||||
filters.push(Box::new(MinTotalCount { total: v }));
|
filters.push(Box::new(MinTotalCount { total: v }));
|
||||||
|
|||||||
Reference in New Issue
Block a user