feat: add --head and --presence-threshold to dump and distance

Introduces `--head N` to the `dump` command for early iteration termination and `--presence-threshold N` to the `distance` command for Jaccard filtering on count indexes. Updates filter defaults to adapt based on explicit ingroup/outgroup declarations. Fixes a Rust type mismatch in the unitig closure and updates partition iteration callbacks to return `bool` for proper early termination support. Documentation is updated accordingly.
This commit is contained in:
Eric Coissac
2026-06-09 09:47:44 +02:00
parent 650eea43b6
commit d626d42ec7
7 changed files with 105 additions and 29 deletions
+12 -4
View File
@@ -20,6 +20,7 @@ impl KmerIndex {
out: &mut W,
force_presence: bool,
debug: bool,
head: Option<usize>,
filters: &[Box<dyn KmerFilter>],
) -> OKIResult<()> {
let genomes = &self.meta.genomes;
@@ -39,8 +40,10 @@ impl KmerIndex {
// ── Rows ──────────────────────────────────────────────────────────────
let n = self.n_partitions();
let mut remaining = head.unwrap_or(usize::MAX);
for i in 0..n {
if debug {
if remaining == 0 { break; }
let cont = if debug {
self.partition
.iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
let seq = String::from_utf8(kmer.to_ascii())
@@ -48,8 +51,10 @@ impl KmerIndex {
let _ = write!(out, "{part},{layer},{seq}");
for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out);
remaining -= 1;
remaining > 0
})
.map_err(OKIError::Partition)?;
.map_err(OKIError::Partition)?
} else {
self.partition
.iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
@@ -58,9 +63,12 @@ impl KmerIndex {
let _ = write!(out, "{seq}");
for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out);
remaining -= 1;
remaining > 0
})
.map_err(OKIError::Partition)?;
}
.map_err(OKIError::Partition)?
};
if !cont { break; }
}
out.flush()?;
+5 -1
View File
@@ -20,6 +20,10 @@ pub struct DumpArgs {
#[arg(long, default_value_t = false)]
pub debug: bool,
/// Only output the first N kmers
#[arg(long)]
pub head: Option<usize>,
#[command(flatten)]
pub filter: FilterArgs,
}
@@ -41,7 +45,7 @@ pub fn run(args: DumpArgs) {
let stdout = io::stdout();
let mut out = BufWriter::new(stdout.lock());
idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| {
idx.dump(&mut out, args.force_presence, args.debug, args.head, &filters).unwrap_or_else(|e| {
eprintln!("dump error: {e}");
std::process::exit(1);
});
+7 -2
View File
@@ -162,6 +162,7 @@ pub struct FilterArgs {
pub max_count: Option<usize>,
/// Minimum fraction of ingroup genomes containing the k-mer, in the range [0.0, 1.0]
/// (default 1.0 when --ingroup is set, 0.0 otherwise)
#[arg(long)]
pub min_frac: Option<f64>,
@@ -174,6 +175,7 @@ pub struct FilterArgs {
pub min_outgroup_count: Option<usize>,
/// Maximum number of outgroup genomes containing the k-mer
/// (default 0 when --outgroup is set, no constraint otherwise)
#[arg(long)]
pub max_outgroup_count: Option<usize>,
@@ -258,16 +260,19 @@ pub fn build_group_filter(
let in_size = ingroup_idx.len();
let out_size = outgroup_idx.len();
let default_min_frac = if !ingroup_preds.is_empty() { 1.0 } else { 0.0 };
let default_max_outgroup_count = if !outgroup_preds.is_empty() { 0 } else { out_size };
GroupQuorumFilter {
ingroup_idx,
outgroup_idx,
threshold: p.threshold,
min_count: p.min_count.unwrap_or(0),
max_count: p.max_count.unwrap_or(in_size),
min_frac: p.min_frac.unwrap_or(0.0),
min_frac: p.min_frac.unwrap_or(default_min_frac),
max_frac: p.max_frac.unwrap_or(1.0),
min_outgroup_count: p.min_outgroup_count.unwrap_or(0),
max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size),
max_outgroup_count: p.max_outgroup_count.unwrap_or(default_max_outgroup_count),
min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0),
max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0),
}
+1
View File
@@ -48,6 +48,7 @@ pub fn run(args: UnitigArgs) {
partition
.iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| {
local_g.push(kmer);
true
})
.unwrap_or_else(|e| {
eprintln!("error reading partition {i}: {e}");
+43 -18
View File
@@ -26,17 +26,19 @@ impl KmerPartition {
/// If no data matrix exists for a layer (pure set-membership, single genome),
/// a row of `n_genomes` ones is emitted for every kmer in that layer — unless
/// the filter rejects it, in which case the whole layer is skipped.
/// The callback returns `true` to continue iterating, or `false` to stop early.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers(
&self,
part: usize,
use_counts: bool,
n_genomes: usize,
filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
) -> SKResult<()> {
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(());
return Ok(true);
}
let index_mode = PartitionMeta::load(&index_dir)
@@ -54,56 +56,68 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) {
cb(kmer, row);
cont = cb(kmer, row);
if !cont { break; }
}
}
}
cont
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) {
cb(kmer, row);
cont = cb(kmer, row);
if !cont { break; }
}
}
}
cont
} else {
// No data matrix: implicit presence — all values are 1.
// The filter result is identical for every kmer, so evaluate once.
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() {
cb(kmer, all_present.clone());
cont = cb(kmer, all_present.clone());
if !cont { break; }
}
}
}
}
cont
};
if !cont { return Ok(false); }
}
Ok(())
Ok(true)
}
/// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)`
/// indices, enabling debug output that identifies where each kmer was stored.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers_located(
&self,
part: usize,
use_counts: bool,
n_genomes: usize,
filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>),
) -> SKResult<()> {
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(());
return Ok(true);
}
let index_mode = PartitionMeta::load(&index_dir)
@@ -120,39 +134,50 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row);
cont = cb(part, layer, kmer, row);
if !cont { break; }
}
}
}
cont
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row);
cont = cb(part, layer, kmer, row);
if !cont { break; }
}
}
}
cont
} else {
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() {
cb(part, layer, kmer, all_present.clone());
cont = cb(part, layer, kmer, all_present.clone());
if !cont { break; }
}
}
}
}
cont
};
if !cont { return Ok(false); }
layer += 1;
}
Ok(())
Ok(true)
}
}