feat: add --head and --presence-threshold to dump and distance

Introduces `--head N` to the `dump` command for early iteration termination and `--presence-threshold N` to the `distance` command for Jaccard filtering on count indexes. Updates filter defaults to adapt based on explicit ingroup/outgroup declarations. Fixes a Rust type mismatch in the unitig closure and updates partition iteration callbacks to return `bool` for proper early termination support. Documentation is updated accordingly.
This commit is contained in:
Eric Coissac
2026-06-09 09:47:44 +02:00
parent 650eea43b6
commit d626d42ec7
7 changed files with 105 additions and 29 deletions
+43 -18
View File
@@ -26,17 +26,19 @@ impl KmerPartition {
/// If no data matrix exists for a layer (pure set-membership, single genome),
/// a row of `n_genomes` ones is emitted for every kmer in that layer — unless
/// the filter rejects it, in which case the whole layer is skipped.
/// The callback returns `true` to continue iterating and `false` to stop early.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers(
&self,
part: usize,
use_counts: bool,
n_genomes: usize,
filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
) -> SKResult<()> {
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(());
return Ok(true);
}
let index_mode = PartitionMeta::load(&index_dir)
@@ -54,56 +56,68 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) {
cb(kmer, row);
cont = cb(kmer, row);
if !cont { break; }
}
}
}
cont
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) {
cb(kmer, row);
cont = cb(kmer, row);
if !cont { break; }
}
}
}
cont
} else {
// No data matrix: implicit presence — all values are 1.
// The filter result is identical for every kmer, so evaluate once.
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() {
cb(kmer, all_present.clone());
cont = cb(kmer, all_present.clone());
if !cont { break; }
}
}
}
}
cont
};
if !cont { return Ok(false); }
}
Ok(())
Ok(true)
}
/// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)`
/// indices, enabling debug output that identifies where each kmer was stored.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers_located(
&self,
part: usize,
use_counts: bool,
n_genomes: usize,
filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>),
) -> SKResult<()> {
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() {
return Ok(());
return Ok(true);
}
let index_mode = PartitionMeta::load(&index_dir)
@@ -120,39 +134,50 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() {
let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row);
cont = cb(part, layer, kmer, row);
if !cont { break; }
}
}
}
cont
} else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row);
cont = cb(part, layer, kmer, row);
if !cont { break; }
}
}
}
cont
} else {
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() {
cb(part, layer, kmer, all_present.clone());
cont = cb(part, layer, kmer, all_present.clone());
if !cont { break; }
}
}
}
}
cont
};
if !cont { return Ok(false); }
layer += 1;
}
Ok(())
Ok(true)
}
}