feat: add --head and --presence-threshold to dump and distance
Introduces `--head N` to the `dump` command for early iteration termination and `--presence-threshold N` to the `distance` command for Jaccard filtering on count indexes. Updates filter defaults to adapt based on explicit ingroup/outgroup declarations. Fixes a Rust type mismatch in the unitig closure and updates partition iteration callbacks to return `bool` for proper early termination support. Documentation is updated accordingly.
This commit is contained in:
@@ -26,17 +26,19 @@ impl KmerPartition {
|
||||
/// If no data matrix exists for a layer (pure set-membership, single genome),
|
||||
/// a row of `n_genomes` ones is emitted for every kmer in that layer — unless
|
||||
/// the filter rejects it, in which case the whole layer is skipped.
|
||||
/// The callback returns `true` to continue iterating, or `false` to stop early.
|
||||
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
|
||||
pub fn iter_partition_kmers(
|
||||
&self,
|
||||
part: usize,
|
||||
use_counts: bool,
|
||||
n_genomes: usize,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
||||
) -> SKResult<()> {
|
||||
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool,
|
||||
) -> SKResult<bool> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(());
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
let index_mode = PartitionMeta::load(&index_dir)
|
||||
@@ -54,56 +56,68 @@ impl KmerPartition {
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
|
||||
if use_counts && counts_dir.exists() {
|
||||
let cont = if use_counts && counts_dir.exists() {
|
||||
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
let mut cont = true;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row = mat.row(slot);
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
cb(kmer, row);
|
||||
cont = cb(kmer, row);
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
} else if !use_counts && presence_dir.exists() {
|
||||
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
let mut cont = true;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
cb(kmer, row);
|
||||
cont = cb(kmer, row);
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
} else {
|
||||
// No data matrix: implicit presence — all values are 1.
|
||||
// The filter result is identical for every kmer, so evaluate once.
|
||||
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
|
||||
let mut cont = true;
|
||||
if passes_all(filters, &all_present, n_genomes) {
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if mphf.find(kmer).is_some() {
|
||||
cb(kmer, all_present.clone());
|
||||
cont = cb(kmer, all_present.clone());
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
};
|
||||
|
||||
if !cont { return Ok(false); }
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)`
|
||||
/// indices, enabling debug output that identifies where each kmer was stored.
|
||||
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
|
||||
pub fn iter_partition_kmers_located(
|
||||
&self,
|
||||
part: usize,
|
||||
use_counts: bool,
|
||||
n_genomes: usize,
|
||||
filters: &[Box<dyn KmerFilter>],
|
||||
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>),
|
||||
) -> SKResult<()> {
|
||||
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool,
|
||||
) -> SKResult<bool> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(());
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
let index_mode = PartitionMeta::load(&index_dir)
|
||||
@@ -120,39 +134,50 @@ impl KmerPartition {
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
|
||||
if use_counts && counts_dir.exists() {
|
||||
let cont = if use_counts && counts_dir.exists() {
|
||||
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
let mut cont = true;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row = mat.row(slot);
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
cb(part, layer, kmer, row);
|
||||
cont = cb(part, layer, kmer, row);
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
} else if !use_counts && presence_dir.exists() {
|
||||
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
|
||||
let mut cont = true;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||
if passes_all(filters, &row, n_genomes) {
|
||||
cb(part, layer, kmer, row);
|
||||
cont = cb(part, layer, kmer, row);
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
} else {
|
||||
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
|
||||
let mut cont = true;
|
||||
if passes_all(filters, &all_present, n_genomes) {
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if mphf.find(kmer).is_some() {
|
||||
cb(part, layer, kmer, all_present.clone());
|
||||
cont = cb(part, layer, kmer, all_present.clone());
|
||||
if !cont { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cont
|
||||
};
|
||||
|
||||
if !cont { return Ok(false); }
|
||||
layer += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(true)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user