diff --git a/docmd/implementation/rebuild_filter.md b/docmd/implementation/rebuild_filter.md index 88bf3b2..7d4cc4e 100644 --- a/docmd/implementation/rebuild_filter.md +++ b/docmd/implementation/rebuild_filter.md @@ -96,8 +96,16 @@ For each genome: | `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`rebuild` only) | | `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) | -Defaults: mins = 0 (no lower bound), max counts = group size, max fracs = 1.0 -(no upper bound). A filter with all defaults is a no-op. +**Conditional defaults** — the default for `--min-frac` and `--max-outgroup-count` depends on whether the corresponding group was explicitly declared: + +| Situation | `--min-frac` default | `--max-outgroup-count` default | +|-----------|----------------------|-------------------------------| +| Neither `--ingroup` nor `--outgroup` | 0.0 (no-op) | no constraint (no-op) | +| `--ingroup` only | **1.0** — all ingroup genomes must carry the k-mer | no constraint | +| `--outgroup` only | 0.0 | **0** — no outgroup genome may carry the k-mer | +| Both declared | **1.0** | **0** | + +Explicit flags always override these defaults. All other bounds (`--min-count`, `--max-count`, `--max-frac`, `--min-outgroup-*`) default to 0 / group size / 1.0 regardless of whether groups are declared. Fractions are computed over the size of the classified group, not over total genome count. An empty group (no genome classified as ingroup/outgroup) never @@ -169,6 +177,31 @@ obikmer unitig myindex \ --max-outgroup-count 0 ``` +## Command-specific options + +### `dump --head N` + +Stops output after the first N k-mers that pass all active filters. +Iteration terminates immediately — subsequent partitions and layers are not scanned. +Useful for quick inspection of large indexes without loading the entire dataset. + +```sh +obikmer dump myindex --head 100 +obikmer dump myindex --head 20 --ingroup "species=Betula_nana" --min-count 1 +``` + +### `distance --presence-threshold N` + +When computing Jaccard distance on a **count index**, a k-mer is considered present in a genome if its count is ≥ N (default 1). +This option is independent of the `--presence-threshold` used in `rebuild`/`query` filtering. + +```sh +# Jaccard treating kmers with count ≥ 2 as present +obikmer distance myindex --metric jaccard --presence-threshold 2 +``` + +This parameter has no effect on presence/absence indexes (where values are already 0/1) or on metrics other than Jaccard. + ## Implementation - **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter` diff --git a/docmd/index.md b/docmd/index.md index 5e04a48..f373720 100644 --- a/docmd/index.md +++ b/docmd/index.md @@ -11,9 +11,9 @@ | `merge` | Merge multiple built indexes into one | | `rebuild` | Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata | | `query` | Query an index with sequences and annotate matches | -| `dump` | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild` | +| `dump` | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild`; `--head N` limits output to the first N k-mers | | `annotate` | Add or update genome metadata from a CSV file; or dump metadata as CSV | -| `distance` | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees | +| `distance` | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees; `--presence-threshold N` sets the minimum count to consider a k-mer present when computing Jaccard on count indexes (default 1) | | `unitig` | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as `rebuild` | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `reindex` | Convert an index's evidence in-place: exact ↔ approx | diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs index 6e8481a..917dff2 100644 --- a/src/obikindex/src/dump.rs +++ b/src/obikindex/src/dump.rs @@ -20,6 +20,7 @@ impl KmerIndex { out: &mut W, force_presence: bool, debug: bool, + head: Option, filters: &[Box], ) -> OKIResult<()> { let genomes = &self.meta.genomes; @@ -39,8 +40,10 @@ impl KmerIndex { // ── Rows ────────────────────────────────────────────────────────────── let n = self.n_partitions(); + let mut remaining = head.unwrap_or(usize::MAX); for i in 0..n { - if debug { + if remaining == 0 { break; } + let cont = if debug { self.partition .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| { let seq = String::from_utf8(kmer.to_ascii()) @@ -48,8 +51,10 @@ impl KmerIndex { let _ = write!(out, "{part},{layer},{seq}"); for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); + remaining -= 1; + remaining > 0 }) - .map_err(OKIError::Partition)?; + .map_err(OKIError::Partition)? } else { self.partition .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| { @@ -58,9 +63,12 @@ impl KmerIndex { let _ = write!(out, "{seq}"); for &v in row.iter() { let _ = write!(out, ",{v}"); } let _ = writeln!(out); + remaining -= 1; + remaining > 0 }) - .map_err(OKIError::Partition)?; - } + .map_err(OKIError::Partition)? + }; + if !cont { break; } } out.flush()?; diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs index 7ec5dc7..437d7c3 100644 --- a/src/obikmer/src/cmd/dump.rs +++ b/src/obikmer/src/cmd/dump.rs @@ -20,6 +20,10 @@ pub struct DumpArgs { #[arg(long, default_value_t = false)] pub debug: bool, + /// Only output the first N kmers + #[arg(long)] + pub head: Option, + #[command(flatten)] pub filter: FilterArgs, } @@ -41,7 +45,7 @@ pub fn run(args: DumpArgs) { let stdout = io::stdout(); let mut out = BufWriter::new(stdout.lock()); - idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| { + idx.dump(&mut out, args.force_presence, args.debug, args.head, &filters).unwrap_or_else(|e| { eprintln!("dump error: {e}"); std::process::exit(1); }); diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs index e568b58..dc1102d 100644 --- a/src/obikmer/src/cmd/predicate.rs +++ b/src/obikmer/src/cmd/predicate.rs @@ -162,6 +162,7 @@ pub struct FilterArgs { pub max_count: Option, /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0] + /// (default 1.0 when --ingroup is set, 0.0 otherwise) #[arg(long)] pub min_frac: Option, @@ -174,6 +175,7 @@ pub struct FilterArgs { pub min_outgroup_count: Option, /// Maximum number of outgroup genomes containing the k-mer + /// (default 0 when --outgroup is set, no constraint otherwise) #[arg(long)] pub max_outgroup_count: Option, @@ -258,16 +260,19 @@ pub fn build_group_filter( let in_size = ingroup_idx.len(); let out_size = outgroup_idx.len(); + let default_min_frac = if !ingroup_preds.is_empty() { 1.0 } else { 0.0 }; + let default_max_outgroup_count = if !outgroup_preds.is_empty() { 0 } else { out_size }; + GroupQuorumFilter { ingroup_idx, outgroup_idx, threshold: p.threshold, min_count: p.min_count.unwrap_or(0), max_count: p.max_count.unwrap_or(in_size), - min_frac: p.min_frac.unwrap_or(0.0), + min_frac: p.min_frac.unwrap_or(default_min_frac), max_frac: p.max_frac.unwrap_or(1.0), min_outgroup_count: p.min_outgroup_count.unwrap_or(0), - max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size), + max_outgroup_count: p.max_outgroup_count.unwrap_or(default_max_outgroup_count), min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0), max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0), } diff --git a/src/obikmer/src/cmd/unitig.rs b/src/obikmer/src/cmd/unitig.rs index 461943d..daac7cf 100644 --- a/src/obikmer/src/cmd/unitig.rs +++ b/src/obikmer/src/cmd/unitig.rs @@ -48,6 +48,7 @@ pub fn run(args: UnitigArgs) { partition .iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| { local_g.push(kmer); + true }) .unwrap_or_else(|e| { eprintln!("error reading partition {i}: {e}"); diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs index adc26ac..e1c8226 100644 --- a/src/obikpartitionner/src/dump_layer.rs +++ b/src/obikpartitionner/src/dump_layer.rs @@ -26,17 +26,19 @@ impl KmerPartition { /// If no data matrix exists for a layer (pure set-membership, single genome), /// a row of `n_genomes` ones is emitted for every kmer in that layer — unless /// the filter rejects it, in which case the whole layer is skipped. + /// Like [`iter_partition_kmers`] but the callback returns `false` to stop early. + /// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted. pub fn iter_partition_kmers( &self, part: usize, use_counts: bool, n_genomes: usize, filters: &[Box], - mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), - ) -> SKResult<()> { + mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool, + ) -> SKResult { let index_dir = self.part_dir(part).join(INDEX_SUBDIR); if !index_dir.exists() { - return Ok(()); + return Ok(true); } let index_mode = PartitionMeta::load(&index_dir) @@ -54,56 +56,68 @@ impl KmerPartition { let counts_dir = layer_dir.join("counts"); let presence_dir = layer_dir.join("presence"); - if use_counts && counts_dir.exists() { + let cont = if use_counts && counts_dir.exists() { let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; + let mut cont = true; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row = mat.row(slot); if passes_all(filters, &row, n_genomes) { - cb(kmer, row); + cont = cb(kmer, row); + if !cont { break; } } } } + cont } else if !use_counts && presence_dir.exists() { let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?; + let mut cont = true; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); if passes_all(filters, &row, n_genomes) { - cb(kmer, row); + cont = cb(kmer, row); + if !cont { break; } } } } + cont } else { // No data matrix: implicit presence — all values are 1. // The filter result is identical for every kmer, so evaluate once. let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); + let mut cont = true; if passes_all(filters, &all_present, n_genomes) { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if mphf.find(kmer).is_some() { - cb(kmer, all_present.clone()); + cont = cb(kmer, all_present.clone()); + if !cont { break; } } } } - } + cont + }; + + if !cont { return Ok(false); } } - Ok(()) + Ok(true) } /// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)` /// indices, enabling debug output that identifies where each kmer was stored. + /// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted. pub fn iter_partition_kmers_located( &self, part: usize, use_counts: bool, n_genomes: usize, filters: &[Box], - mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>), - ) -> SKResult<()> { + mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool, + ) -> SKResult { let index_dir = self.part_dir(part).join(INDEX_SUBDIR); if !index_dir.exists() { - return Ok(()); + return Ok(true); } let index_mode = PartitionMeta::load(&index_dir) @@ -120,39 +134,50 @@ impl KmerPartition { let counts_dir = layer_dir.join("counts"); let presence_dir = layer_dir.join("presence"); - if use_counts && counts_dir.exists() { + let cont = if use_counts && counts_dir.exists() { let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; + let mut cont = true; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row = mat.row(slot); if passes_all(filters, &row, n_genomes) { - cb(part, layer, kmer, row); + cont = cb(part, layer, kmer, row); + if !cont { break; } } } } + cont } else if !use_counts && presence_dir.exists() { let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?; + let mut cont = true; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if let Some(slot) = mphf.find(kmer) { let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); if passes_all(filters, &row, n_genomes) { - cb(part, layer, kmer, row); + cont = cb(part, layer, kmer, row); + if !cont { break; } } } } + cont } else { let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); + let mut cont = true; if passes_all(filters, &all_present, n_genomes) { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { if mphf.find(kmer).is_some() { - cb(part, layer, kmer, all_present.clone()); + cont = cb(part, layer, kmer, all_present.clone()); + if !cont { break; } } } } - } + cont + }; + + if !cont { return Ok(false); } layer += 1; } - Ok(()) + Ok(true) } }