From 694da5208e40c54b2a5ab679ba8cec5b449de39e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 26 May 2026 15:25:57 +0200 Subject: [PATCH] feat: add Findere z-window filtering and detail mode coverage tracking Introduces the `--findere-z` CLI flag to override the index's sliding window parameter and implements `apply_findere` to filter k-mer hits using a z-consecutive positions window. Adds structural support for `--detail` mode, including per-sequence k-mer offsets, conditional allocation of per-position coverage vectors, and JSON serialization. Updates architecture documentation, CLI references, and annotation schemas to align with the new implementation, resolving prior discrepancies with `--detail` and `--mismatch` flags. --- docmd/architecture/query.md | 117 +++++++++++++++-------- src/obikmer/src/cmd/query.rs | 174 ++++++++++++++++++++++++++++------- 2 files changed, 218 insertions(+), 73 deletions(-) diff --git a/docmd/architecture/query.md b/docmd/architecture/query.md index cb83859..f53896d 100644 --- a/docmd/architecture/query.md +++ b/docmd/architecture/query.md @@ -26,7 +26,8 @@ for each batch of sequences: query_partition(p, superkmers_routed_to_p) → load QueryLayer(s) for p → for each kmer in each superkmer: MphfLayer::find(kmer) - broadcast results back to each (seq_idx, kmer_offset) that referenced the superkmer + apply_findere(sk_kmer_results, effective_z) ← per superkmer + broadcast confirmed results back to each (seq_idx, kmer_offset) emit annotated sequences ``` @@ -36,28 +37,46 @@ Parallelism is **not yet active** in the current implementation: batches are pro --- -## Layer lookup: `MphfLayer::find` +## Findere z-window filter -`MphfLayer::open` reads `layer_meta.json` and loads either exact or approximate evidence. The caller (`QueryLayer::find`) never chooses the dispatch path — it is fixed at open time by `LayerEvidence`: +For approximate and hybrid index modes, a raw k-mer hit is a positive from the fingerprint test with false-positive rate 1/2^b. The Findere scheme reduces the effective FP rate to 1/2^(b·z) by requiring **z consecutive k-mers** to all test positive before any of them is counted as a confirmed match. + +`apply_findere` implements this as a sliding-window confirmation, independently for each genome: ```rust -pub fn find(&self, kmer: CanonicalKmer) -> Option { - match &self.ev { - LayerEvidence::Exact { .. } => self.find_exact(kmer), - LayerEvidence::Approx { .. } => self.find_approx(kmer), - } -} +fn apply_findere( + results: &[Option>], + z: usize, + n_genomes: usize, +) -> Vec>> ``` -### Exact layers +For each genome g, a position i is confirmed if there exists at least one window of z consecutive positions `[j, j+z)` that contains i and where all z positions are present for g (i.e. `results[pos]` is `Some(row)` and `row[g] > 0`). The implementation uses a single O(n) sliding-window scan per genome. -`find_exact` maps the k-mer through the MPHF to a slot, then calls `UnitigFileReader::verify_canonical_kmer(chunk_id, rank, kmer)` to confirm the stored k-mer matches. Zero false positives. Requires `UnitigFileReader::open()` (random-access via `.idx`); `open_sequential()` cannot serve random-access verification. +Unconfirmed positions are zeroed in the returned row. If all genomes are zeroed for a position, it is returned as `None`. -### Approximate layers +**Short superkmers**: when a superkmer contains fewer than z k-mers, no complete z-window can be formed. Rather than discarding all results, `apply_findere` returns them unchanged (no filter applied). This avoids penalising legitimate short sequences near read ends. -`find_approx` maps the k-mer through the MPHF, then checks a stored `b`-bit fingerprint of the canonical hash. False-positive rate: **1/2^b per k-mer query**. No `.idx` file is needed; the layer carries only `fingerprint.bin`. +**Exact indexes**: `z` is effectively 1 (every k-mer is its own window). `apply_findere` is a no-op. -For a query window of z consecutive k-mers (Findere scheme), the false-positive rate per window is **1/2^(b·z)**. The `z` parameter is recorded in `layer_meta.json` at build time but is not enforced during querying — the caller is responsible for interpreting window-level results accordingly. +### Effective z at query time + +`effective_z` is resolved at the start of `run()`: + +```rust +let effective_z = args.findere_z.unwrap_or_else(|| match idx.meta().config.evidence { + IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize, + IndexMode::Exact => 1, +}); +``` + +The `-z` CLI option overrides the index metadata value. A higher z increases stringency (lower FP, some true positives may be discarded at sequence ends); a lower z increases sensitivity. + +--- + +## Layer lookup: `MphfLayer::find` + +`MphfLayer::open(dir, mode: &IndexMode)` receives the mode from `PartitionMeta` — no per-layer file is read. The caller (`QueryLayer`) never chooses the dispatch path: it is fixed at open time by `LayerEvidence`. See [obilayeredmap](../implementation/obilayeredmap.md) for the full `find` / `find_strict` API. ### `QueryLayer` variant selection @@ -72,18 +91,6 @@ For a query window of z consecutive k-mers (Findere scheme), the false-positive --- -## `open()` vs `open_sequential()` - -`UnitigFileReader::open()` loads the `.idx` block-offset table, enabling random access to individual unitig chunks. It is required whenever `verify_canonical_kmer` is called (exact layers at query time). - -`UnitigFileReader::open_sequential()` skips the `.idx` and supports only forward iteration. It is sufficient for: -- build passes that scan all unitigs sequentially (`build_exact_evidence`, `build_approx_evidence`); -- the `unitig` subcommand, which iterates and prints unitigs without random access. - -`KmerIndex::open()` (called by `query::run`) triggers `MphfLayer::open` for each layer, which calls `UnitigFileReader::open()` for exact layers. Approximate layers do not open a unitig reader at all. - ---- - ## Presence / count mode at query time The `--force-presence` flag and `--presence-threshold` control how per-genome values are accumulated, independently of what the index stores: @@ -96,49 +103,83 @@ genome_totals[g] += if presence { u32::from(v >= threshold) } else { v } --- +## Coverage vectors (`--detail`) + +When `--detail` is requested, a 3-D accumulator `cov[seq_idx][genome][kmer_pos]` is allocated before the partition loop, with dimensions derived from `batch.n_kmers`: + +``` +cov[seq_idx][g][abs_pos] += contribution +where abs_pos = desc.kmer_offset + local_pos (absolute kmer position in the sequence) +``` + +Coverage reflects confirmed k-mers only (post-Findere). The vectors are emitted in the JSON annotation under the key `"coverage"`. + +--- + +## `kmer_missing` semantics + +`kmer_missing` counts k-mers that returned `None` from the index before Findere filtering — i.e. k-mers truly absent from every layer. K-mers that were found in the index but rejected by the z-window filter are not counted as missing. + +--- + ## Output format Output sequences are written in **OBITools4 format**: the original sequence with a JSON annotation map in the title line. ``` ->read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7,...}} +>read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7}} ATCGATCG... ``` -Genome keys in `kmer_strict_matches` are genome labels from `index.meta`. Key order follows iteration order of `meta.genomes`. +With `--detail`: + +``` +>read_id {"kmer_count":59,"kmer_strict_matches":{...},"coverage":{"genome_a":[0,1,2,...],...}} +ATCGATCG... +``` + +Genome keys follow the iteration order of `meta.genomes`. --- -## Annotation schema (current implementation) +## Annotation schema | Key | Type | Condition | Semantics | |---|---|---|---| -| `kmer_count` | int | always | k-mers with at least one match | -| `kmer_missing` | int | `--count-missing` | k-mers absent from every layer | +| `kmer_count` | int | always | k-mers confirmed (post-Findere) with at least one genome match | +| `kmer_missing` | int | `--count-missing` | k-mers absent from the index entirely (pre-Findere None) | | `kmer_strict_matches` | object | always | per-genome accumulated value (label → count or 0/1) | +| `coverage` | object | `--detail` | per-genome array of per-position contributions (label → [u32]) | -`kmer_count` counts matched k-mer positions (incremented once per `Some(row)` hit regardless of how many genomes are covered). `kmer_missing` counts `None` hits. - -**Note on doc/impl divergence:** the doc previously used keys `kmer_total`, `kmer_found`, and `kmer_match` (list). The implementation uses `kmer_count` (int, matched only) and `kmer_strict_matches` (object keyed by genome label). `--mismatch` and `--detail` are parsed but not yet implemented and emit a warning. +`kmer_count + kmer_missing` ≤ total k-mers in the sequence. The gap (if any) corresponds to k-mers found in the index but rejected by the Findere z-window filter. --- ## CLI ``` -obikmer query -i [--detail] [--mismatch] [--count-missing] +obikmer query [--detail] [--mismatch] [--count-missing] [--force-presence] [--presence-threshold ] - [-T ] [ ...] + [-z ] [-T ] + [ ...] ``` -`--mismatch` and `--detail` are accepted but currently ignored with a warning on stderr. +| Option | Default | Semantics | +|---|---|---| +| `-z` / `--findere-z` | from index metadata | Override Findere z parameter | +| `--detail` | off | Emit per-position coverage vectors in JSON | +| `--count-missing` | off | Add `kmer_missing` field to JSON | +| `--force-presence` | off | Report 0/1 per genome regardless of index counts | +| `--presence-threshold` | 1 | Minimum count to declare genome present | +| `-T` / `--threads` | all CPUs | Worker threads (parallelism not yet active) | + +`--mismatch` is accepted but currently ignored with a warning on stderr. --- ## Future work - **`--mismatch`**: 1-mismatch approximate matching — generate `3·k` single-substitution variants per k-mer, look each up independently. -- **`--detail`**: per-position coverage vectors (`cov_`) per genome. - **Read classification** (`--classify`): assign each read to the genome with the highest match score. - **Parallelism**: activate per-partition or per-sequence worker threads using the already-parsed `--threads` value. - **Whitelist / blacklist filtering**: threshold-based accept/reject on per-genome match scores. diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 51aa7f8..f86ec02 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use clap::Args; use obikindex::KmerIndex; +use obilayeredmap::IndexMode; use obiread::record::{SeqRecord, parse_chunk}; use obiread::chunk::read_sequence_chunks; use obikseq::{RoutableSuperKmer, set_k, set_m}; @@ -21,7 +22,7 @@ pub struct QueryArgs { #[arg(num_args = 1..)] pub inputs: Vec, - /// Also report per-position coverage vectors (cov_ per genome) + /// Report per-position coverage vectors per genome (adds "coverage" to JSON) #[arg(long)] pub detail: bool, @@ -41,6 +42,10 @@ pub struct QueryArgs { #[arg(long, default_value_t = 1)] pub presence_threshold: u32, + /// Override the Findere z parameter from index metadata + #[arg(short = 'z', long)] + pub findere_z: Option, + /// Number of worker threads #[arg( short = 'T', @@ -59,25 +64,18 @@ pub struct SKDesc { /// Index of the source sequence within the batch. pub seq_idx: u32, /// Kmer offset of the first kmer of this superkmer within its sequence. - /// Reserved for `--detail` coverage vectors (not yet consumed). - #[allow(dead_code)] pub kmer_offset: u32, } // ── QueryBatch ──────────────────────────────────────────────────────────────── /// A batch of query sequences with their superkmers deduplicated. -/// -/// Each unique `RoutableSuperKmer` maps to all the (seq_idx, kmer_offset) -/// positions it occupies across the batch. The superkmer is queried once -/// per partition; the result is broadcast to every SKDesc entry. pub struct QueryBatch { /// Sequence ids in batch order. pub ids: Vec, /// Raw sequence bytes (for output), in batch order. pub seqs: Vec>, - /// Per-sequence total kmer count. Reserved for `--detail` (not yet consumed). - #[allow(dead_code)] + /// Total kmer count per sequence (used for `--detail` coverage allocation). pub n_kmers: Vec, /// Deduplicated superkmer map. pub map: HashMap>, @@ -91,8 +89,8 @@ impl QueryBatch { level_max: usize, theta: f64, ) -> Self { - let mut ids = Vec::with_capacity(records.len()); - let mut seqs = Vec::with_capacity(records.len()); + let mut ids = Vec::with_capacity(records.len()); + let mut seqs = Vec::with_capacity(records.len()); let mut n_kmers = Vec::with_capacity(records.len()); let mut map: HashMap> = HashMap::new(); @@ -116,9 +114,6 @@ impl QueryBatch { } /// Split the superkmer map by partition index. - /// - /// Returns a vec of length `n_partitions`; each slot holds the RSK refs - /// whose minimizer routes to that partition. pub fn split_by_partition(&self, n_partitions: usize) -> Vec> { let mask = (n_partitions as u64) - 1; let mut by_part: Vec> = vec![Vec::new(); n_partitions]; @@ -133,22 +128,83 @@ impl QueryBatch { // ── Per-sequence accumulator ────────────────────────────────────────────────── struct SeqAcc { - kmer_count: u32, - kmer_missing: u32, - /// Per-genome accumulated count (count mode) or presence sum (presence mode). + kmer_count: u32, + kmer_missing: u32, genome_totals: Vec, } impl SeqAcc { fn new(n_genomes: usize) -> Self { Self { - kmer_count: 0, - kmer_missing: 0, + kmer_count: 0, + kmer_missing: 0, genome_totals: vec![0u32; n_genomes], } } } +// ── Findere z-window filter ─────────────────────────────────────────────────── + +/// Apply the Findere z-window filter to per-kmer query results for one superkmer. +/// +/// A k-mer at position i for genome g is confirmed only if it belongs to at least +/// one run of z consecutive positions where all k-mers are present for g. +/// Unconfirmed positions are zeroed; positions whose entire row becomes zero are +/// returned as `None`. +/// +/// When z <= 1 or the superkmer is shorter than z k-mers, results are returned +/// unchanged (short superkmers cannot satisfy the z-window constraint). +fn apply_findere( + results: &[Option>], + z: usize, + n_genomes: usize, +) -> Vec>> { + let n = results.len(); + if z <= 1 || n < z { + return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect(); + } + + let mut confirmed = vec![vec![false; n_genomes]; n]; + + for g in 0..n_genomes { + let present: Vec = results + .iter() + .map(|r| r.as_ref().map_or(false, |row| row[g] > 0)) + .collect(); + + let mut window_count = present[..z].iter().filter(|&&p| p).count(); + if window_count == z { + for c in confirmed[..z].iter_mut() { + c[g] = true; + } + } + + for j in 1..=(n - z) { + if present[j - 1] { window_count -= 1; } + if present[j + z - 1] { window_count += 1; } + if window_count == z { + for c in confirmed[j..j + z].iter_mut() { + c[g] = true; + } + } + } + } + + results.iter().enumerate().map(|(i, opt)| { + let row = opt.as_ref()?; + let mut new_row: Box<[u32]> = row.clone(); + let mut any = false; + for g in 0..n_genomes { + if !confirmed[i][g] { + new_row[g] = 0; + } else { + any = true; + } + } + if any { Some(new_row) } else { None } + }).collect() +} + // ── Entry point ─────────────────────────────────────────────────────────────── pub fn run(args: QueryArgs) { @@ -165,17 +221,22 @@ pub fn run(args: QueryArgs) { let n_partitions = idx.n_partitions(); let with_counts = idx.meta().config.with_counts; + let effective_z: usize = args.findere_z.unwrap_or_else(|| { + match idx.meta().config.evidence { + IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize, + IndexMode::Exact => 1, + } + }); + info!( - "query: k={k}, {} genome(s), with_counts={with_counts}, mismatch={}, detail={}", + "query: k={k}, {} genome(s), with_counts={with_counts}, z={effective_z}, \ + mismatch={}, detail={}", n_genomes, args.mismatch, args.detail ); if args.mismatch { eprintln!("warning: --mismatch not yet implemented, ignored"); } - if args.detail { - eprintln!("warning: --detail not yet implemented, ignored"); - } let paths: Vec = args.inputs.iter().map(PathBuf::from).collect(); let mut out = BufWriter::new(io::stdout()); @@ -198,10 +259,20 @@ pub fn run(args: QueryArgs) { continue; } - let batch = QueryBatch::from_records(records, k, 6, 0.7); + let batch = QueryBatch::from_records(records, k, 6, 0.7); let n_seqs = batch.ids.len(); - let mut accs: Vec = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + let mut accs: Vec = + (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + + // [seq_idx][genome_idx][kmer_position] — allocated only with --detail + let mut cov: Vec>> = if args.detail { + batch.n_kmers.iter() + .map(|&n| vec![vec![0u32; n as usize]; n_genomes]) + .collect() + } else { + Vec::new() + }; let by_part = batch.split_by_partition(n_partitions); @@ -218,24 +289,40 @@ pub fn run(args: QueryArgs) { std::process::exit(1); }); - let presence = args.force_presence || !with_counts; - let threshold = args.presence_threshold; + let presence = args.force_presence || !with_counts; + let threshold = args.presence_threshold; for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { + let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes); let descs = batch.map.get(*rsk).expect("rsk must be in map"); + for desc in descs { let acc = &mut accs[desc.seq_idx as usize]; - for hit in sk_kmer_results.iter() { + + for (local_pos, hit) in filtered.iter().enumerate() { match hit { - None => acc.kmer_missing += 1, + None => { + // Only truly missing if the index also had no entry. + if sk_kmer_results[local_pos].is_none() { + acc.kmer_missing += 1; + } + } Some(row) => { acc.kmer_count += 1; for (g, &v) in row.iter().enumerate() { - acc.genome_totals[g] += if presence { + if v == 0 { + continue; + } + let contribution = if presence { u32::from(v >= threshold) } else { v }; + acc.genome_totals[g] += contribution; + if args.detail { + let abs_pos = desc.kmer_offset as usize + local_pos; + cov[desc.seq_idx as usize][g][abs_pos] += contribution; + } } } } @@ -244,7 +331,11 @@ pub fn run(args: QueryArgs) { } } - emit_batch(&batch, &accs, idx.meta(), args.count_missing, &mut out); + emit_batch( + &batch, &accs, idx.meta(), + args.count_missing, args.detail, &cov, + &mut out, + ); } } } @@ -252,11 +343,13 @@ pub fn run(args: QueryArgs) { // ── Output ──────────────────────────────────────────────────────────────────── fn emit_batch( - batch: &QueryBatch, - accs: &[SeqAcc], - meta: &obikindex::meta::IndexMeta, + batch: &QueryBatch, + accs: &[SeqAcc], + meta: &obikindex::meta::IndexMeta, count_missing: bool, - out: &mut impl Write, + detail: bool, + cov: &[Vec>], + out: &mut impl Write, ) { for (seq_idx, (id, seq)) in batch.ids.iter().zip(batch.seqs.iter()).enumerate() { let acc = &accs[seq_idx]; @@ -266,12 +359,23 @@ fn emit_batch( if count_missing { ann.insert("kmer_missing".into(), acc.kmer_missing.into()); } + let mut match_map = serde_json::Map::new(); for (g, genome) in meta.genomes.iter().enumerate() { match_map.insert(genome.label.clone(), acc.genome_totals[g].into()); } ann.insert("kmer_strict_matches".into(), match_map.into()); + if detail && !cov.is_empty() { + let mut cov_map = serde_json::Map::new(); + for (g, genome) in meta.genomes.iter().enumerate() { + let v: Vec = + cov[seq_idx][g].iter().map(|&x| x.into()).collect(); + cov_map.insert(genome.label.clone(), v.into()); + } + ann.insert("coverage".into(), cov_map.into()); + } + let ann_str = serde_json::to_string(&ann).unwrap_or_else(|_| "{}".to_string()); // OBITools4 FASTA format: >id {"key":value,...}