From 708b0abf9bac746dbcd6e2b9c30483cd3d26d7ad Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sat, 30 May 2026 07:16:23 +0200 Subject: [PATCH] refactor: aggregate query results at sequence level Refactor the query pipeline to buffer partition outputs into a per-sequence `seq_results` vector, deferring final accumulation until all partitions complete. This ensures global position ordering before computing k-mer presence, counts, and coverage statistics. Additionally, removes a resolved TODO and documents a known BLAST false-positive issue where chloroplast and bacterial contaminants yield unrealistic high-confidence matches. --- TODO.md | 34 +++++++++++++ src/obikmer/src/cmd/query.rs | 93 +++++++++++++++++++++--------------- 2 files changed, 89 insertions(+), 38 deletions(-) diff --git a/TODO.md b/TODO.md index ceca1fe..9be40e6 100644 --- a/TODO.md +++ b/TODO.md @@ -24,3 +24,37 @@ Sauf qu'avec un index approximatif, les résultats seront approximatifs. --detail et --mismatch à implementer - status : affiche le statut de l'index + +## Problème biologique sur l'identification des contaminants + +Exemple de reads problématiques: +``` +>LH00534:161:22WMGWLT4:4:1101:45301:1420 {"coverage":{"gbbct":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]},"kmer_count":117,"kmer_strict_matches":{"gbbct":117}} +GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGATTTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACGCTTGCATCCTCTGTATTACCGCGGCTGCTGG +``` + +Par blast match une quantité invréssemblable de genomes chloroplastique avec un match de 100% (6554 hits pour Streptophyta) + +mais aussi une quantité de sequences importantes à des OTU bactériennes (uncutured bacteria 115 hits) aussi avec 100% de similarité. + +``` + Uncultured bacterium clone Otu01032 16S ribosomal RNA gene, partial sequence +Sequence ID: KX996137.1Length: 440Number of Matches: 1 +Range 1: 153 to 303GenBankGraphics +Next Match +Previous Match +Alignment statistics for match #1 Score Expect Identities Gaps Strand +273 bits(302) 2e-69 151/151(100%) 0/151(0%) Plus/Minus + +Query 1 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 60 + |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| +Sbjct 303 GCCCCTACCGTACTCCAGCTTGGTAGTTTCCACCGCCTGTCCAGGGTTGAGCCCTGGGAT 244 + +Query 61 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 120 + |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| +Sbjct 243 TTGACGGCGGACTTAAAAAGCCACCTACAGACGCTTTACGCCCAATCATTCCGGATAACG 184 + +Query 121 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 151 + ||||||||||||||||||||||||||||||| +Sbjct 183 CTTGCATCCTCTGTATTACCGCGGCTGCTGG 153 +``` diff --git a/src/obikmer/src/cmd/query.rs b/src/obikmer/src/cmd/query.rs index 2af272e..3c0321b 100644 --- a/src/obikmer/src/cmd/query.rs +++ b/src/obikmer/src/cmd/query.rs @@ -230,16 +230,11 @@ fn process_chunk( let batch = QueryBatch::from_records(records, k, 6, 0.7); let n_seqs = batch.ids.len(); - let mut accs: Vec = - (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); - - let mut cov: Vec>> = if detail { - batch.n_kmers.iter() - .map(|&n| vec![vec![0u32; n as usize]; n_genomes]) - .collect() - } else { - Vec::new() - }; + // Per-sequence s-mer result vectors in global sequence position order. + // All partitions fill into this structure before Findere is applied. + let mut seq_results: Vec>>> = batch.n_kmers.iter() + .map(|&n| vec![None; n as usize]) + .collect(); let by_part = batch.split_by_partition(n_partitions); @@ -256,38 +251,60 @@ fn process_chunk( std::process::exit(1); }); - let presence = force_presence || !with_counts; - let threshold = presence_threshold; - for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) { - let filtered = apply_findere(sk_kmer_results, effective_z, n_genomes); let descs = batch.map.get(*rsk).expect("rsk must be in map"); - for desc in descs { - let acc = &mut accs[desc.seq_idx as usize]; + let offset = desc.kmer_offset as usize; + let dst = &mut seq_results[desc.seq_idx as usize]; + for (j, hit) in sk_kmer_results.iter().enumerate() { + dst[offset + j] = hit.as_ref().map(|r| r.clone()); + } + } + } + } - for (local_pos, hit) in filtered.iter().enumerate() { - match hit { - None => { - if sk_kmer_results[local_pos].is_none() { - acc.kmer_missing += 1; - } - } - Some(row) => { - acc.kmer_count += 1; - for (g, &v) in row.iter().enumerate() { - if v == 0 { continue; } - let contribution = if presence { - u32::from(v >= threshold) - } else { - v - }; - acc.genome_totals[g] += contribution; - if detail { - let abs_pos = desc.kmer_offset as usize + local_pos; - cov[desc.seq_idx as usize][g][abs_pos] += contribution; - } - } + // Apply Findere on each complete sequence vector, then accumulate. + let n_kmers_out: Vec = batch.n_kmers.iter() + .map(|&n| { let n = n as usize; if n >= effective_z { n - effective_z + 1 } else { 0 } }) + .collect(); + + let mut accs: Vec = + (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect(); + + let mut cov: Vec>> = if detail { + n_kmers_out.iter() + .map(|&n| vec![vec![0u32; n]; n_genomes]) + .collect() + } else { + Vec::new() + }; + + let presence = force_presence || !with_counts; + let threshold = presence_threshold; + + for seq_idx in 0..n_seqs { + let filtered = apply_findere(&seq_results[seq_idx], effective_z, n_genomes); + let acc = &mut accs[seq_idx]; + + for (pos, hit) in filtered.iter().enumerate() { + match hit { + None => { + if seq_results[seq_idx][pos].is_none() { + acc.kmer_missing += 1; + } + } + Some(row) => { + acc.kmer_count += 1; + for (g, &v) in row.iter().enumerate() { + if v == 0 { continue; } + let contribution = if presence { + u32::from(v >= threshold) + } else { + v + }; + acc.genome_totals[g] += contribution; + if detail { + cov[seq_idx][g][pos] += contribution; } } }