feat: Implement query subcommand for sequence-to-genome mapping

This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates.
This commit is contained in:
Eric Coissac
2026-05-21 13:23:05 +02:00
parent c8e591fc78
commit 13599dd444
13 changed files with 762 additions and 19 deletions
+1
View File
@@ -12,6 +12,7 @@ mod mimetype;
pub mod normalize;
mod path_iterator;
pub mod peakreader;
pub mod record;
pub mod xopen;
pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks,
+222
View File
@@ -0,0 +1,222 @@
//! Per-sequence record parser for FASTA and FASTQ chunks.
//!
//! Same automaton structure as `normalize.rs` — only the actions differ:
//! instead of writing into a single flat rope, we accumulate per-sequence
//! data (id, raw ASCII, normalised ACGT\x00 rope).
use obikrope::{ForwardCursor, Rope, RopeCursor};
/// A single sequence entry parsed out of a FASTA or FASTQ chunk.
pub struct SeqRecord {
    /// Identifier: the header text up to, but not including, the first
    /// space or tab.
    pub id: String,
    /// Sequence bytes with line breaks removed. A/C/G/T are stored in upper
    /// case by the parser; every other byte is kept exactly as it was read.
    /// Intended to be reproduced in query output.
    pub sequence: Vec<u8>,
    /// Normalised view of the sequence: runs of uppercase ACGT of length ≥ k,
    /// each one followed by a `\x00` byte. Left empty when no run is long
    /// enough. Ready for `SuperKmerIter`.
    pub normalized: Rope,
}
/// Split a FASTA or FASTQ chunk rope into its individual sequence records.
///
/// The parser is selected from the rope's mime type (`text/fasta` or
/// `text/fastq`). Any other mime type, or none at all, yields an empty vector.
/// `k` is the minimum length an ACGT run must reach to be kept in each
/// record's normalised rope.
pub fn parse_chunk(rope: &Rope, k: usize) -> Vec<SeqRecord> {
    let cursor = rope.fw_cursor();
    let mime = rope.mime_type();
    match mime {
        Some("text/fastq") => parse_fastq(cursor, k),
        Some("text/fasta") => parse_fasta(cursor, k),
        _ => Vec::new(),
    }
}
// ── Shared state accumulated while scanning one sequence ──────────────────────
/// Scratch state for the record currently being scanned.
struct RecordBuilder {
    /// Identifier of the record being built.
    id: String,
    /// Sequence bytes as accumulated by the parser, without newlines.
    sequence: Vec<u8>,
    /// Normalised buffer: closed ACGT segments, each terminated by `\x00`,
    /// followed by the segment that is still open.
    norm: Vec<u8>,
    /// Offset in `norm` at which the currently open segment begins.
    seg_start: usize,
    /// Minimum length a segment must reach to be kept.
    k: usize,
}
impl RecordBuilder {
fn new(k: usize) -> Self {
Self { id: String::new(), sequence: Vec::new(), norm: Vec::new(), seg_start: 0, k }
}
fn reset(&mut self, id: String) {
self.id = id;
self.sequence.clear();
self.norm.clear();
self.seg_start = 0;
}
/// Push one accepted ACGT byte.
fn push_acgt(&mut self, b: u8) {
self.sequence.push(b);
self.norm.push(b);
}
/// Push one non-ACGT byte to the raw sequence only (not to the norm buffer).
fn push_raw(&mut self, b: u8) {
self.sequence.push(b);
}
/// Close the current ACGT segment (same logic as `end_segment` in normalize.rs).
fn end_segment(&mut self) {
if self.norm.len() - self.seg_start >= self.k {
self.norm.push(0x00);
self.seg_start = self.norm.len();
} else {
self.norm.truncate(self.seg_start);
}
}
/// Consume into a SeqRecord. Closes any open segment first.
fn finish(mut self) -> Option<SeqRecord> {
self.end_segment();
if self.id.is_empty() {
return None;
}
let mut rope = Rope::new(None);
if !self.norm.is_empty() {
rope.push(self.norm);
}
Some(SeqRecord { id: self.id, sequence: self.sequence, normalized: rope })
}
}
// ── FASTA automaton ───────────────────────────────────────────────────────────
/// Parse every FASTA record reachable from `cursor`.
///
/// Bytes before the first `>` are skipped. Each header is read with
/// `read_id`; the lines that follow are concatenated (line breaks dropped)
/// until a `>` is met at the start of a line or the input ends. A/C/G/T in
/// either case are upper-cased and pushed to both the stored sequence and the
/// normalised buffer; any other byte is kept in the stored sequence only and
/// closes the current normalised segment.
///
/// A `>` counts as a record start only at the beginning of a line, and the
/// position right after a header line is such a beginning. This matters for
/// records without any sequence line (`>a\n>b\n...`): the newline that
/// precedes the second `>` has already been consumed while reading the first
/// header, so the `>` must be recognised without having seen a newline here.
///
/// Records whose id is empty are dropped by `RecordBuilder::finish`. A read
/// error is handled like end of input. `k` is the minimum length of a kept
/// ACGT segment.
fn parse_fasta(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
    let mut records: Vec<SeqRecord> = Vec::new();

    // Skip everything up to (and including) the first '>'.
    loop {
        match cursor.read_next().ok() {
            None => return records,
            Some(b'>') => break,
            Some(_) => {}
        }
    }

    let mut builder = RecordBuilder::new(k);
    builder.reset(read_id(&cursor));
    // True while the next byte to be read is the first byte of a line.
    // `read_id` stops at a line terminator or at end of input, so this holds
    // right after every header.
    let mut at_line_start = true;

    loop {
        match cursor.read_next().ok() {
            None => {
                // End of input: `finish` closes the open segment itself.
                records.extend(builder.finish());
                return records;
            }
            // Line breaks never end a segment: sequence lines are joined.
            Some(b'\n') | Some(b'\r') => at_line_start = true,
            Some(b'>') if at_line_start => {
                // Emit the record that just ended and start the next one.
                let finished = std::mem::replace(&mut builder, RecordBuilder::new(k));
                records.extend(finished.finish());
                builder.reset(read_id(&cursor));
                // Still at a line start: `read_id` consumed the header line.
            }
            Some(b) => {
                at_line_start = false;
                // Clearing bit 0x20 upper-cases ASCII letters.
                let upper = b & !0x20u8;
                if matches!(upper, b'A' | b'C' | b'G' | b'T') {
                    builder.push_acgt(upper);
                } else {
                    builder.push_raw(b);
                    builder.end_segment();
                }
            }
        }
    }
}
// ── FASTQ automaton ───────────────────────────────────────────────────────────
/// Parse every FASTQ record reachable from `cursor`.
///
/// Each record is expected on four lines: `@` header, sequence, `+` line,
/// quality. Bytes are skipped until an `@` is found, the header is read with
/// `read_id`, the sequence line is scanned byte by byte, and the two lines
/// that follow are skipped. A/C/G/T in either case are upper-cased and pushed
/// to both the stored sequence and the normalised buffer; any other byte is
/// kept in the stored sequence only and closes the current normalised segment.
///
/// The sequence line may end with `\n`, `\r\n` or a lone `\r`. The `\n` of a
/// `\r\n` pair is consumed here so that the two `skip_line` calls below skip
/// the `+` line and the quality line. Otherwise the first call would only eat
/// the leftover `\n`, and the quality line, which may legitimately contain
/// `@`, would be scanned for the next header.
///
/// Records whose id is empty are dropped by `RecordBuilder::finish`. A read
/// error is handled like end of input. `k` is the minimum length of a kept
/// ACGT segment.
fn parse_fastq(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
    let mut records: Vec<SeqRecord> = Vec::new();
    loop {
        // Find the '@' that opens the next record.
        loop {
            match cursor.read_next().ok() {
                None => return records,
                Some(b'@') => break,
                Some(_) => {}
            }
        }

        let mut builder = RecordBuilder::new(k);
        builder.reset(read_id(&cursor));

        // Sequence line: stop at the line terminator; a non-ACGT byte closes
        // the current segment.
        loop {
            match cursor.read_next().ok() {
                None | Some(b'\n') => break,
                Some(b'\r') => {
                    // `read_ahead(1)` peeks the next unread byte, as it is
                    // used in `parse_fasta`; swallow the '\n' of a CRLF pair.
                    if matches!(cursor.read_ahead(1).ok(), Some(b'\n')) {
                        cursor.read_next().ok();
                    }
                    break;
                }
                Some(b) => {
                    // Clearing bit 0x20 upper-cases ASCII letters.
                    let upper = b & !0x20u8;
                    if matches!(upper, b'A' | b'C' | b'G' | b'T') {
                        builder.push_acgt(upper);
                    } else {
                        builder.push_raw(b);
                        builder.end_segment();
                    }
                }
            }
        }

        skip_line(&cursor); // '+' line
        skip_line(&cursor); // quality line

        // `finish` closes the last open segment before building the record.
        records.extend(builder.finish());
    }
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/// Read a record identifier and consume the rest of its header line.
///
/// The id is every byte up to the first space, tab or line terminator. When
/// the id is ended by a space or tab, the remainder of the line (the
/// description) is skipped with `skip_line`. Invalid UTF-8 is replaced
/// lossily. A read error is handled like end of input.
///
/// On return the cursor sits on the first byte of the next line. This also
/// holds for `\r\n` line endings: the `\n` that follows a terminating `\r`
/// is consumed here, so callers never see a stray line break that they would
/// mistake for an empty line.
fn read_id(cursor: &ForwardCursor<'_>) -> String {
    let mut id = Vec::new();
    loop {
        match cursor.read_next().ok() {
            None | Some(b'\n') => break,
            Some(b'\r') => {
                // `read_ahead(1)` peeks the next unread byte, as it is used
                // in `parse_fasta`; swallow the '\n' of a CRLF pair.
                if matches!(cursor.read_ahead(1).ok(), Some(b'\n')) {
                    cursor.read_next().ok();
                }
                break;
            }
            Some(b' ') | Some(b'\t') => {
                // The description after the id is not kept.
                skip_line(cursor);
                break;
            }
            Some(b) => id.push(b),
        }
    }
    String::from_utf8_lossy(&id).into_owned()
}
/// Advance `cursor` past the next `\n`, or to the point where reading fails
/// (end of input or error), whichever comes first.
fn skip_line(cursor: &ForwardCursor<'_>) {
    loop {
        match cursor.read_next() {
            Ok(b'\n') | Err(_) => return,
            Ok(_) => {}
        }
    }
}