feat: Implement query subcommand for sequence-to-genome mapping
This change introduces the `query` CLI command and its supporting infrastructure for sequence-to-genome mapping and k-mer matching. It adds a `QueryLayer` abstraction backed by MPHF and persistent matrices, exposes the index partition for direct querying, and implements `Hash`/`Eq` for `RoutableSuperKmer`. The command ingests sequence batches, deduplicates superkmers, routes them to index partitions for parallel exact or 1-mismatch matching, and outputs results as FASTA records annotated with JSON metadata. Includes `serde_json` dependency addition, module exports, and documentation updates.
This commit is contained in:
@@ -12,6 +12,7 @@ mod mimetype;
|
||||
pub mod normalize;
|
||||
mod path_iterator;
|
||||
pub mod peakreader;
|
||||
pub mod record;
|
||||
pub mod xopen;
|
||||
|
||||
pub use chunk::{SeqChunkIter, fasta_chunks, fastq_chunks,
|
||||
|
||||
@@ -0,0 +1,222 @@
|
||||
//! Per-sequence record parser for FASTA and FASTQ chunks.
|
||||
//!
|
||||
//! Same automaton structure as `normalize.rs` — only the actions differ:
|
||||
//! instead of writing into a single flat rope, we accumulate per-sequence
|
||||
//! data (id, raw ASCII, normalised ACGT\x00 rope).
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
|
||||
/// One sequence record extracted from a FASTA or FASTQ chunk.
///
/// Built by the chunk parsers in this module; each field is filled while the
/// record's sequence bytes are scanned.
pub struct SeqRecord {
    /// Sequence identifier: the header text following the `>` / `@` marker,
    /// up to (not including) the first space, tab or end of line.
    pub id: String,
    /// Raw sequence bytes with line terminators stripped. Non-ACGT bytes are
    /// stored as read; `A`/`C`/`G`/`T` are stored upper-cased.
    /// Reproduced verbatim in query output.
    pub sequence: Vec<u8>,
    /// Per-sequence normalised rope: uppercase ACGT segments of length ≥ k,
    /// each terminated by `\x00`. Ready for `SuperKmerIter`.
    /// Left empty when no segment reaches length k.
    pub normalized: Rope,
}
|
||||
|
||||
/// Parse all records from a FASTA or FASTQ chunk rope.
|
||||
/// Returns an empty vec if the rope carries no recognised mime type.
|
||||
pub fn parse_chunk(rope: &Rope, k: usize) -> Vec<SeqRecord> {
|
||||
let cursor = rope.fw_cursor();
|
||||
match rope.mime_type() {
|
||||
Some("text/fasta") => parse_fasta(cursor, k),
|
||||
Some("text/fastq") => parse_fastq(cursor, k),
|
||||
_ => vec![],
|
||||
}
|
||||
}
|
||||
|
||||
// ── Shared state accumulated while scanning one sequence ──────────────────────
|
||||
|
||||
/// Mutable accumulator for the record currently being scanned.
///
/// Filled byte by byte by the FASTA/FASTQ automata and turned into a
/// `SeqRecord` by `finish`.
struct RecordBuilder {
    /// Identifier of the record being built; a builder whose id is empty
    /// produces no record.
    id: String,
    sequence: Vec<u8>, // raw ASCII, no newlines
    norm: Vec<u8>,     // ACGT\x00 segments being built
    seg_start: usize,  // index in norm where current segment started
    /// Minimum length a segment must reach to be kept in `norm`.
    k: usize,
}
|
||||
|
||||
impl RecordBuilder {
|
||||
fn new(k: usize) -> Self {
|
||||
Self { id: String::new(), sequence: Vec::new(), norm: Vec::new(), seg_start: 0, k }
|
||||
}
|
||||
|
||||
fn reset(&mut self, id: String) {
|
||||
self.id = id;
|
||||
self.sequence.clear();
|
||||
self.norm.clear();
|
||||
self.seg_start = 0;
|
||||
}
|
||||
|
||||
/// Push one accepted ACGT byte.
|
||||
fn push_acgt(&mut self, b: u8) {
|
||||
self.sequence.push(b);
|
||||
self.norm.push(b);
|
||||
}
|
||||
|
||||
/// Push one non-ACGT byte to the raw sequence only (not to the norm buffer).
|
||||
fn push_raw(&mut self, b: u8) {
|
||||
self.sequence.push(b);
|
||||
}
|
||||
|
||||
/// Close the current ACGT segment (same logic as `end_segment` in normalize.rs).
|
||||
fn end_segment(&mut self) {
|
||||
if self.norm.len() - self.seg_start >= self.k {
|
||||
self.norm.push(0x00);
|
||||
self.seg_start = self.norm.len();
|
||||
} else {
|
||||
self.norm.truncate(self.seg_start);
|
||||
}
|
||||
}
|
||||
|
||||
/// Consume into a SeqRecord. Closes any open segment first.
|
||||
fn finish(mut self) -> Option<SeqRecord> {
|
||||
self.end_segment();
|
||||
if self.id.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let mut rope = Rope::new(None);
|
||||
if !self.norm.is_empty() {
|
||||
rope.push(self.norm);
|
||||
}
|
||||
Some(SeqRecord { id: self.id, sequence: self.sequence, normalized: rope })
|
||||
}
|
||||
}
|
||||
|
||||
// ── FASTA automaton ───────────────────────────────────────────────────────────
|
||||
|
||||
fn parse_fasta(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
|
||||
let mut records: Vec<SeqRecord> = Vec::new();
|
||||
let mut builder = RecordBuilder::new(k);
|
||||
|
||||
// skip up to (and including) the first '>'
|
||||
loop {
|
||||
match cursor.read_next().ok() {
|
||||
None => return records,
|
||||
Some(b'>') => break,
|
||||
Some(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
// read first id — read_id already consumes the full header line
|
||||
builder.id = read_id(&cursor);
|
||||
|
||||
loop {
|
||||
match cursor.read_next().ok() {
|
||||
None => {
|
||||
// EOF — close final segment and emit
|
||||
if let Some(rec) = builder.finish() {
|
||||
records.push(rec);
|
||||
}
|
||||
return records;
|
||||
}
|
||||
Some(b'\n') | Some(b'\r') => {
|
||||
// peek: next non-empty char determines if new record starts
|
||||
match cursor.read_ahead(1).ok() {
|
||||
Some(b'>') => {
|
||||
// end of current record
|
||||
builder.end_segment();
|
||||
if let Some(rec) = builder.finish() {
|
||||
records.push(rec);
|
||||
}
|
||||
cursor.read_next().ok(); // consume '>'
|
||||
let id = read_id(&cursor); // already consumes header line
|
||||
builder = RecordBuilder::new(k);
|
||||
builder.reset(id);
|
||||
}
|
||||
None => {
|
||||
builder.end_segment();
|
||||
if let Some(rec) = builder.finish() {
|
||||
records.push(rec);
|
||||
}
|
||||
return records;
|
||||
}
|
||||
Some(_) => {} // continuation line — do nothing
|
||||
}
|
||||
}
|
||||
Some(b) => {
|
||||
let upper = b & !0x20u8;
|
||||
if matches!(upper, b'A' | b'C' | b'G' | b'T') {
|
||||
builder.push_acgt(upper);
|
||||
} else {
|
||||
builder.push_raw(b);
|
||||
builder.end_segment();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── FASTQ automaton ───────────────────────────────────────────────────────────
|
||||
|
||||
fn parse_fastq(cursor: ForwardCursor<'_>, k: usize) -> Vec<SeqRecord> {
|
||||
let mut records: Vec<SeqRecord> = Vec::new();
|
||||
|
||||
loop {
|
||||
// find '@'
|
||||
loop {
|
||||
match cursor.read_next().ok() {
|
||||
None => return records,
|
||||
Some(b'@') => break,
|
||||
Some(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
let mut builder = RecordBuilder::new(k);
|
||||
builder.id = read_id(&cursor); // already consumes the full header line
|
||||
|
||||
// sequence line — stop at newline, non-ACGT breaks segment
|
||||
loop {
|
||||
match cursor.read_next().ok() {
|
||||
None | Some(b'\n') | Some(b'\r') => {
|
||||
builder.end_segment();
|
||||
break;
|
||||
}
|
||||
Some(b) => {
|
||||
let upper = b & !0x20u8;
|
||||
if matches!(upper, b'A' | b'C' | b'G' | b'T') {
|
||||
builder.push_acgt(upper);
|
||||
} else {
|
||||
builder.push_raw(b);
|
||||
builder.end_segment();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
skip_line(&cursor); // '+' line
|
||||
skip_line(&cursor); // quality line
|
||||
|
||||
if let Some(rec) = builder.finish() {
|
||||
records.push(rec);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn read_id(cursor: &ForwardCursor<'_>) -> String {
|
||||
let mut id = Vec::new();
|
||||
loop {
|
||||
match cursor.read_next().ok() {
|
||||
None | Some(b'\n') | Some(b'\r') => break,
|
||||
Some(b' ') | Some(b'\t') => {
|
||||
skip_line(cursor);
|
||||
break;
|
||||
}
|
||||
Some(b) => id.push(b),
|
||||
}
|
||||
}
|
||||
String::from_utf8_lossy(&id).into_owned()
|
||||
}
|
||||
|
||||
fn skip_line(cursor: &ForwardCursor<'_>) {
|
||||
while let Some(c) = cursor.read_next().ok() {
|
||||
if c == b'\n' {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user