First implementation, but far from optimal
This commit is contained in:
@@ -0,0 +1,8 @@
|
||||
[package]
|
||||
name = "obifastwrite"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
xxhash-rust = { version = "0.8", features = ["xxh64"] }
|
||||
@@ -0,0 +1,263 @@
|
||||
//! FASTA serialisation of [`SuperKmer`] values.
|
||||
//!
|
||||
//! Two functions cover the two phases of the scatter pipeline:
|
||||
//!
|
||||
//! - [`write_scatter`]: scatter phase (before routing). The header annotation
//!   contains the minimizer sequence decoded from the 2-bit packed `min_hash`
//!   argument supplied by the caller.
|
||||
//!
|
||||
//! - [`write_count`]: count phase (after deduplication). The header annotation
|
||||
//! contains the occurrence count from [`SuperKmer::count`].
|
||||
//!
|
||||
//! Both functions write standard OBITools-compatible FASTA:
|
||||
//!
|
||||
//! ```text
|
||||
//! >ID {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
|
||||
//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
|
||||
//! ```
|
||||
//!
|
||||
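//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
//! a 16-digit uppercase hexadecimal string. xxHash-64 is a fast,
//! non-cryptographic hash: two given distinct super-kmers share an identifier
//! with probability about 2^-64 (~5e-20), which is ample for debugging
//! identifiers. By the birthday bound, however, a collision somewhere among
//! billions of distinct super-kmers is not negligible, so identifiers must not
//! be treated as unique keys.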
|
||||
//!
|
||||
//! # Phase contract
|
||||
//!
|
||||
//! `write_scatter` is intended for super-kmers that are still in the scatter
//! phase, i.e. **before** [`SuperKmer::init_count`] is called; the minimizer it
//! prints is decoded from the `min_hash` argument rather than read from the
//! super-kmer. `write_count` reads [`SuperKmer::count`], which is only
//! meaningful **after** `init_count`.
//! Mixing the two functions in the wrong phase produces silently wrong output;
//! this is enforced by pipeline structure, not by the type system.
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
// ── public API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Write one super-kmer in FASTA format — **scatter phase**.
|
||||
///
|
||||
/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
|
||||
/// the minimizer, decoded from [`SuperKmer::minimizer_pos`] (scatter-phase
|
||||
/// value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in scatter phase)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size used to build `sk`
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index computed from the minimizer hash
|
||||
pub fn write_scatter<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
k: usize,
|
||||
m: usize,
|
||||
partition: u32,
|
||||
min_hash: u64,
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let min_seq = decode_mmer(min_hash, m);
|
||||
|
||||
writeln!(
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"minimizer\":\"{min}\"}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
min = std::str::from_utf8(&min_seq).unwrap(),
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
}
|
||||
|
||||
/// Decode a right-aligned 2-bit minimizer value into uppercase ASCII (A/C/G/T).
///
/// The `m` two-bit codes are read from `val` with the first base in the most
/// significant position and the last base in the two lowest bits, using the
/// mapping 0 → `A`, 1 → `C`, 2 → `G`, 3 → `T`.
///
/// A `u64` can only carry 32 bases. When `m` exceeds that, the leading
/// positions that lie beyond bit 63 are treated as zero bits and decode to
/// `A`, instead of overflowing the shift (a panic in debug builds).
/// `m == 0` yields an empty vector.
fn decode_mmer(val: u64, m: usize) -> Vec<u8> {
    const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
    // Number of 2-bit codes that fit in a u64.
    const MAX_BASES: usize = 32;

    (0..m)
        .rev()
        .map(|pos| {
            // `pos` counts bases from the least significant end of `val`.
            let code = if pos < MAX_BASES {
                (val >> (2 * pos)) & 3
            } else {
                0
            };
            BASES[code as usize]
        })
        .collect()
}
|
||||
|
||||
/// Write one super-kmer in FASTA format — **count phase**.
|
||||
///
|
||||
/// The `count` field in the JSON annotation contains the occurrence count from
|
||||
/// [`SuperKmer::count`] (count-phase value of the payload field).
|
||||
///
|
||||
/// # Parameters
|
||||
/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
|
||||
/// [`SuperKmer::init_count`] has been called)
|
||||
/// - `out`: destination writer
|
||||
/// - `k`: k-mer size
|
||||
/// - `m`: minimizer size
|
||||
/// - `partition`: partition index
|
||||
pub fn write_count<W: Write>(
|
||||
sk: &SuperKmer,
|
||||
out: &mut W,
|
||||
k: usize,
|
||||
m: usize,
|
||||
partition: u32,
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
let seq_len = ascii.len();
|
||||
let count = sk.count();
|
||||
|
||||
writeln!(
|
||||
out,
|
||||
">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
|
||||
\"minimizer_size\":{m},\"partition\":{partition},\
|
||||
\"count\":{count}}}",
|
||||
id = id,
|
||||
seq_len = seq_len,
|
||||
k = k,
|
||||
m = m,
|
||||
partition = partition,
|
||||
count = count,
|
||||
)?;
|
||||
out.write_all(&ascii)?;
|
||||
out.write_all(b"\n")
|
||||
}
|
||||
|
||||
// ── internal helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
/// xxHash-64 of the ASCII sequence, formatted as 16 uppercase hex digits.
|
||||
fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use obikseq::superkmer::SuperKmer;

    /// Build a super-kmer from an ASCII nucleotide string.
    fn make(seq: &[u8]) -> SuperKmer {
        SuperKmer::from_ascii(seq)
    }

    /// Run `f` against an in-memory buffer and return everything it wrote.
    fn capture<F>(f: F) -> String
    where
        F: Fn(&mut Vec<u8>) -> io::Result<()>,
    {
        let mut sink = Vec::new();
        f(&mut sink).unwrap();
        String::from_utf8(sink).unwrap()
    }

    /// Extract the record identifier (header token without the leading `>`).
    fn record_id(record: &str) -> String {
        let header = record.lines().next().unwrap();
        let token = header.split_whitespace().next().unwrap();
        token[1..].to_string()
    }

    // ── write_scatter ─────────────────────────────────────────────────────────

    #[test]
    fn scatter_header_contains_minimizer_field() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(2);

        let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, 0));

        assert!(out.contains("\"minimizer\":\""));
        assert!(!out.contains("\"count\":"));
    }

    #[test]
    fn scatter_minimizer_decoded_from_hash() {
        // min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(0);

        let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, 6));

        assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
    }

    #[test]
    fn scatter_fields_present() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(0);

        let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, 0));

        for field in [
            "\"seq_length\":12",
            "\"kmer_size\":4",
            "\"minimizer_size\":3",
            "\"partition\":5",
        ] {
            assert!(out.contains(field));
        }
    }

    #[test]
    fn scatter_sequence_line_correct() {
        let mut sk = make(b"ACGTACGT");
        sk.set_minimizer_pos(0);

        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));

        assert_eq!(out.lines().nth(1), Some("ACGTACGT"));
    }

    // ── write_count ───────────────────────────────────────────────────────────

    #[test]
    fn count_header_contains_count_field() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.init_count();
        sk.add(49);

        let out = capture(|w| write_count(&sk, w, 4, 3, 2));

        assert!(out.contains("\"count\":50"));
        assert!(!out.contains("\"minimizer\":"));
    }

    #[test]
    fn count_fields_present() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.init_count();

        let out = capture(|w| write_count(&sk, w, 4, 3, 9));

        for field in [
            "\"seq_length\":12",
            "\"kmer_size\":4",
            "\"minimizer_size\":3",
            "\"partition\":9",
            "\"count\":1",
        ] {
            assert!(out.contains(field));
        }
    }

    #[test]
    fn count_sequence_line_correct() {
        let mut sk = make(b"TTTTACGT");
        sk.init_count();

        let out = capture(|w| write_count(&sk, w, 4, 2, 0));

        assert_eq!(out.lines().nth(1), Some("TTTTACGT"));
    }

    // ── ID stability ──────────────────────────────────────────────────────────

    #[test]
    fn same_sequence_same_id() {
        let mut sk1 = make(b"ACGTACGT");
        sk1.set_minimizer_pos(0);
        let mut sk2 = make(b"ACGTACGT");
        sk2.set_minimizer_pos(4); // different pos, same sequence

        let id1 = record_id(&capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0)));
        let id2 = record_id(&capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0)));

        assert_eq!(id1, id2, "same sequence must produce same ID");
    }

    #[test]
    fn different_sequences_different_id() {
        let mut sk1 = make(b"ACGTACGT");
        sk1.set_minimizer_pos(0);
        let mut sk2 = make(b"TTTTTTTT");
        sk2.set_minimizer_pos(0);

        let id1 = record_id(&capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0)));
        let id2 = record_id(&capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0)));

        assert_ne!(id1, id2);
    }

    #[test]
    fn id_is_16_hex_digits() {
        let mut sk = make(b"ACGTACGT");
        sk.set_minimizer_pos(0);

        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));
        let header = out.lines().next().unwrap();
        let id = &header[1..17]; // skip '>'

        assert_eq!(id.len(), 16);
        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
    }
}
|
||||
Reference in New Issue
Block a user