First implementation, but far from optimal

This commit is contained in:
Eric Coissac
2026-04-16 22:38:20 +02:00
commit de3f9b16cf
19336 changed files with 380276 additions and 0 deletions
+8
View File
@@ -0,0 +1,8 @@
# Manifest of the `obifastwrite` crate.
[package]
name = "obifastwrite"
version = "0.1.0"
edition = "2024"
[dependencies]
# Path dependency on the sibling `obikseq` crate.
obikseq = { path = "../obikseq" }
# xxHash with only the `xxh64` feature enabled.
xxhash-rust = { version = "0.8", features = ["xxh64"] }
+263
View File
@@ -0,0 +1,263 @@
//! FASTA serialisation of [`SuperKmer`] values.
//!
//! Two functions cover the two phases of the scatter pipeline:
//!
//! - [`write_scatter`]: scatter phase (before routing). The header annotation
//!   contains the minimizer sequence decoded from the 2-bit packed minimizer
//!   value passed by the caller as `min_hash`.
//!
//! - [`write_count`]: count phase (after deduplication). The header annotation
//! contains the occurrence count from [`SuperKmer::count`].
//!
//! Both functions write standard OBITools-compatible FASTA:
//!
//! ```text
//! >ID {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
//! ```
//!
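//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
//! a 16-digit uppercase hexadecimal string. A 64-bit hash is adequate for
//! debugging identifiers, but IDs are not guaranteed to be unique: by the
//! birthday bound, the probability of at least one collision among `n`
//! distinct super-kmers is roughly `n² / 2⁶⁵` (about 3e-8 for a million
//! sequences, about 3 % for a billion).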
//!
//! # Phase contract
//!
//! `write_scatter` reads [`SuperKmer::minimizer_pos`], which is only valid
//! **before** [`SuperKmer::init_count`] is called. `write_count` reads
//! [`SuperKmer::count`], which is only meaningful **after** `init_count`.
//! Mixing the two functions in the wrong phase produces silently wrong output;
//! this is enforced by pipeline structure, not by the type system.
#![deny(missing_docs)]
use std::io::{self, Write};
use obikseq::superkmer::SuperKmer;
use xxhash_rust::xxh64::xxh64;
// ── public API ────────────────────────────────────────────────────────────────
/// Write one super-kmer in FASTA format — **scatter phase**.
///
/// The record is two lines: a header `>ID {json}` followed by the ASCII
/// sequence. The `minimizer` field of the JSON annotation holds the ASCII
/// sequence of the minimizer, decoded from `min_hash`. The minimizer is
/// supplied by the caller; this function only calls
/// [`SuperKmer::to_ascii`] on `sk`.
///
/// # Parameters
/// - `sk`: the super-kmer to serialise (expected to be in scatter phase)
/// - `out`: destination writer
/// - `k`: k-mer size used to build `sk`
/// - `m`: minimizer size, i.e. the number of bases decoded from `min_hash`
/// - `partition`: partition index computed from the minimizer hash
/// - `min_hash`: minimizer as a right-aligned 2-bit packed value
///   (A=0, C=1, G=2, T=3, first base in the most significant pair); despite
///   its name it is decoded as a sequence, not treated as an opaque hash
///
/// # Errors
/// Returns any I/O error reported by `out`.
pub fn write_scatter<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
    k: usize,
    m: usize,
    partition: u32,
    min_hash: u64,
) -> io::Result<()> {
    let ascii = sk.to_ascii();
    let min_seq = decode_mmer(min_hash, m);
    let minimizer =
        std::str::from_utf8(&min_seq).expect("decode_mmer only emits ASCII A/C/G/T bytes");
    writeln!(
        out,
        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
         \"minimizer_size\":{m},\"partition\":{partition},\
         \"minimizer\":\"{min}\"}}",
        id = seq_id(&ascii),
        seq_len = ascii.len(),
        min = minimizer,
    )?;
    out.write_all(&ascii)?;
    out.write_all(b"\n")
}
/// Decode a right-aligned 2-bit minimizer value into uppercase ASCII (A/C/G/T).
///
/// The `m` bases are read from the low `2 * m` bits of `val`, most significant
/// pair first, using the encoding A=0, C=1, G=2, T=3.
///
/// A `u64` holds at most 32 bases. For `m > 32`, the positions that lie above
/// bit 63 are treated as zero bits and therefore decode to `A`, rather than
/// overflowing the shift (a panic in debug builds, a wrapped shift amount in
/// release builds).
fn decode_mmer(val: u64, m: usize) -> Vec<u8> {
    const BASES: [u8; 4] = *b"ACGT";
    /// Number of 2-bit bases that fit in a `u64`.
    const MAX_BASES: usize = (u64::BITS / 2) as usize;
    (0..m)
        .rev()
        .map(|pos| {
            // `pos` is the base index counted from the least significant pair.
            let code = if pos < MAX_BASES { val >> (2 * pos) } else { 0 };
            BASES[(code & 3) as usize]
        })
        .collect()
}
/// Write one super-kmer in FASTA format — **count phase**.
///
/// The record is two lines: a header `>ID {json}` followed by the ASCII
/// sequence. The `count` field of the JSON annotation is the value returned
/// by [`SuperKmer::count`] (count-phase value of the payload field).
///
/// # Parameters
/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
///   [`SuperKmer::init_count`] has been called)
/// - `out`: destination writer
/// - `k`: k-mer size
/// - `m`: minimizer size
/// - `partition`: partition index
///
/// # Errors
/// Returns any I/O error reported by `out`.
pub fn write_count<W: Write>(
    sk: &SuperKmer,
    out: &mut W,
    k: usize,
    m: usize,
    partition: u32,
) -> io::Result<()> {
    let sequence = sk.to_ascii();
    // Header line: identifier followed by the JSON annotation.
    writeln!(
        out,
        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
         \"minimizer_size\":{m},\"partition\":{partition},\
         \"count\":{count}}}",
        id = seq_id(&sequence),
        seq_len = sequence.len(),
        count = sk.count(),
    )?;
    // Sequence line.
    out.write_all(&sequence)?;
    out.write_all(b"\n")
}
// ── internal helpers ──────────────────────────────────────────────────────────
/// Build the record identifier for a sequence: its xxHash-64 (seed 0),
/// rendered as 16 zero-padded uppercase hexadecimal digits.
fn seq_id(ascii: &[u8]) -> String {
    let digest: u64 = xxh64(ascii, 0);
    format!("{digest:016X}")
}
// ── tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use obikseq::superkmer::SuperKmer;

    /// Build a super-kmer from an ASCII nucleotide sequence.
    fn make(seq: &[u8]) -> SuperKmer {
        SuperKmer::from_ascii(seq)
    }

    /// Run `f` against an in-memory buffer and return what it wrote as text.
    fn capture<F: Fn(&mut Vec<u8>) -> io::Result<()>>(f: F) -> String {
        let mut buf = Vec::new();
        f(&mut buf).unwrap();
        String::from_utf8(buf).unwrap()
    }

    /// Extract the record identifier from FASTA output: the first
    /// whitespace-delimited token of the header line, without the leading `>`.
    fn record_id(fasta: &str) -> &str {
        let header = fasta.lines().next().expect("output has a header line");
        let token = header
            .split_whitespace()
            .next()
            .expect("header has an identifier token");
        token.strip_prefix('>').expect("header starts with '>'")
    }

    // ── write_scatter ─────────────────────────────────────────────────────────

    #[test]
    fn scatter_header_contains_minimizer_field() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(2);
        let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, 0));
        assert!(out.contains("\"minimizer\":\""));
        assert!(!out.contains("\"count\":"));
    }

    #[test]
    fn scatter_minimizer_decoded_from_hash() {
        // min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(0);
        let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, 6));
        assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
    }

    #[test]
    fn scatter_fields_present() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.set_minimizer_pos(0);
        let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, 0));
        assert!(out.contains("\"seq_length\":12"));
        assert!(out.contains("\"kmer_size\":4"));
        assert!(out.contains("\"minimizer_size\":3"));
        assert!(out.contains("\"partition\":5"));
    }

    #[test]
    fn scatter_sequence_line_correct() {
        let mut sk = make(b"ACGTACGT");
        sk.set_minimizer_pos(0);
        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));
        let lines: Vec<&str> = out.lines().collect();
        assert_eq!(lines[1], "ACGTACGT");
    }

    // ── write_count ───────────────────────────────────────────────────────────

    #[test]
    fn count_header_contains_count_field() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.init_count();
        sk.add(49);
        let out = capture(|w| write_count(&sk, w, 4, 3, 2));
        assert!(out.contains("\"count\":50"));
        assert!(!out.contains("\"minimizer\":"));
    }

    #[test]
    fn count_fields_present() {
        let mut sk = make(b"ACGTACGTACGT");
        sk.init_count();
        let out = capture(|w| write_count(&sk, w, 4, 3, 9));
        assert!(out.contains("\"seq_length\":12"));
        assert!(out.contains("\"kmer_size\":4"));
        assert!(out.contains("\"minimizer_size\":3"));
        assert!(out.contains("\"partition\":9"));
        assert!(out.contains("\"count\":1"));
    }

    #[test]
    fn count_sequence_line_correct() {
        let mut sk = make(b"TTTTACGT");
        sk.init_count();
        let out = capture(|w| write_count(&sk, w, 4, 2, 0));
        let lines: Vec<&str> = out.lines().collect();
        assert_eq!(lines[1], "TTTTACGT");
    }

    // ── ID stability ──────────────────────────────────────────────────────────

    #[test]
    fn same_sequence_same_id() {
        let mut sk1 = make(b"ACGTACGT");
        sk1.set_minimizer_pos(0);
        let mut sk2 = make(b"ACGTACGT");
        sk2.set_minimizer_pos(4); // different pos, same sequence
        let out1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0));
        let out2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0));
        assert_eq!(
            record_id(&out1),
            record_id(&out2),
            "same sequence must produce same ID"
        );
    }

    #[test]
    fn different_sequences_different_id() {
        let mut sk1 = make(b"ACGTACGT");
        sk1.set_minimizer_pos(0);
        let mut sk2 = make(b"TTTTTTTT");
        sk2.set_minimizer_pos(0);
        let out1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0));
        let out2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0));
        assert_ne!(record_id(&out1), record_id(&out2));
    }

    #[test]
    fn id_is_16_hex_digits() {
        let mut sk = make(b"ACGTACGT");
        sk.set_minimizer_pos(0);
        let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));
        // Take the whole identifier token rather than a fixed-width slice, so
        // the length assertion actually checks the width of the ID.
        let id = record_id(&out);
        assert_eq!(id.len(), 16, "got: {id}");
        // The documented format is *uppercase* hexadecimal.
        assert!(
            id.chars().all(|c| matches!(c, '0'..='9' | 'A'..='F')),
            "got: {id}"
        );
    }
}