First implementation, but far from optimal

2026-04-16 22:38:20 +02:00
commit de3f9b16cf
19336 changed files with 380276 additions and 0 deletions
@@ -0,0 +1,8 @@
+[package]
+name = "obifastwrite"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+obikseq = { path = "../obikseq" }
+xxhash-rust = { version = "0.8", features = ["xxh64"] }
@@ -0,0 +1,263 @@
+//! FASTA serialisation of [`SuperKmer`] values.
+//!
+//! Two functions cover the two phases of the scatter pipeline:
+//!
+//! - [`write_scatter`]: scatter phase (before routing). The header annotation
+//!   contains the minimizer sequence decoded from the 2-bit packed `min_hash`
+//!   argument supplied by the caller.
+//!
+//! - [`write_count`]: count phase (after deduplication). The header annotation
+//!   contains the occurrence count from [`SuperKmer::count`].
+//!
+//! Both functions write standard OBITools-compatible FASTA:
+//!
+//! ```text
+//! >ID {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":42,"minimizer":"CGTGCTAGATC"}
+//! GCTAGCATGCTAGCTGTAGCTGTGAGTGCTG
+//! ```
+//!
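+//! The record identifier is the xxHash-64 of the ASCII sequence, formatted as
+//! a 16-digit uppercase hexadecimal string. xxHash-64 is a fast,
+//! non-cryptographic hash: two given distinct super-kmers share an identifier
+//! with probability of about 2^-64, which is good enough for debugging
+//! identifiers, but among billions of distinct super-kmers an occasional
+//! collision cannot be ruled out (birthday bound), so identifiers must not be
+//! relied upon as unique keys.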
+//!
+//! # Phase contract
+//!
+//! `write_scatter` is meant for super-kmers that are still in scatter phase,
+//! i.e. **before** [`SuperKmer::init_count`] is called; its minimizer
+//! annotation is decoded from the `min_hash` argument rather than read from
+//! the super-kmer.  `write_count` reads [`SuperKmer::count`], which is only
+//! meaningful **after** `init_count`.
+//! Mixing the two functions in the wrong phase produces silently wrong output;
+//! this is enforced by pipeline structure, not by the type system.
+
+#![deny(missing_docs)]
+
+use std::io::{self, Write};
+
+use obikseq::superkmer::SuperKmer;
+use xxhash_rust::xxh64::xxh64;
+
+// ── public API ────────────────────────────────────────────────────────────────
+
+/// Write one super-kmer in FASTA format — **scatter phase**.
+///
+/// The record is two lines: a header made of the sequence identifier followed
+/// by a JSON annotation, then the ASCII sequence itself.
+///
+/// The `minimizer` field in the JSON annotation contains the ASCII sequence of
+/// the minimizer, obtained by decoding `min_hash` as `m` right-aligned 2-bit
+/// nucleotide codes. The super-kmer itself only supplies the sequence line and
+/// the identifier derived from it.
+///
+/// # Parameters
+/// - `sk`: the super-kmer to serialise (must be in scatter phase)
+/// - `out`: destination writer
+/// - `k`: k-mer size used to build `sk`
+/// - `m`: minimizer size
+/// - `partition`: partition index computed from the minimizer hash
+/// - `min_hash`: 2-bit packed value of the minimizer; despite its name it is
+///   decoded back into a nucleotide string, not treated as an opaque hash
+///
+/// # Errors
+/// Returns any I/O error reported by `out`.
+pub fn write_scatter<W: Write>(
+    sk: &SuperKmer,
+    out: &mut W,
+    k: usize,
+    m: usize,
+    partition: u32,
+    min_hash: u64,
+) -> io::Result<()> {
+    let ascii = sk.to_ascii();
+    let id = seq_id(&ascii);
+    let seq_len = ascii.len();
+    let min_seq = decode_mmer(min_hash, m);
+
+    // Header up to the opening quote of the minimizer value.
+    write!(
+        out,
+        ">{id} {{\"seq_length\":{seq_len},\"kmer_size\":{k},\
+         \"minimizer_size\":{m},\"partition\":{partition},\
+         \"minimizer\":\""
+    )?;
+    // The decoded minimizer is already raw ASCII bytes: write it as-is rather
+    // than round-tripping through a UTF-8 validation that could only panic.
+    out.write_all(&min_seq)?;
+    out.write_all(b"\"}\n")?;
+
+    // Sequence line.
+    out.write_all(&ascii)?;
+    out.write_all(b"\n")
+}
+
+/// Decode a right-aligned 2-bit minimizer value into uppercase ASCII (A/C/G/T).
+///
+/// The lowest two bits of `val` encode the last base, the next two bits the
+/// base before it, and so on, with codes 0..=3 mapping to `A`, `C`, `G`, `T`.
+/// The result always has exactly `m` bytes.
+///
+/// A `u64` only holds 32 bases. When `m > 32`, the leading positions lie
+/// beyond bit 63 and are decoded as `A` (code 0) instead of triggering a
+/// shift-overflow panic or reading wrapped-around bits.
+fn decode_mmer(val: u64, m: usize) -> Vec<u8> {
+    const BASES: [u8; 4] = *b"ACGT";
+    (0..m)
+        .rev()
+        .map(|pos| {
+            // `pos` counts bases from the right; its code sits at bit `2 * pos`.
+            let code = pos
+                .checked_mul(2)
+                .and_then(|shift| u32::try_from(shift).ok())
+                .and_then(|shift| val.checked_shr(shift))
+                .unwrap_or(0);
+            BASES[(code & 3) as usize]
+        })
+        .collect()
+}
+
+/// Write one super-kmer in FASTA format — **count phase**.
+///
+/// Emits a header line (sequence identifier plus JSON annotation) followed by
+/// the ASCII sequence. The `count` field of the annotation is the value
+/// returned by [`SuperKmer::count`] (count-phase value of the payload field).
+///
+/// # Parameters
+/// - `sk`: the super-kmer to serialise (must be in count phase, i.e. after
+///   [`SuperKmer::init_count`] has been called)
+/// - `out`: destination writer
+/// - `k`: k-mer size
+/// - `m`: minimizer size
+/// - `partition`: partition index
+///
+/// # Errors
+/// Returns any I/O error reported by `out`.
+pub fn write_count<W: Write>(
+    sk: &SuperKmer,
+    out: &mut W,
+    k: usize,
+    m: usize,
+    partition: u32,
+) -> io::Result<()> {
+    let sequence = sk.to_ascii();
+    let record_id = seq_id(&sequence);
+    let length = sequence.len();
+    let occurrences = sk.count();
+
+    // Header line: identifier, then the JSON annotation.
+    writeln!(
+        out,
+        ">{record_id} {{\"seq_length\":{length},\"kmer_size\":{k},\
+         \"minimizer_size\":{m},\"partition\":{partition},\
+         \"count\":{occurrences}}}"
+    )?;
+
+    // Sequence line, terminated by a newline.
+    out.write_all(&sequence)
+        .and_then(|()| out.write_all(b"\n"))
+}
+
+// ── internal helpers ──────────────────────────────────────────────────────────
+
+/// Identifier for a sequence: its xxHash-64 (seed 0) rendered as a
+/// zero-padded, 16-character uppercase hexadecimal string.
+fn seq_id(ascii: &[u8]) -> String {
+    let digest = xxh64(ascii, 0);
+    format!("{digest:016X}")
+}
+
+// ── tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use obikseq::superkmer::SuperKmer;
+
+    /// Shorthand for `SuperKmer::from_ascii`.
+    fn make(seq: &[u8]) -> SuperKmer {
+        SuperKmer::from_ascii(seq)
+    }
+
+    /// Run `f` against a fresh in-memory buffer and return what it wrote as a
+    /// `String`; panics if `f` fails or the output is not valid UTF-8.
+    fn capture<F: Fn(&mut Vec<u8>) -> io::Result<()>>(f: F) -> String {
+        let mut sink = Vec::new();
+        f(&mut sink).unwrap();
+        String::from_utf8(sink).unwrap()
+    }
+
+    /// Serialise `sk` with `write_scatter` and return the record as text.
+    fn scattered(sk: &SuperKmer, k: usize, m: usize, partition: u32, min_hash: u64) -> String {
+        capture(|w| write_scatter(sk, w, k, m, partition, min_hash))
+    }
+
+    /// Serialise `sk` with `write_count` and return the record as text.
+    fn counted(sk: &SuperKmer, k: usize, m: usize, partition: u32) -> String {
+        capture(|w| write_count(sk, w, k, m, partition))
+    }
+
+    /// Identifier of a record: the first whitespace-delimited token of the
+    /// header line, with the leading `>` removed.
+    fn record_id(record: &str) -> String {
+        let header = record.lines().next().unwrap();
+        let token = header.split_whitespace().next().unwrap();
+        token[1..].to_string()
+    }
+
+    // ── write_scatter ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn scatter_header_contains_minimizer_field() {
+        let mut sk = make(b"ACGTACGTACGT");
+        sk.set_minimizer_pos(2);
+        let record = scattered(&sk, 4, 3, 7, 0);
+        assert!(record.contains("\"minimizer\":\""));
+        assert!(!record.contains("\"count\":"));
+    }
+
+    #[test]
+    fn scatter_minimizer_decoded_from_hash() {
+        // min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
+        let mut sk = make(b"ACGTACGTACGT");
+        sk.set_minimizer_pos(0);
+        let record = scattered(&sk, 4, 3, 0, 6);
+        assert!(
+            record.contains("\"minimizer\":\"ACG\""),
+            "got: {out}",
+            out = record
+        );
+    }
+
+    #[test]
+    fn scatter_fields_present() {
+        let mut sk = make(b"ACGTACGTACGT");
+        sk.set_minimizer_pos(0);
+        let record = scattered(&sk, 4, 3, 5, 0);
+        assert!(record.contains("\"seq_length\":12"));
+        assert!(record.contains("\"kmer_size\":4"));
+        assert!(record.contains("\"minimizer_size\":3"));
+        assert!(record.contains("\"partition\":5"));
+    }
+
+    #[test]
+    fn scatter_sequence_line_correct() {
+        let mut sk = make(b"ACGTACGT");
+        sk.set_minimizer_pos(0);
+        let record = scattered(&sk, 4, 2, 0, 0);
+        // Second line of the record is the sequence.
+        assert_eq!(record.lines().nth(1).unwrap(), "ACGTACGT");
+    }
+
+    // ── write_count ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn count_header_contains_count_field() {
+        let mut sk = make(b"ACGTACGTACGT");
+        sk.init_count();
+        sk.add(49);
+        let record = counted(&sk, 4, 3, 2);
+        assert!(record.contains("\"count\":50"));
+        assert!(!record.contains("\"minimizer\":"));
+    }
+
+    #[test]
+    fn count_fields_present() {
+        let mut sk = make(b"ACGTACGTACGT");
+        sk.init_count();
+        let record = counted(&sk, 4, 3, 9);
+        assert!(record.contains("\"seq_length\":12"));
+        assert!(record.contains("\"kmer_size\":4"));
+        assert!(record.contains("\"minimizer_size\":3"));
+        assert!(record.contains("\"partition\":9"));
+        assert!(record.contains("\"count\":1"));
+    }
+
+    #[test]
+    fn count_sequence_line_correct() {
+        let mut sk = make(b"TTTTACGT");
+        sk.init_count();
+        let record = counted(&sk, 4, 2, 0);
+        // Second line of the record is the sequence.
+        assert_eq!(record.lines().nth(1).unwrap(), "TTTTACGT");
+    }
+
+    // ── ID stability ──────────────────────────────────────────────────────────
+
+    #[test]
+    fn same_sequence_same_id() {
+        let mut first = make(b"ACGTACGT");
+        first.set_minimizer_pos(0);
+        let mut second = make(b"ACGTACGT");
+        second.set_minimizer_pos(4); // different pos, same sequence
+
+        let id_first = record_id(&scattered(&first, 4, 2, 0, 0));
+        let id_second = record_id(&scattered(&second, 4, 2, 0, 0));
+        assert_eq!(id_first, id_second, "same sequence must produce same ID");
+    }
+
+    #[test]
+    fn different_sequences_different_id() {
+        let mut first = make(b"ACGTACGT");
+        first.set_minimizer_pos(0);
+        let mut second = make(b"TTTTTTTT");
+        second.set_minimizer_pos(0);
+
+        let id_first = record_id(&scattered(&first, 4, 2, 0, 0));
+        let id_second = record_id(&scattered(&second, 4, 2, 0, 0));
+        assert_ne!(id_first, id_second);
+    }
+
+    #[test]
+    fn id_is_16_hex_digits() {
+        let mut sk = make(b"ACGTACGT");
+        sk.set_minimizer_pos(0);
+        let record = scattered(&sk, 4, 2, 0, 0);
+        let header = record.lines().next().unwrap();
+        let id = &header[1..17]; // skip '>'
+        assert_eq!(id.len(), 16);
+        assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
+    }
+}