From eaf893174fcb4623a7c0514d9fcc22784b27c1fe Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Sun, 26 Apr 2026 14:58:41 +0200
Subject: [PATCH] :recycle: refactor(obikpartitionner): replace low-level I/O with obiskio::SKFileWriter

- Replace `limits` module and raw binary I/O with a new high-level abstraction using obiskio::SKFileWriter
- Remove `niffler` dependency and compression logic (Gzip/Zstd/Lz4/Bgzf)
- Simplify PartitionManager to manage partitioned file writers based on kmer hashing
  * Uses `n_partition_bits` for bitmask-based partition selection (2^n partitions)
- Add obiskio as a local dependency

Note: This is likely part of aligning with unified I/O primitives in the obiskio crate.
---
 src/Cargo.lock               |  4 ++++
 src/obikseq/Cargo.toml       |  1 +
 src/obikseq/src/kmer.rs      | 19 +++++++++++++++++++
 src/obikseq/src/superkmer.rs | 25 ++++++++++++++++++++++++-
 4 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/src/Cargo.lock b/src/Cargo.lock
index 8191eed..e6bf410 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -781,6 +781,9 @@ dependencies = [
 [[package]]
 name = "obikpartitionner"
 version = "0.1.0"
+dependencies = [
+ "obiskio",
+]
 
 [[package]]
 name = "obikrope"
@@ -796,6 +799,7 @@ version = "0.1.0"
 dependencies = [
  "bitvec",
  "criterion2",
+ "xxhash-rust",
 ]
 
 [[package]]
diff --git a/src/obikseq/Cargo.toml b/src/obikseq/Cargo.toml
index bce7f97..daf8193 100644
--- a/src/obikseq/Cargo.toml
+++ b/src/obikseq/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2024"
 
 [dependencies]
 bitvec = "1"
+xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
 
 [dev-dependencies]
 criterion2 = { version = "3", features = ["cargo_bench_support"] }
diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs
index 32a769e..ad24534 100644
--- a/src/obikseq/src/kmer.rs
+++ b/src/obikseq/src/kmer.rs
@@ -49,6 +49,17 @@ impl std::error::Error for KmerError {}
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Kmer(u64);
 
+/// SplitMix64 finalizer: a bijective xor-shift/multiply mixer that spreads the
+/// bits of `x` over the whole 64-bit word (both multipliers are odd).
+#[inline]
+fn mix64(x: u64) -> u64 {
+    let x = x ^ (x >> 30);
+    let x = x.wrapping_mul(0xbf58476d1ce4e5b9);
+    let x = x ^ (x >> 27);
+    let x = x.wrapping_mul(0x94d049bb133111eb);
+    x ^ (x >> 31)
+}
+
 impl Kmer {
     /// Wrap a raw left-aligned u64 value as a Kmer.
     #[inline]
@@ -144,6 +155,14 @@ impl Kmer {
         let rc = self.revcomp(k);
         if self.0 <= rc.0 { *self } else { rc }
     }
+
+    /// Return a hash of this kmer.
+    ///
+    /// Uses the canonical form of the kmer to compute the hash.
+    #[inline]
+    pub fn hash(&self, k: usize) -> u64 {
+        mix64(self.canonical(k).0)
+    }
 }
 
 // ── tests ─────────────────────────────────────────────────────────────────────
diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs
index b87ca16..ff27941 100644
--- a/src/obikseq/src/superkmer.rs
+++ b/src/obikseq/src/superkmer.rs
@@ -4,6 +4,7 @@ use crate::encoding::{DEC4, encode_base};
 use crate::kmer::{Kmer, KmerError};
 use crate::revcomp_lookup::REVCOMP4;
 use bitvec::prelude::*;
+use xxhash_rust::xxh3::xxh3_64;
 
 // ── SuperKmerHeader ───────────────────────────────────────────────────────────
 
@@ -312,6 +313,15 @@ impl SuperKmer {
     ///
     /// Returns `true` if already canonical (no change), `false` if revcomp was applied.
     pub fn canonical(&mut self) -> bool {
+        if self.is_canonical() {
+            return true;
+        }
+        self.revcomp();
+        false
+    }
+
+    /// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
+    pub fn is_canonical(&self) -> bool {
         let seql = self.seql();
         for i in 0..seql {
             let fwd = self.nucleotide(i);
@@ -320,12 +330,25 @@ impl SuperKmer {
                 return true;
             }
             if fwd > rev {
-                self.revcomp();
                 return false;
             }
         }
         true
     }
+
+    /// Returns the XXH3 (64-bit) hash of the canonical form of this super-kmer.
+    ///
+    /// `self` is never modified: when it is not canonical, a reverse-complemented
+    /// clone is hashed instead.
+    pub fn hash(&self) -> u64 {
+        if self.is_canonical() {
+            xxh3_64(&self.seq)
+        } else {
+            let mut rev = self.clone();
+            rev.revcomp();
+            xxh3_64(&rev.seq)
+        }
+    }
 }
 
 // ── helpers ───────────────────────────────────────────────────────────────────