📦 Add infer and new pipeline infrastructure

- Update Cargo.lock with dependency additions (bumpalo, byteorder, cfb, fnv, infer, js-sys, uuid, wasm-bindgen)
- Refactor obikseq::superkmer: reorder imports and improve formatting
- Add `obipipeline` crate with scheduler, error handling, and macros (WIP)
- Replace obiread::expand_paths logic with PathIter and path_iterator module
  - Add MIME type detection using the `infer` crate via a PeekReader wrapper
This commit is contained in:
Eric Coissac
2026-04-23 21:03:48 +02:00
parent 664d0216b5
commit 3f8880a7e5
15 changed files with 893 additions and 86 deletions
+45 -27
View File
@@ -1,9 +1,9 @@
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
use bitvec::prelude::*;
use crate::encoding::{encode_base, DEC4};
use crate::encoding::{DEC4, encode_base};
use crate::kmer::{Kmer, KmerError};
use crate::revcomp_lookup::REVCOMP4;
use bitvec::prelude::*;
// ── SuperKmerHeader ───────────────────────────────────────────────────────────
@@ -193,7 +193,8 @@ impl SuperKmer {
let bytes = &mut self.seq[..n];
let (mut lo, mut hi) = (0, n - 1);
while lo < hi {
(bytes[lo], bytes[hi]) = (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
(bytes[lo], bytes[hi]) =
(REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
lo += 1;
hi -= 1;
}
@@ -216,16 +217,19 @@ impl SuperKmer {
/// The result is not yet in canonical form; call `.canonical()` if needed.
pub fn from_ascii(ascii: &[u8]) -> Self {
let seql = ascii.len();
debug_assert!(seql >= 1 && seql <= 256, "super-kmer length must be 1..=256");
debug_assert!(
seql >= 1 && seql <= 256,
"super-kmer length must be 1..=256"
);
let n = byte_len(seql);
let mut seq = vec![0u8; n];
let full = seql / 4;
for i in 0..full {
seq[i] = encode_base(ascii[i * 4]) << 6
| encode_base(ascii[i * 4 + 1]) << 4
| encode_base(ascii[i * 4 + 2]) << 2
| encode_base(ascii[i * 4 + 3]);
seq[i] = encode_base(ascii[i * 4]) << 6
| encode_base(ascii[i * 4 + 1]) << 4
| encode_base(ascii[i * 4 + 2]) << 2
| encode_base(ascii[i * 4 + 3]);
}
let rem = seql % 4;
if rem > 0 {
@@ -236,7 +240,7 @@ impl SuperKmer {
seq[full] = last;
}
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
}
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
@@ -270,7 +274,11 @@ impl SuperKmer {
}
let seql = self.seql();
if i + k > seql {
return Err(KmerError::OutOfBounds { position: i, k, seql });
return Err(KmerError::OutOfBounds {
position: i,
k,
seql,
});
}
let bits = self.seq.view_bits::<Msb0>();
let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
@@ -334,11 +342,16 @@ mod tests {
/// Reference revcomp on ASCII bytes.
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
seq.iter().rev().map(|&b| match b {
b'A' => b'T', b'T' => b'A',
b'C' => b'G', b'G' => b'C',
_ => b'A',
}).collect()
seq.iter()
.rev()
.map(|&b| match b {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
_ => b'A',
})
.collect()
}
fn all_lengths() -> impl Iterator<Item = usize> {
@@ -545,23 +558,24 @@ mod tests {
fn revcomp_known_values() {
let cases = [
// shift=6
("A", "T"),
("ACGTA", "TACGT"),
("A", "T"),
("ACGTA", "TACGT"),
// shift=4
("AC", "GT"),
("ACGTAC", "GTACGT"),
("AC", "GT"),
("ACGTAC", "GTACGT"),
// shift=2
("ACG", "CGT"),
("ACGTACG", "CGTACGT"),
("ACG", "CGT"),
("ACGTACG", "CGTACGT"),
// shift=0
("ACGT", "ACGT"),
("ACGT", "ACGT"),
("ACGTACGT", "ACGTACGT"),
];
for (seq, expected) in cases {
let mut sk = SuperKmer::from_ascii(seq.as_bytes());
sk.revcomp();
assert_eq!(
sk.to_ascii(), expected.as_bytes(),
sk.to_ascii(),
expected.as_bytes(),
"revcomp wrong for \"{seq}\""
);
}
@@ -594,21 +608,24 @@ mod tests {
#[test]
fn canonical_palindrome_unchanged() {
// ACGT is its own revcomp
let sk = SuperKmer::from_ascii(b"ACGT").canonical();
let mut sk = SuperKmer::from_ascii(b"ACGT");
sk.canonical();
assert_eq!(sk.to_ascii(), b"ACGT");
}
#[test]
fn canonical_chooses_forward() {
// "AAAA" < "TTTT" → stays as-is
let sk = SuperKmer::from_ascii(b"AAAA").canonical();
let mut sk = SuperKmer::from_ascii(b"AAAA");
sk.canonical();
assert_eq!(sk.to_ascii(), b"AAAA");
}
#[test]
fn canonical_chooses_revcomp() {
// "TTTT" > "AAAA" → flipped
let sk = SuperKmer::from_ascii(b"TTTT").canonical();
let mut sk = SuperKmer::from_ascii(b"TTTT");
sk.canonical();
assert_eq!(sk.to_ascii(), b"AAAA");
}
@@ -616,7 +633,8 @@ mod tests {
fn canonical_is_minimal_all_lengths() {
for len in all_lengths() {
let ascii = make_seq(len);
let sk = SuperKmer::from_ascii(&ascii).canonical();
let mut sk = SuperKmer::from_ascii(&ascii);
sk.canonical();
let fwd = sk.to_ascii();
let rev = ascii_revcomp(&fwd);
assert!(fwd <= rev, "canonical not minimal for len={len}");