📦 Add infer and new pipeline infrastructure
- Update Cargo.lock with dependency additions (bumpalo, byteorder, cfb, fnv, infer, js-sys, uuid, wasm-bindgen)
- Refactor obikseq::superkmer: reorder imports and improve formatting
- Add `obipipeline` crate with scheduler, error handling, and macros (WIP)
- Replace obiread::expand_paths logic with PathIter and the path_iterator module
- Add MIME type detection using the `infer` crate via a PeekReader wrapper
This commit is contained in:
@@ -1,9 +1,9 @@
|
||||
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
|
||||
|
||||
use bitvec::prelude::*;
|
||||
use crate::encoding::{encode_base, DEC4};
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
|
||||
// ── SuperKmerHeader ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -193,7 +193,8 @@ impl SuperKmer {
|
||||
let bytes = &mut self.seq[..n];
|
||||
let (mut lo, mut hi) = (0, n - 1);
|
||||
while lo < hi {
|
||||
(bytes[lo], bytes[hi]) = (REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
|
||||
(bytes[lo], bytes[hi]) =
|
||||
(REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
|
||||
lo += 1;
|
||||
hi -= 1;
|
||||
}
|
||||
@@ -216,16 +217,19 @@ impl SuperKmer {
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(seql >= 1 && seql <= 256, "super-kmer length must be 1..=256");
|
||||
debug_assert!(
|
||||
seql >= 1 && seql <= 256,
|
||||
"super-kmer length must be 1..=256"
|
||||
);
|
||||
let n = byte_len(seql);
|
||||
let mut seq = vec![0u8; n];
|
||||
|
||||
let full = seql / 4;
|
||||
for i in 0..full {
|
||||
seq[i] = encode_base(ascii[i * 4]) << 6
|
||||
| encode_base(ascii[i * 4 + 1]) << 4
|
||||
| encode_base(ascii[i * 4 + 2]) << 2
|
||||
| encode_base(ascii[i * 4 + 3]);
|
||||
seq[i] = encode_base(ascii[i * 4]) << 6
|
||||
| encode_base(ascii[i * 4 + 1]) << 4
|
||||
| encode_base(ascii[i * 4 + 2]) << 2
|
||||
| encode_base(ascii[i * 4 + 3]);
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
@@ -236,7 +240,7 @@ impl SuperKmer {
|
||||
seq[full] = last;
|
||||
}
|
||||
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
|
||||
@@ -270,7 +274,11 @@ impl SuperKmer {
|
||||
}
|
||||
let seql = self.seql();
|
||||
if i + k > seql {
|
||||
return Err(KmerError::OutOfBounds { position: i, k, seql });
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: i,
|
||||
k,
|
||||
seql,
|
||||
});
|
||||
}
|
||||
let bits = self.seq.view_bits::<Msb0>();
|
||||
let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
|
||||
@@ -334,11 +342,16 @@ mod tests {
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter().rev().map(|&b| match b {
|
||||
b'A' => b'T', b'T' => b'A',
|
||||
b'C' => b'G', b'G' => b'C',
|
||||
_ => b'A',
|
||||
}).collect()
|
||||
seq.iter()
|
||||
.rev()
|
||||
.map(|&b| match b {
|
||||
b'A' => b'T',
|
||||
b'T' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
_ => b'A',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
|
||||
@@ -545,23 +558,24 @@ mod tests {
|
||||
fn revcomp_known_values() {
|
||||
let cases = [
|
||||
// shift=6
|
||||
("A", "T"),
|
||||
("ACGTA", "TACGT"),
|
||||
("A", "T"),
|
||||
("ACGTA", "TACGT"),
|
||||
// shift=4
|
||||
("AC", "GT"),
|
||||
("ACGTAC", "GTACGT"),
|
||||
("AC", "GT"),
|
||||
("ACGTAC", "GTACGT"),
|
||||
// shift=2
|
||||
("ACG", "CGT"),
|
||||
("ACGTACG", "CGTACGT"),
|
||||
("ACG", "CGT"),
|
||||
("ACGTACG", "CGTACGT"),
|
||||
// shift=0
|
||||
("ACGT", "ACGT"),
|
||||
("ACGT", "ACGT"),
|
||||
("ACGTACGT", "ACGTACGT"),
|
||||
];
|
||||
for (seq, expected) in cases {
|
||||
let mut sk = SuperKmer::from_ascii(seq.as_bytes());
|
||||
sk.revcomp();
|
||||
assert_eq!(
|
||||
sk.to_ascii(), expected.as_bytes(),
|
||||
sk.to_ascii(),
|
||||
expected.as_bytes(),
|
||||
"revcomp wrong for \"{seq}\""
|
||||
);
|
||||
}
|
||||
@@ -594,21 +608,24 @@ mod tests {
|
||||
#[test]
|
||||
fn canonical_palindrome_unchanged() {
|
||||
// ACGT is its own revcomp
|
||||
let sk = SuperKmer::from_ascii(b"ACGT").canonical();
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"ACGT");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_forward() {
|
||||
// "AAAA" < "TTTT" → stays as-is
|
||||
let sk = SuperKmer::from_ascii(b"AAAA").canonical();
|
||||
let mut sk = SuperKmer::from_ascii(b"AAAA");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_revcomp() {
|
||||
// "TTTT" > "AAAA" → flipped
|
||||
let sk = SuperKmer::from_ascii(b"TTTT").canonical();
|
||||
let mut sk = SuperKmer::from_ascii(b"TTTT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
@@ -616,7 +633,8 @@ mod tests {
|
||||
fn canonical_is_minimal_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let sk = SuperKmer::from_ascii(&ascii).canonical();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.canonical();
|
||||
let fwd = sk.to_ascii();
|
||||
let rev = ascii_revcomp(&fwd);
|
||||
assert!(fwd <= rev, "canonical not minimal for len={len}");
|
||||
|
||||
Reference in New Issue
Block a user