refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
@@ -1 +0,0 @@
|
||||
Eric Coissac,coissac,mac.lan,20.04.2026 19:13,file:///Users/coissac/Library/Application%20Support/LibreOffice/4;
|
||||
Generated
+8
@@ -1590,6 +1590,9 @@ name = "obikmer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"memmap2",
|
||||
"niffler 3.0.0",
|
||||
"obidebruinj",
|
||||
"obifastwrite",
|
||||
"obikpartitionner",
|
||||
"obikrope",
|
||||
@@ -1597,7 +1600,10 @@ dependencies = [
|
||||
"obipipeline",
|
||||
"obiread",
|
||||
"obiskbuilder",
|
||||
"obiskio",
|
||||
"ph",
|
||||
"pprof",
|
||||
"rayon",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
@@ -1633,6 +1639,8 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"bitvec",
|
||||
"criterion2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
/// A JSON scalar that can be used as a field value in a header annotation.
pub(crate) enum JsonVal<'a> {
    /// Unsigned integer, rendered as a bare JSON number.
    Num(u64),
    /// Borrowed string, rendered as a double-quoted, escaped JSON string.
    Str(&'a str),
}

impl fmt::Display for JsonVal<'_> {
    /// Render the value as a JSON token.
    ///
    /// Strings are escaped so the output stays valid JSON whatever the
    /// content: `"` and `\` are backslash-escaped, newline / carriage return /
    /// tab use their short escapes, and any other control character below
    /// U+0020 is written as `\u00XX`. Strings without such characters are
    /// rendered exactly as `"<text>"`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            JsonVal::Num(n) => write!(f, "{n}"),
            JsonVal::Str(s) => {
                f.write_str("\"")?;
                for c in s.chars() {
                    match c {
                        '"' => f.write_str("\\\"")?,
                        '\\' => f.write_str("\\\\")?,
                        '\n' => f.write_str("\\n")?,
                        '\r' => f.write_str("\\r")?,
                        '\t' => f.write_str("\\t")?,
                        // Remaining C0 controls have no short escape in JSON.
                        c if (c as u32) < 0x20 => write!(f, "\\u{:04x}", c as u32)?,
                        c => write!(f, "{c}")?,
                    }
                }
                f.write_str("\"")
            }
        }
    }
}
|
||||
|
||||
pub(crate) fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
pub(crate) fn annotation<W: Write>(
|
||||
writer: &mut W,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
) -> io::Result<()> {
|
||||
write!(writer, "{{")?;
|
||||
for (i, (k, v)) in fields.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(writer, ",")?;
|
||||
}
|
||||
write!(writer, "\"{k}\":{v}")?;
|
||||
}
|
||||
write!(writer, "}}")
|
||||
}
|
||||
|
||||
/// Write `seq` to `writer` as newline-terminated lines of at most `width` bytes.
///
/// The bytes are copied through verbatim, so nothing is assumed about their
/// encoding and no `unsafe` is needed. An empty `seq` writes nothing.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidInput` when `width` is zero (a zero chunk
/// size would otherwise panic), or the first error reported by `writer`.
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
    if width == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "sequence line width must be greater than zero",
        ));
    }
    for chunk in seq.chunks(width) {
        writer.write_all(chunk)?;
        writer.write_all(b"\n")?;
    }
    Ok(())
}
|
||||
+15
-24
@@ -30,6 +30,8 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod fasta;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
@@ -168,8 +170,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_header_contains_minimizer_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(2);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"minimizer\":\""));
|
||||
assert!(!out.contains("\"count\":"));
|
||||
@@ -178,16 +179,14 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
|
||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -197,8 +196,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_sequence_line_correct() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTACGT");
|
||||
@@ -209,7 +207,6 @@ mod tests {
|
||||
#[test]
|
||||
fn count_header_contains_count_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
sk.add(49);
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 2));
|
||||
assert!(out.contains("\"count\":50"));
|
||||
@@ -218,8 +215,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 9));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -230,21 +226,19 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_sequence_line_correct() {
|
||||
let mut sk = make(b"TTTTACGT");
|
||||
sk.init_count();
|
||||
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
|
||||
let sk = make(b"TTTTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "TTTTACGT");
|
||||
assert_eq!(lines[1], "ACGTAAAA");
|
||||
}
|
||||
|
||||
// ── ID stability ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn same_sequence_same_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"ACGTACGT");
|
||||
sk2.set_minimizer_pos(4); // different pos, same sequence
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"ACGTACGT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -267,10 +261,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn different_sequences_different_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"TTTTTTTT");
|
||||
sk2.set_minimizer_pos(0);
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -293,8 +285,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn id_is_16_hex_digits() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
assert_eq!(id.len(), 16);
|
||||
|
||||
@@ -13,9 +13,15 @@ obiread = { path = "../obiread" }
|
||||
obiskbuilder = { path = "../obiskbuilder" }
|
||||
obifastwrite = { path = "../obifastwrite" }
|
||||
obipipeline = { path = "../obipipeline" }
|
||||
obidebruinj = { path = "../obidebruinj" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
obikrope = { path = "../obikrope" }
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
niffler = "3"
|
||||
rayon = "1"
|
||||
ph = "0.11"
|
||||
memmap2 = "0.9"
|
||||
tracing = "0.1.44"
|
||||
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
||||
pprof = { version = "0.13", features = ["prost-codec"], optional = true }
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikrope::Rope;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
// ── Shared arguments ──────────────────────────────────────────────────────────
|
||||
|
||||
@@ -57,7 +57,7 @@ pub enum PipelineData {
|
||||
Path(PathBuf),
|
||||
RawChunk(Rope),
|
||||
NormChunk(Rope),
|
||||
Batch(Vec<SuperKmer>),
|
||||
Batch(Vec<RoutableSuperKmer>),
|
||||
}
|
||||
|
||||
// SAFETY: Rope contains Cell<u8> which is !Sync, but pipeline ownership transfers
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obifastwrite::write_count;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obiskio::SKFileReader;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct FastaArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Dump dereplicated super-kmers as FASTA (→ <partition>/dereplicated.skmer.fasta.gz)
|
||||
#[arg(long)]
|
||||
pub super_kmers: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: FastaArgs) {
|
||||
if !args.super_kmers {
|
||||
eprintln!("error: specify at least one output mode (--super-kmers)");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
if args.super_kmers {
|
||||
dump_super_kmers(&kp, &args.partition);
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_super_kmers(kp: &KmerPartition, partition_dir: &PathBuf) {
|
||||
let k = kp.kmer_size();
|
||||
let m = kp.minimizer_size();
|
||||
let n = kp.n_partitions();
|
||||
|
||||
info!("writing {n} partition FASTA files (parallel)");
|
||||
|
||||
let total = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = partition_dir.join(format!("part_{i:05}"));
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("dereplicated.skmer.fasta.gz");
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut count = 0usize;
|
||||
for sk in reader.iter() {
|
||||
write_count(&sk, &mut writer, k, m, i as u32).unwrap_or_else(|e| {
|
||||
eprintln!("write error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
count += 1;
|
||||
}
|
||||
info!("partition {i}: {count} super-kmers → {}", out_path.display());
|
||||
total.fetch_add(count, Ordering::Relaxed);
|
||||
});
|
||||
|
||||
info!("wrote {} super-kmers total", total.load(Ordering::Relaxed));
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
pub mod count;
|
||||
pub mod fasta;
|
||||
pub mod partition;
|
||||
pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cli::{CommonArgs, PipelineData, open_chunks};
|
||||
@@ -39,14 +39,14 @@ pub fn run(args: PartitionArgs) {
|
||||
let path_source = args.common.seqfile_paths();
|
||||
|
||||
let pipe = obipipeline::make_pipe! {
|
||||
PipelineData : PathBuf => Vec<SuperKmer>,
|
||||
PipelineData : PathBuf => Vec<RoutableSuperKmer>,
|
||||
||? { |path| open_chunks(path) } : Path => RawChunk,
|
||||
|? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk,
|
||||
| { move |rope| obiskbuilder::build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||
};
|
||||
|
||||
for mut batch in pipe.apply(path_source, n_workers, 1) {
|
||||
kp.write_batch(&mut batch).unwrap_or_else(|e| {
|
||||
for batch in pipe.apply(path_source, n_workers, 1) {
|
||||
kp.write_batch(batch).unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obifastwrite::write_scatter;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::cli::{CommonArgs, PipelineData, open_chunks};
|
||||
|
||||
@@ -16,20 +16,17 @@ pub struct SuperkmerArgs {
|
||||
// ── Stage functions ───────────────────────────────────────────────────────────
|
||||
|
||||
fn write_batch(
|
||||
batch: Vec<SuperKmer>,
|
||||
batch: Vec<RoutableSuperKmer>,
|
||||
out: &mut BufWriter<io::Stdout>,
|
||||
partition_bits: usize,
|
||||
k: usize,
|
||||
m: usize,
|
||||
) -> io::Result<()> {
|
||||
let partition_mask = (1u64 << partition_bits) - 1;
|
||||
for sk in batch {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, m)
|
||||
.map_err(io::Error::other)?
|
||||
.canonical(m);
|
||||
for rsk in batch {
|
||||
let minimizer = *rsk.minimizer();
|
||||
let partition = (minimizer.hash(m) & partition_mask) as usize;
|
||||
write_scatter(&sk, out, k, m, partition, minimizer)?;
|
||||
write_scatter(rsk.superkmer(), out, k, m, partition, minimizer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -47,7 +44,7 @@ pub fn run(args: SuperkmerArgs) {
|
||||
let path_source = args.common.seqfile_paths();
|
||||
|
||||
let pipe = obipipeline::make_pipe! {
|
||||
PipelineData : PathBuf => Vec<SuperKmer>,
|
||||
PipelineData : PathBuf => Vec<RoutableSuperKmer>,
|
||||
||? { |path| open_chunks(path) } : Path => RawChunk,
|
||||
|? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk,
|
||||
| { move |rope| obiskbuilder::build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obiskio::SKFileReader;
|
||||
use ph::fmph::GOFunction;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
// Command-line arguments of the `unitig` subcommand.
// The `///` comments on the fields are picked up by clap as help text, so
// maintainer notes in this struct use plain `//` comments.
#[derive(Args)]
|
||||
pub struct UnitigArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
// NOTE: `run` applies the abundance bounds only to partitions that contain
// both `mphf1.bin` and `counts1.bin`; the default of 1 requests no lower bound.
/// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub min_abundance: u32,
|
||||
|
||||
// `None` means no upper bound.
/// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
|
||||
#[arg(long)]
|
||||
pub max_abundance: Option<u32>,
|
||||
}
|
||||
|
||||
pub fn run(args: UnitigArgs) {
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let k = kp.kmer_size();
|
||||
let n = kp.n_partitions();
|
||||
info!("building unitigs from {n} partitions (k={k}, parallel)");
|
||||
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = args.partition.join(format!("part_{i:05}"));
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("unitig.fasta.gz");
|
||||
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
|
||||
let mphf_path = part_dir.join("mphf1.bin");
|
||||
let counts_path = part_dir.join("counts1.bin");
|
||||
let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
|
||||
&& mphf_path.exists()
|
||||
&& counts_path.exists();
|
||||
|
||||
let mphf_opt: Option<GOFunction> = if filter_active {
|
||||
let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
|
||||
eprintln!("error reading MPHF {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_mmap_opt = if filter_active {
|
||||
let cf = File::open(&counts_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(unsafe {
|
||||
memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
|
||||
eprintln!("error mmapping {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
})
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_slice: Option<&[u32]> = counts_mmap_opt.as_ref().map(|m| unsafe {
|
||||
std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4)
|
||||
});
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
let accept = match (&mphf_opt, counts_slice) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
if let Some(slot) = mphf.get(&kmer) {
|
||||
let ab = counts[slot as usize];
|
||||
ab >= args.min_abundance
|
||||
&& args.max_abundance.map_or(true, |max| ab <= max)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => true,
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
info!("partition {i}/{n}: {n_kmers} canonical k-mers → {}", out_path.display());
|
||||
|
||||
g.compute_degrees();
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
g.write_fasta(&mut writer).unwrap_or_else(|e| {
|
||||
eprintln!("write error on partition {i}: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
});
|
||||
|
||||
info!(
|
||||
"done — {} total canonical k-mers across all partitions",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
}
|
||||
@@ -19,6 +19,10 @@ enum Commands {
|
||||
Partition(cmd::partition::PartitionArgs),
|
||||
/// Count kmers from an existing dereplicated partition directory
|
||||
Count(cmd::count::CountArgs),
|
||||
/// Export partition data to FASTA (--super-kmers: dereplicated super-kmers)
|
||||
Fasta(cmd::fasta::FastaArgs),
|
||||
/// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
|
||||
Unitig(cmd::unitig::UnitigArgs),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -41,6 +45,8 @@ fn main() {
|
||||
Commands::Superkmer(args) => cmd::superkmer::run(args),
|
||||
Commands::Partition(args) => cmd::partition::run(args),
|
||||
Commands::Count(args) => cmd::count::run(args),
|
||||
Commands::Fasta(args) => cmd::fasta::run(args),
|
||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||
}
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
|
||||
@@ -15,6 +15,7 @@ use remove_dir_all::remove_dir_all;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -102,8 +103,8 @@ impl KmerPartition {
|
||||
.into());
|
||||
}
|
||||
let meta_path = root_path.join(META_FILENAME);
|
||||
let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?)
|
||||
.map_err(io::Error::other)?;
|
||||
let meta: PartitionMeta =
|
||||
serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
|
||||
|
||||
let level = level_from_u32(meta.level);
|
||||
let n_partitions = 1usize << meta.n_bits;
|
||||
@@ -120,19 +121,21 @@ impl KmerPartition {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write(&mut self, sk: &mut SuperKmer) -> SKResult<()> {
|
||||
/// Route and write one super-kmer to its partition file.
|
||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)
|
||||
}
|
||||
|
||||
pub fn write_batch(&mut self, sks: &mut [SuperKmer]) -> SKResult<()> {
|
||||
/// Route and write a batch of super-kmers.
|
||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)?;
|
||||
for rsk in rsks {
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -164,6 +167,18 @@ impl KmerPartition {
|
||||
&self.root_path
|
||||
}
|
||||
|
||||
/// Returns the k-mer length `k` stored in this partition set.
pub fn kmer_size(&self) -> usize {
|
||||
self.kmer_size
|
||||
}
|
||||
|
||||
/// Returns the minimizer length `m`, the value passed to the minimizer hash
/// when a super-kmer is routed to its partition.
pub fn minimizer_size(&self) -> usize {
|
||||
self.minimizer_size
|
||||
}
|
||||
|
||||
/// Returns the number of partitions (`1 << n_bits` when the partition set is
/// opened from its metadata file).
pub fn n_partitions(&self) -> usize {
|
||||
self.n_partitions
|
||||
}
|
||||
|
||||
/// Deduplicate all `raw.{ext}` files in parallel, replacing each with a
|
||||
/// `dereplicated.{ext}` file where identical canonical sequences are merged
|
||||
/// and their counts summed.
|
||||
@@ -185,6 +200,7 @@ impl KmerPartition {
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let level = self.level;
|
||||
let k = self.kmer_size;
|
||||
let root = &self.root_path;
|
||||
let sys = System::new_all();
|
||||
// available_memory() can return 0 on macOS when the compressor page count exceeds
|
||||
@@ -205,7 +221,7 @@ impl KmerPartition {
|
||||
}
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
|
||||
dereplicate_partition(&dir, level, n_buckets)
|
||||
dereplicate_partition(&dir, level, n_buckets, k)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -270,8 +286,10 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
let global_spectrum_map: BTreeMap<String, u64> =
|
||||
global_spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let global_spectrum_map: BTreeMap<String, u64> = global_spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(root.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
|
||||
@@ -291,14 +309,6 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_of(&self, sk: &SuperKmer) -> SKResult<usize> {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, self.minimizer_size)
|
||||
.map_err(|e| io::Error::other(e))?
|
||||
.canonical(self.minimizer_size);
|
||||
Ok((minimizer.hash(self.minimizer_size) & self.partitions_mask) as usize)
|
||||
}
|
||||
|
||||
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
|
||||
let meta = PartitionMeta {
|
||||
n_bits,
|
||||
@@ -316,7 +326,8 @@ impl KmerPartition {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
let writer =
|
||||
SKFileWriter::create_with(file_path, self.kmer_size, Format::Zstd, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
@@ -373,33 +384,47 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
|
||||
|
||||
fn level_from_u32(n: u32) -> Level {
|
||||
match n {
|
||||
0 => Level::Zero, 1 => Level::One, 2 => Level::Two, 3 => Level::Three,
|
||||
4 => Level::Four, 5 => Level::Five, 6 => Level::Six, 7 => Level::Seven,
|
||||
8 => Level::Eight, 9 => Level::Nine, 10 => Level::Ten, 11 => Level::Eleven,
|
||||
12 => Level::Twelve, 13 => Level::Thirteen, 14 => Level::Fourteen,
|
||||
15 => Level::Fifteen, 16 => Level::Sixteen, 17 => Level::Seventeen,
|
||||
18 => Level::Eighteen, 19 => Level::Nineteen, 20 => Level::Twenty,
|
||||
0 => Level::Zero,
|
||||
1 => Level::One,
|
||||
2 => Level::Two,
|
||||
3 => Level::Three,
|
||||
4 => Level::Four,
|
||||
5 => Level::Five,
|
||||
6 => Level::Six,
|
||||
7 => Level::Seven,
|
||||
8 => Level::Eight,
|
||||
9 => Level::Nine,
|
||||
10 => Level::Ten,
|
||||
11 => Level::Eleven,
|
||||
12 => Level::Twelve,
|
||||
13 => Level::Thirteen,
|
||||
14 => Level::Fourteen,
|
||||
15 => Level::Fifteen,
|
||||
16 => Level::Sixteen,
|
||||
17 => Level::Seventeen,
|
||||
18 => Level::Eighteen,
|
||||
19 => Level::Nineteen,
|
||||
20 => Level::Twenty,
|
||||
_ => Level::TwentyOne,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||
|
||||
/// Deduplicate one partition directory in place (two-phase split + merge).
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
if !raw_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
|
||||
let mut writer = SKFileWriter::create_with(&out_path, k, Format::Zstd, level)?;
|
||||
|
||||
if n_temp == 1 {
|
||||
// ── Direct path: partition fits in memory, no split needed ────────────
|
||||
let map = load_bucket(&raw_path)?;
|
||||
let map = load_bucket(&raw_path, k)?;
|
||||
remove_skmer_file(&raw_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
} else {
|
||||
@@ -412,10 +437,10 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
{
|
||||
let mut writers: Vec<SKFileWriter> = temp_paths
|
||||
.iter()
|
||||
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
|
||||
.map(|p| SKFileWriter::create_with(p, k, Format::Zstd, level))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path)?;
|
||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let bucket = (sk.hash() & temp_mask) as usize;
|
||||
@@ -429,7 +454,7 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
|
||||
// ── Phase 2: merge each temp bucket into the output ───────────────────
|
||||
for temp_path in &temp_paths {
|
||||
let map = load_bucket(temp_path)?;
|
||||
let map = load_bucket(temp_path, k)?;
|
||||
remove_skmer_file(temp_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
}
|
||||
@@ -440,14 +465,14 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
}
|
||||
|
||||
/// Read a SuperKmer file into a deduplication map (already canonical).
|
||||
fn load_bucket(path: &Path) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
let capacity = SKFileMeta::read(path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|m| m.instances as usize)
|
||||
.unwrap_or(0);
|
||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||
let mut reader = SKFileReader::open(path)?;
|
||||
let mut reader = SKFileReader::open(path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let count = sk.count() as u64;
|
||||
@@ -487,7 +512,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass1_superkmers += 1;
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
@@ -497,7 +522,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
}
|
||||
let kmers: Vec<Kmer> = seen.into_iter().collect();
|
||||
let n_kmers = kmers.len();
|
||||
debug!("{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}", dir.display());
|
||||
debug!(
|
||||
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",
|
||||
dir.display()
|
||||
);
|
||||
|
||||
if n_kmers == 0 {
|
||||
return Ok(());
|
||||
@@ -527,13 +555,16 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
{
|
||||
let counts =
|
||||
unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut u32, n_kmers) };
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass2_superkmers += 1;
|
||||
let seql = sk.seql();
|
||||
let seql = sk.len();
|
||||
let sk_count = sk.count();
|
||||
if pass2_superkmers <= 3 {
|
||||
debug!("{}: sk#{pass2_superkmers} seql={seql} count={sk_count}", dir.display());
|
||||
debug!(
|
||||
"{}: sk#{pass2_superkmers} seql={seql} count={sk_count}",
|
||||
dir.display()
|
||||
);
|
||||
}
|
||||
if seql < k {
|
||||
continue;
|
||||
@@ -566,8 +597,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let f0 = n_kmers as u64;
|
||||
let f1: u64 = spectrum.iter().map(|(&c, &f)| c as u64 * f).sum();
|
||||
|
||||
let spectrum_map: BTreeMap<String, u64> =
|
||||
spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let spectrum_map: BTreeMap<String, u64> = spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(dir.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": f0, "f1": f1, "spectrum": &spectrum_map }),
|
||||
|
||||
@@ -5,6 +5,8 @@ edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bitvec = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0.149"
|
||||
xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -40,7 +40,7 @@ fn bench_write_ascii(c: &mut Criterion) {
|
||||
let mut buf = Vec::with_capacity(len);
|
||||
b.iter(|| {
|
||||
buf.clear();
|
||||
std::hint::black_box(sk).write_ascii(&mut buf);
|
||||
std::hint::black_box(sk).write_ascii(&mut buf).unwrap();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
use serde::Serialize;
|
||||
use serde_json;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// Serialize `self` as a single-line JSON object into a writer.
|
||||
pub trait Annotation: Serialize {
|
||||
/// Write the annotation as compact JSON into `writer`.
|
||||
fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let s = serde_json::to_string(self).map_err(io::Error::other)?;
|
||||
writer.write_all(s.as_bytes())
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
|
||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||
@@ -115,24 +117,24 @@ impl Kmer {
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(k);
|
||||
self.write_ascii(k, &mut buf);
|
||||
self.write_ascii(k, &mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
/// Decode this kmer into ASCII nucleotides, appending into `buf`.
|
||||
/// Zero allocation — caller owns the buffer.
|
||||
/// Decode this kmer into ASCII nucleotides, writing into `writer`.
|
||||
#[inline]
|
||||
pub fn write_ascii(&self, k: usize, buf: &mut Vec<u8>) {
|
||||
pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
|
||||
let bytes = self.0.to_be_bytes();
|
||||
let full = k / 4;
|
||||
let rem = k % 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[bytes[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
if rem > 0 {
|
||||
let decoded = DEC4[bytes[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&decoded[..rem]);
|
||||
writer.write_all(&decoded[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the reverse complement of this kmer.
|
||||
|
||||
@@ -5,8 +5,17 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod annotations;
|
||||
|
||||
mod encoding;
|
||||
pub mod kmer;
|
||||
mod revcomp_lookup;
|
||||
/// Routable super-kmer: canonical sequence paired with its minimizer for scatter routing.
|
||||
pub mod routable;
|
||||
pub mod superkmer;
|
||||
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use superkmer::SuperKmer;
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
|
||||
|
||||
use super::kmer::Kmer;
|
||||
use super::SuperKmer;
|
||||
|
||||
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`Kmer`].
|
||||
///
|
||||
/// Created at the single point where raw sequence bytes are emitted from the
|
||||
/// scratch buffer. The minimizer position (given in original orientation) is
|
||||
/// adjusted for any flip applied during canonicalisation. After routing, call
|
||||
/// [`into_superkmer`] to discard the metadata and continue with the bare sequence.
|
||||
///
|
||||
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
|
||||
pub struct RoutableSuperKmer {
|
||||
superkmer: SuperKmer,
|
||||
minimizer: Kmer,
|
||||
}
|
||||
|
||||
impl RoutableSuperKmer {
|
||||
/// Construct from raw packed bytes.
|
||||
///
|
||||
/// `min_pos` is the 0-based minimizer position in the **original** (pre-flip)
|
||||
/// orientation. `m` is the minimizer length. `seql` and `seq` are the
|
||||
/// raw length byte and 2-bit-packed nucleotides as produced by the scratch
|
||||
/// buffer.
|
||||
pub fn build(min_pos: usize, m: usize, seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let (sk, already_canonical) = SuperKmer::build(seql, seq);
|
||||
let adjusted_pos = if already_canonical {
|
||||
min_pos
|
||||
} else {
|
||||
sk.len() - m - min_pos
|
||||
};
|
||||
let minimizer = sk.kmer(adjusted_pos, m).unwrap().canonical(m);
|
||||
Self {
|
||||
superkmer: sk,
|
||||
minimizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Borrow the canonical super-kmer sequence.
|
||||
pub fn superkmer(&self) -> &SuperKmer {
|
||||
&self.superkmer
|
||||
}
|
||||
|
||||
/// Borrow the canonical minimizer kmer.
|
||||
pub fn minimizer(&self) -> &Kmer {
|
||||
&self.minimizer
|
||||
}
|
||||
|
||||
/// Consume this wrapper and return the inner [`SuperKmer`].
|
||||
pub fn into_superkmer(self) -> SuperKmer {
|
||||
self.superkmer
|
||||
}
|
||||
|
||||
/// Sequence length in nucleotides.
|
||||
pub fn len(&self) -> usize {
|
||||
self.superkmer.len()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
/// Common read-only interface over sequence types.
pub trait Sequence {
    /// Length of the sequence, as reported by the implementor.
    fn len(&self) -> usize;

    /// Returns `true` when [`len`](Sequence::len) is zero.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Borrow the underlying bytes.
    ///
    /// NOTE(review): the encoding (ASCII vs. packed) is implementor-defined — confirm.
    fn sequence(&self) -> &[u8];

    /// Return the reverse complement as a new value.
    ///
    /// Bounded by `Self: Sized` so that the other methods stay callable
    /// through `dyn Sequence`.
    fn revcomp(&self) -> Self
    where
        Self: Sized;
}
|
||||
+54
-587
@@ -1,4 +1,7 @@
|
||||
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
|
||||
use std::io::{self, Write};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
@@ -14,70 +17,24 @@ use xxhash_rust::xxh3::xxh3_64;
|
||||
///
|
||||
/// ```text
|
||||
/// [31 .......... 8] [7 ...... 0]
|
||||
/// payload (24 b) SEQL (8 b)
|
||||
/// count (24 b) SEQL (8 b)
|
||||
/// ```
|
||||
///
|
||||
/// SEQL encodes the sequence length: 1–255 map directly; 0 encodes 256.
|
||||
///
|
||||
/// # Temporal dual-use of the payload field
|
||||
///
|
||||
/// The 24-bit payload field serves two distinct roles that are **never active
|
||||
/// at the same time**, separated by the routing step of the scatter pipeline:
|
||||
///
|
||||
/// | Phase | Bits [15:8] | Bits [31:16] |
|
||||
/// |---|---|---|
|
||||
/// | **Scatter** (before routing) | minimizer start position (0–255) | unused (zero) |
|
||||
/// | **Count** (after routing) | low byte of occurrence count | high bytes of occurrence count |
|
||||
///
|
||||
/// During scatter, [`set_minimizer_pos`] stores the 0-based position of the
|
||||
/// minimizer's first nucleotide within the super-kmer. At routing time,
|
||||
/// [`init_count`] overwrites the entire payload with `1`, marking the
|
||||
/// super-kmer as seen once and enabling the usual [`increment`] / [`add`] /
|
||||
/// [`set_count`] operations during deduplication.
|
||||
///
|
||||
/// [`set_minimizer_pos`]: SuperKmerHeader::set_minimizer_pos
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
/// [`increment`]: SuperKmerHeader::increment
|
||||
/// [`add`]: SuperKmerHeader::add
|
||||
/// [`set_count`]: SuperKmerHeader::set_count
|
||||
/// The count field starts at 1 and accumulates occurrence counts during
|
||||
/// deduplication.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct SuperKmerHeader(u32);
|
||||
|
||||
impl SuperKmerHeader {
|
||||
pub(crate) fn new(seql: u8) -> Self {
|
||||
Self(seql as u32)
|
||||
Self((1 << 8) | seql as u32)
|
||||
}
|
||||
|
||||
fn seql(&self) -> u8 {
|
||||
self.0 as u8
|
||||
}
|
||||
|
||||
// ── scatter phase ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Store the minimizer start position (bits [15:8]).
|
||||
/// Only meaningful during the scatter phase, before [`init_count`].
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.0 = (self.0 & 0xFF) | ((pos as u32) << 8);
|
||||
}
|
||||
|
||||
/// Return the minimizer start position stored during scatter.
|
||||
/// Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn minimizer_pos(&self) -> u8 {
|
||||
(self.0 >> 8) as u8
|
||||
}
|
||||
|
||||
// ── count phase ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Transition from scatter to count phase: set occurrence count to 1.
|
||||
/// Overwrites the minimizer position stored in the payload.
|
||||
fn init_count(&mut self) {
|
||||
self.0 = (self.0 & 0xFF) | (1 << 8);
|
||||
}
|
||||
|
||||
fn count(&self) -> u32 {
|
||||
self.0 >> 8
|
||||
}
|
||||
@@ -95,6 +52,15 @@ impl SuperKmerHeader {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct CountAnnotation {
|
||||
seq_length: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
partition: u32,
|
||||
count: u32,
|
||||
}
|
||||
|
||||
// ── SuperKmer ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence.
|
||||
@@ -127,12 +93,18 @@ impl std::hash::Hash for SuperKmer {
|
||||
impl SuperKmer {
|
||||
/// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
|
||||
pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
Self::build(seql, seq).0
|
||||
}
|
||||
|
||||
/// Construct and canonicalise in place, returning `(sk, already_canonical)`.
|
||||
/// `already_canonical` is `true` when the sequence was not flipped.
|
||||
pub fn build(seql: u8, seq: Box<[u8]>) -> (Self, bool) {
|
||||
let mut sk = Self {
|
||||
header: SuperKmerHeader::new(seql),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
let already_canonical = sk.canonical(); // true = pas retourné
|
||||
(sk, already_canonical)
|
||||
}
|
||||
|
||||
/// Deserialise from a raw 32-bit header word and packed sequence bytes.
|
||||
@@ -141,14 +113,19 @@ impl SuperKmer {
|
||||
let seql = (bits & 0xFF) as u8;
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
let sk = Self {
|
||||
header: SuperKmerHeader(bits),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
debug_assert!(
|
||||
sk.is_canonical(),
|
||||
"SuperKmer deserialised from disk is not canonical"
|
||||
);
|
||||
sk
|
||||
}
|
||||
|
||||
/// Returns the sequence length in nucleotides (1–256).
|
||||
pub fn seql(&self) -> usize {
|
||||
pub fn len(&self) -> usize {
|
||||
stored_to_len(self.header.seql())
|
||||
}
|
||||
|
||||
@@ -172,44 +149,6 @@ impl SuperKmer {
|
||||
self.header.set_count(n);
|
||||
}
|
||||
|
||||
// ── scatter / routing interface ───────────────────────────────────────────
|
||||
|
||||
/// Store the 0-based position of the minimizer's first nucleotide within
|
||||
/// this super-kmer.
|
||||
///
|
||||
/// **Scatter phase only.** Must be called before [`init_count`].
|
||||
/// The position is encoded in the payload field that later holds the
|
||||
/// occurrence count; the two uses are mutually exclusive by pipeline phase.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.header.set_minimizer_pos(pos);
|
||||
}
|
||||
|
||||
/// Return the stored minimizer start position.
|
||||
///
|
||||
/// **Scatter phase only.** Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn minimizer_pos(&self) -> u8 {
|
||||
self.header.minimizer_pos()
|
||||
}
|
||||
|
||||
/// Transition from scatter phase to count phase: set occurrence count to 1.
|
||||
///
|
||||
/// Call this once at routing time. After this call, [`minimizer_pos`] is
|
||||
/// no longer valid and the count methods ([`count`], [`increment`], [`add`],
|
||||
/// [`set_count`]) become meaningful.
|
||||
///
|
||||
/// [`minimizer_pos`]: SuperKmer::minimizer_pos
|
||||
/// [`count`]: SuperKmer::count
|
||||
/// [`increment`]: SuperKmer::increment
|
||||
/// [`add`]: SuperKmer::add
|
||||
/// [`set_count`]: SuperKmer::set_count
|
||||
pub fn init_count(&mut self) {
|
||||
self.header.init_count();
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5' end) as a 2-bit value.
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
@@ -217,7 +156,7 @@ impl SuperKmer {
|
||||
|
||||
/// Reverse-complement this super-kmer in place.
|
||||
pub fn revcomp(&mut self) {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
let n = byte_len(seql);
|
||||
|
||||
// Step 1: swap bytes outside-in, applying revcomp4 to each.
|
||||
@@ -245,8 +184,7 @@ impl SuperKmer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a new SuperKmer.
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a canonical SuperKmer.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(
|
||||
@@ -275,25 +213,26 @@ impl SuperKmer {
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
let seql = self.seql();
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let seql = self.len();
|
||||
let full = seql / 4;
|
||||
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql());
|
||||
self.write_ascii(&mut buf);
|
||||
let mut buf = Vec::with_capacity(self.len());
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -318,7 +257,7 @@ impl SuperKmer {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
if i + k > seql {
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: i,
|
||||
@@ -351,7 +290,7 @@ impl SuperKmer {
|
||||
|
||||
/// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
|
||||
pub fn is_canonical(&self) -> bool {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
for i in 0..seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||
@@ -398,14 +337,18 @@ struct SKKmerIter<'a> {
|
||||
|
||||
impl<'a> SKKmerIter<'a> {
|
||||
fn new(skmer: &'a SuperKmer, k: usize) -> Self {
|
||||
let seql = skmer.seql();
|
||||
let seql = skmer.len();
|
||||
let lshift = 64 - k * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
Self {
|
||||
skmer,
|
||||
mask,
|
||||
lshift,
|
||||
current: if seql >= k { skmer.kmer(0, k).unwrap().raw() } else { 0 },
|
||||
current: if seql >= k {
|
||||
skmer.kmer(0, k).unwrap().raw()
|
||||
} else {
|
||||
0
|
||||
},
|
||||
pos: k,
|
||||
max_pos: seql,
|
||||
}
|
||||
@@ -449,482 +392,6 @@ fn stored_to_len(s: u8) -> usize {
|
||||
if s == 0 { 256 } else { s as usize }
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Repeating ACGT pattern of the given length.
|
||||
fn make_seq(len: usize) -> Vec<u8> {
|
||||
(0..len).map(|i| b"ACGT"[i % 4]).collect()
|
||||
}
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter()
|
||||
.rev()
|
||||
.map(|&b| match b {
|
||||
b'A' => b'T',
|
||||
b'T' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
_ => b'A',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
|
||||
(1..=9).chain([255, 256])
|
||||
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn kmer_first_matches_from_ascii() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmer = sk.kmer(0, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_last_position() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let seql = ascii.len();
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmer = sk.kmer(seql - k, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_all_positions() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for i in 0..=ascii.len() - k {
|
||||
let kmer = sk.kmer(i, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
|
||||
assert_eq!(kmer, expected, "mismatch at position {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_out_of_bounds() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
|
||||
assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_invalid_k() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(0, 0).is_err());
|
||||
assert!(sk.kmer(0, 33).is_err());
|
||||
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
let k = 4;
|
||||
for i in 0..=(sk.seql() - k) {
|
||||
let ck = sk.canonical_kmer(i, k).unwrap();
|
||||
let fwd = sk.kmer(i, k).unwrap();
|
||||
assert_eq!(ck, fwd.canonical(k));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_palindrome_unchanged() {
|
||||
// ACGT is its own reverse complement
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_errors_propagate() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
|
||||
assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
|
||||
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_starts_at_zero() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert_eq!(sk.count(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_adds_one() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_increases_count() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(42);
|
||||
assert_eq!(sk.count(), 42);
|
||||
sk.add(8);
|
||||
assert_eq!(sk.count(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_overwrites() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(100);
|
||||
sk.set_count(7);
|
||||
assert_eq!(sk.count(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.increment();
|
||||
assert_eq!(sk.seql(), len, "increment altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.add(1000);
|
||||
assert_eq!(sk.seql(), len, "add altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_count(999);
|
||||
assert_eq!(sk.seql(), len, "set_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 999);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn count_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_count(16_000_000);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn seql_roundtrip() {
|
||||
for len in all_lengths() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
assert_eq!(sk.seql(), len, "seql() wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seql_256_stored_as_zero() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(256));
|
||||
assert_eq!(sk.header.seql(), 0u8);
|
||||
assert_eq!(sk.seql(), 256);
|
||||
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_bases() {
|
||||
for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')] {
|
||||
let ascii = vec![base; 4];
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), vec![expected; 4]);
|
||||
}
|
||||
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
|
||||
#[test]
|
||||
fn revcomp_known_values() {
|
||||
let cases = [
|
||||
// shift=6
|
||||
("A", "T"),
|
||||
("ACGTA", "TACGT"),
|
||||
// shift=4
|
||||
("AC", "GT"),
|
||||
("ACGTAC", "GTACGT"),
|
||||
// shift=2
|
||||
("ACG", "CGT"),
|
||||
("ACGTACG", "CGTACGT"),
|
||||
// shift=0
|
||||
("ACGT", "ACGT"),
|
||||
("ACGTACGT", "ACGTACGT"),
|
||||
];
|
||||
for (seq, expected) in cases {
|
||||
let mut sk = SuperKmer::from_ascii(seq.as_bytes());
|
||||
sk.revcomp();
|
||||
assert_eq!(
|
||||
sk.to_ascii(),
|
||||
expected.as_bytes(),
|
||||
"revcomp wrong for \"{seq}\""
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_vs_reference_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let expected = ascii_revcomp(&ascii);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_involution_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_palindrome_unchanged() {
|
||||
// ACGT is its own revcomp
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"ACGT");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_forward() {
|
||||
// "AAAA" < "TTTT" → stays as-is
|
||||
let mut sk = SuperKmer::from_ascii(b"AAAA");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_revcomp() {
|
||||
// "TTTT" > "AAAA" → flipped
|
||||
let mut sk = SuperKmer::from_ascii(b"TTTT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_is_minimal_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.canonical();
|
||||
let fwd = sk.to_ascii();
|
||||
let rev = ascii_revcomp(&fwd);
|
||||
assert!(fwd <= rev, "canonical not minimal for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── scatter / routing lifecycle ───────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_roundtrip() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(42);
|
||||
assert_eq!(sk.minimizer_pos(), 42);
|
||||
assert_eq!(sk.seql(), 8, "set_minimizer_pos altered seql");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_boundary_values() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
assert_eq!(sk.minimizer_pos(), 0);
|
||||
sk.set_minimizer_pos(255);
|
||||
assert_eq!(sk.minimizer_pos(), 255);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_resets_to_one_and_enables_counting() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(7);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
sk.add(10);
|
||||
assert_eq!(sk.count(), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_minimizer_pos(0);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.seql(), len, "init_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_minimizer_pos(3);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_count() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in [1usize, 3, 4, 5, 8, 12] {
|
||||
let n = sk.iter_kmers(k).count();
|
||||
assert_eq!(n, ascii.len() - k + 1, "count mismatch for k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_first_is_kmer_0() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in 1..=ascii.len() {
|
||||
let first = sk.iter_kmers(k).next().unwrap();
|
||||
assert_eq!(first, sk.kmer(0, k).unwrap(), "k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_matches_kmer_at_each_position() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_single_when_seql_eq_k() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len();
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 1);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_two_when_seql_eq_k_plus_one() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len() - 1;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 2);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
assert_eq!(kmers[1], sk.kmer(1, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_all_k_values() {
|
||||
// For every valid k, each yielded kmer must match kmer(i, k).
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let seql = ascii.len();
|
||||
for k in 1..=seql {
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), seql - k + 1, "k={k}");
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_crosses_byte_boundary() {
|
||||
// Positions 3→4 and 7→8 cross a 4-nucleotide byte boundary.
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 3;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
for boundary in [3usize, 4, 7, 8] {
|
||||
if boundary + 1 < kmers.len() {
|
||||
assert_eq!(
|
||||
kmers[boundary],
|
||||
sk.kmer(boundary, k).unwrap(),
|
||||
"pos={boundary}"
|
||||
);
|
||||
assert_eq!(
|
||||
kmers[boundary + 1],
|
||||
sk.kmer(boundary + 1, k).unwrap(),
|
||||
"pos={}",
|
||||
boundary + 1
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_k1_yields_all_nucleotides() {
|
||||
let ascii = b"ACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(1).collect();
|
||||
assert_eq!(kmers.len(), 4);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_long_sequence() {
|
||||
let ascii = make_seq(20);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
let k = 7;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
#[path = "tests/superkmer.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -0,0 +1,425 @@
|
||||
use super::*;
|
||||
|
||||
/// Build a `len`-byte sequence by cycling through `A`, `C`, `G`, `T`.
fn make_seq(len: usize) -> Vec<u8> {
    b"ACGT".iter().copied().cycle().take(len).collect()
}
|
||||
|
||||
/// Reference reverse complement on ASCII bytes.
///
/// # Panics
/// Panics on any byte other than `A`, `C`, `G` or `T`, so that a corrupt
/// decoded sequence fails the test instead of being silently mapped to a
/// valid base.
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
    seq.iter()
        .rev()
        .map(|&base| match base {
            b'A' => b'T',
            b'C' => b'G',
            b'G' => b'C',
            b'T' => b'A',
            other => panic!("ascii_revcomp: unexpected byte {:#04x}", other),
        })
        .collect()
}
|
||||
|
||||
/// Lengths exercised by the parametrised tests: 1 through 9, then 255 and 256.
fn all_lengths() -> impl Iterator<Item = usize> {
    (1..=256).filter(|&len| len <= 9 || len >= 255)
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn kmer_first_matches_from_ascii() {
    // ACGTACGT equals its own reverse complement, so the canonicalisation done
    // by `from_ascii` keeps the input order and positions compare directly.
    const K: usize = 4;
    let text = b"ACGTACGT";
    let expected = crate::kmer::Kmer::from_ascii(&text[..K], K).unwrap();
    let kmer = SuperKmer::from_ascii(text).kmer(0, K).unwrap();
    assert_eq!(kmer, expected);
}
|
||||
|
||||
#[test]
fn kmer_last_position() {
    // The last k-mer starts at `len - k`; the input is its own reverse
    // complement, so `from_ascii` does not reorder it.
    let text = b"ACGTACGT";
    let k = 4;
    let start = text.len() - k;
    let kmer = SuperKmer::from_ascii(text).kmer(start, k).unwrap();
    let expected = crate::kmer::Kmer::from_ascii(&text[start..], k).unwrap();
    assert_eq!(kmer, expected);
}
|
||||
|
||||
#[test]
fn kmer_all_positions() {
    // Every window of the ASCII text must match the k-mer extracted at the
    // same offset from the packed super-kmer.
    let k = 4;
    let text = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(text);
    for (i, window) in text.windows(k).enumerate() {
        let expected = crate::kmer::Kmer::from_ascii(window, k).unwrap();
        let kmer = sk.kmer(i, k).unwrap();
        assert_eq!(kmer, expected, "mismatch at position {i}");
    }
}
|
||||
|
||||
#[test]
fn kmer_out_of_bounds() {
    let sk = SuperKmer::from_ascii(b"ACGT");
    // (start, k) pairs whose window runs past the 4-nucleotide sequence.
    for (start, k) in [(2, 4), (4, 1)] {
        assert!(sk.kmer(start, k).is_err());
    }
}
|
||||
|
||||
#[test]
fn kmer_invalid_k() {
    let sk = SuperKmer::from_ascii(b"ACGT");
    assert!(sk.kmer(0, 0).is_err());
    assert!(sk.kmer(0, 33).is_err());

    // On a 4-base super-kmer, k = 33 is also out of bounds, so the assertion above cannot
    // tell an invalid k from an out-of-range window. Repeat it on a sequence long enough to
    // hold 33 bases so that only the k validation can reject the request.
    let long = SuperKmer::from_ascii(&make_seq(40));
    assert!(long.kmer(0, 33).is_err());
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
    let k = 4;
    let sk = SuperKmer::from_ascii(b"ACGTACGT");
    let last_start = sk.len() - k;
    // canonical_kmer(i, k) must agree with canonicalising the forward k-mer at i.
    (0..=last_start).for_each(|i| {
        let ck = sk.canonical_kmer(i, k).unwrap();
        let fwd = sk.kmer(i, k).unwrap();
        assert_eq!(ck, fwd.canonical(k));
    });
}
|
||||
|
||||
#[test]
fn canonical_kmer_palindrome_unchanged() {
    // ACGT equals its own reverse complement, so the canonical and forward k-mers coincide.
    let k = 4;
    let sk = SuperKmer::from_ascii(b"ACGT");
    let canonical = sk.canonical_kmer(0, k).unwrap();
    let forward = sk.kmer(0, k).unwrap();
    assert_eq!(canonical, forward);
}
|
||||
|
||||
#[test]
fn canonical_kmer_tttt_becomes_aaaa() {
    // The canonical 4-mer of TTTT is expected to be AAAA.
    let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
    let sk = SuperKmer::from_ascii(b"TTTT");
    assert_eq!(sk.canonical_kmer(0, 4).unwrap(), expected);
}
|
||||
|
||||
#[test]
fn canonical_kmer_errors_propagate() {
    let sk = SuperKmer::from_ascii(b"ACGT");
    // (2, 4) is out of bounds; (0, 0) is an invalid k.
    let bad_requests = [(2, 4), (0, 0)];
    for (start, k) in bad_requests {
        assert!(sk.canonical_kmer(start, k).is_err());
    }
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn count_starts_at_one() {
    // A freshly built super-kmer reports a count of 1.
    let fresh = SuperKmer::from_ascii(b"ACGT");
    let initial = fresh.count();
    assert_eq!(initial, 1);
}
|
||||
|
||||
#[test]
fn increment_adds_one() {
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    // Starting from 1, each increment() call raises the count by exactly one.
    for expected in [2, 3] {
        sk.increment();
        assert_eq!(sk.count(), expected);
    }
}
|
||||
|
||||
#[test]
fn add_increases_count() {
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    // (amount added, running total expected afterwards), starting from a count of 1.
    for (delta, total) in [(42, 43), (8, 51)] {
        sk.add(delta);
        assert_eq!(sk.count(), total);
    }
}
|
||||
|
||||
#[test]
fn set_count_overwrites() {
    // set_count() replaces whatever was accumulated before instead of adding to it.
    let mut sk = SuperKmer::from_ascii(b"ACGT");
    sk.add(100);
    sk.set_count(7);
    let stored = sk.count();
    assert_eq!(stored, 7);
}
|
||||
|
||||
#[test]
fn increment_preserves_seql() {
    // Bumping the count must leave the reported length untouched at every tested size.
    all_lengths().for_each(|len| {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.increment();
        assert_eq!(sk.len(), len, "increment altered seql for len={len}");
    });
}
|
||||
|
||||
#[test]
fn add_preserves_seql() {
    // Adding to the count must leave the reported length untouched at every tested size.
    all_lengths().for_each(|len| {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.add(1000);
        assert_eq!(sk.len(), len, "add altered seql for len={len}");
    });
}
|
||||
|
||||
#[test]
fn set_count_preserves_seql() {
    // Overwriting the count must keep the length intact and store the new count.
    all_lengths().for_each(|len| {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.set_count(999);
        assert_eq!(sk.len(), len, "set_count altered seql for len={len}");
        assert_eq!(sk.count(), 999);
    });
}
|
||||
|
||||
#[test]
fn count_does_not_affect_sequence() {
    // A large count must not disturb the decoded nucleotides.
    let ascii = b"ACGTACGT".to_vec();
    let mut sk = SuperKmer::from_ascii(&ascii);
    sk.set_count(16_000_000);
    let decoded = sk.to_ascii();
    assert_eq!(decoded, ascii);
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn seql_roundtrip() {
    // len() must report the number of nucleotides the super-kmer was built from.
    all_lengths().for_each(|len| {
        let sk = SuperKmer::from_ascii(&make_seq(len));
        assert_eq!(sk.len(), len, "seql() wrong for len={len}");
    });
}
|
||||
|
||||
#[test]
fn seql_256_stored_as_zero() {
    // A 256-base super-kmer keeps 0 in the header's seql byte while len() still reports 256.
    let longest = SuperKmer::from_ascii(&make_seq(256));
    let raw_seql = longest.header.seql();
    assert_eq!(raw_seql, 0u8);
    assert_eq!(longest.len(), 256);
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_lengths() {
    // Encoding then decoding must return the input bytes at every tested length.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let decoded = SuperKmer::from_ascii(&ascii).to_ascii();
        assert_eq!(decoded, ascii, "roundtrip failed for len={len}");
    });
}
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_bases() {
    // Canonical form: min(seq, revcomp). G×4 flips to C×4, T×4 flips to A×4.
    let cases = [(b'A', b'A'), (b'C', b'C'), (b'G', b'C'), (b'T', b'A')];
    for (base, expected) in cases {
        let homopolymer = vec![base; 4];
        let decoded = SuperKmer::from_ascii(&homopolymer).to_ascii();
        assert_eq!(decoded, vec![expected; 4]);
    }
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
#[test]
fn revcomp_known_values() {
    // Reverse-complements `seq` in place and compares the decoded result with `expected`.
    let check = |seq: &str, expected: &str| {
        let mut sk = SuperKmer::from_ascii(seq.as_bytes());
        sk.revcomp();
        assert_eq!(
            sk.to_ascii(),
            expected.as_bytes(),
            "revcomp wrong for \"{seq}\""
        );
    };

    // shift=6
    check("A", "T");
    check("ACGTA", "TACGT");
    // shift=4
    check("AC", "GT");
    check("ACGTAC", "GTACGT");
    // shift=2
    check("ACG", "CGT");
    check("ACGTACG", "CGTACGT");
    // shift=0
    check("ACGT", "ACGT");
    check("ACGTACGT", "ACGTACGT");
}
|
||||
|
||||
#[test]
fn revcomp_vs_reference_all_lengths() {
    // The packed revcomp must agree with the ASCII reference implementation at every length.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let expected = ascii_revcomp(&ascii);
        let mut sk = SuperKmer::from_ascii(&ascii);
        sk.revcomp();
        assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
    });
}
|
||||
|
||||
#[test]
fn revcomp_involution_all_lengths() {
    // Applying revcomp twice must give back the original sequence.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut sk = SuperKmer::from_ascii(&ascii);
        for _ in 0..2 {
            sk.revcomp();
        }
        assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
    });
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_palindrome_unchanged() {
    // ACGT is its own reverse complement, so canonicalising it changes nothing.
    let mut palindrome = SuperKmer::from_ascii(b"ACGT");
    palindrome.canonical();
    let decoded = palindrome.to_ascii();
    assert_eq!(decoded, b"ACGT");
}
|
||||
|
||||
#[test]
fn canonical_chooses_forward() {
    // "AAAA" < "TTTT", so the forward strand is already the canonical one.
    let mut forward = SuperKmer::from_ascii(b"AAAA");
    forward.canonical();
    let decoded = forward.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_chooses_revcomp() {
    // "TTTT" > "AAAA", so the canonical form is the reverse complement.
    let mut reverse = SuperKmer::from_ascii(b"TTTT");
    reverse.canonical();
    let decoded = reverse.to_ascii();
    assert_eq!(decoded, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_is_minimal_all_lengths() {
    // After canonical(), the decoded sequence must not compare greater than its own
    // ASCII reverse complement.
    all_lengths().for_each(|len| {
        let mut sk = SuperKmer::from_ascii(&make_seq(len));
        sk.canonical();
        let fwd = sk.to_ascii();
        let rev = ascii_revcomp(&fwd);
        assert!(fwd <= rev, "canonical not minimal for len={len}");
    });
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn iter_kmers_count() {
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    // A sequence of length L yields L - k + 1 k-mers.
    [1usize, 3, 4, 5, 8, 12].iter().for_each(|&k| {
        let n = sk.iter_kmers(k).count();
        assert_eq!(n, ascii.len() - k + 1, "count mismatch for k={k}");
    });
}
|
||||
|
||||
#[test]
fn iter_kmers_first_is_kmer_0() {
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let longest_k = ascii.len();
    // For every k that fits, the iterator's first item is the k-mer at offset 0.
    (1..=longest_k).for_each(|k| {
        let first = sk.iter_kmers(k).next().unwrap();
        assert_eq!(first, sk.kmer(0, k).unwrap(), "k={k}");
    });
}
|
||||
|
||||
#[test]
fn iter_kmers_matches_kmer_at_each_position() {
    let k = 4;
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    // The i-th yielded k-mer must be the one random access returns for offset i.
    for (i, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_single_when_seql_eq_k() {
    // With k equal to the sequence length there is exactly one k-mer: the whole sequence.
    let ascii = b"ACGTACGT";
    let k = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
    assert_eq!(kmers.len(), 1);
    assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
}
|
||||
|
||||
#[test]
fn iter_kmers_two_when_seql_eq_k_plus_one() {
    // With k one short of the sequence length there are exactly two overlapping k-mers.
    let ascii = b"ACGTACGT";
    let k = ascii.len() - 1;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
    assert_eq!(kmers.len(), 2);
    for pos in 0..2 {
        assert_eq!(kmers[pos], sk.kmer(pos, k).unwrap());
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_all_k_values() {
    // For every valid k, each yielded kmer must match kmer(i, k).
    let ascii = b"ACGTACGTACGT";
    let seql = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    (1..=seql).for_each(|k| {
        let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
        assert_eq!(kmers.len(), seql - k + 1, "k={k}");
        for (i, km) in kmers.iter().copied().enumerate() {
            assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
        }
    });
}
|
||||
|
||||
#[test]
fn iter_kmers_crosses_byte_boundary() {
    // Positions 3→4 and 7→8 cross a 4-nucleotide byte boundary.
    let k = 3;
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
    for boundary in [3usize, 4, 7, 8] {
        // Skip a boundary whose right-hand neighbour is not a valid k-mer start.
        if boundary + 1 >= kmers.len() {
            continue;
        }
        // Check the k-mer at the boundary, then the one right after it.
        for pos in [boundary, boundary + 1] {
            assert_eq!(kmers[pos], sk.kmer(pos, k).unwrap(), "pos={}", pos);
        }
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_k1_yields_all_nucleotides() {
    // With k = 1 every nucleotide is its own k-mer.
    let ascii = b"ACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(1).collect();
    assert_eq!(kmers.len(), 4);
    for (i, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_long_sequence() {
    // 20 bases span five packed bytes; k = 7 makes every k-mer straddle at least two of them.
    let k = 7;
    let ascii = make_seq(20);
    let sk = SuperKmer::from_ascii(&ascii);
    let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (i, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
    }
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
|
||||
//! length cap and without the scatter/count header payload.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
@@ -101,23 +103,24 @@ impl Unitig {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
/// Decode into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let full = self.seql / 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = self.seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql);
|
||||
self.write_ascii(&mut buf);
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
|
||||
@@ -16,12 +16,12 @@
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
|
||||
/// Iterator over `(minimizer_hash, SuperKmer)` pairs.
|
||||
/// Iterator over [`RoutableSuperKmer`] values.
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
@@ -60,26 +60,19 @@ impl<'a> SuperKmerIter<'a> {
|
||||
self.prev_min_pos = 0;
|
||||
}
|
||||
|
||||
fn try_emit(&mut self) -> Option<SuperKmer> {
|
||||
fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
|
||||
if self.scratch.len() < self.k {
|
||||
return None;
|
||||
}
|
||||
let min = self.prev_min?;
|
||||
let mut sk = self.scratch.emit();
|
||||
let min_pos = if sk.canonical() {
|
||||
self.prev_min_pos
|
||||
} else {
|
||||
sk.seql() - self.m - self.prev_min_pos
|
||||
};
|
||||
sk.set_minimizer_pos(min_pos as u8);
|
||||
Some(sk)
|
||||
self.prev_min?;
|
||||
Some(self.scratch.emit(self.prev_min_pos, self.m))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SuperKmerIter<'_> {
|
||||
type Item = SuperKmer;
|
||||
type Item = RoutableSuperKmer;
|
||||
|
||||
fn next(&mut self) -> Option<SuperKmer> {
|
||||
fn next(&mut self) -> Option<RoutableSuperKmer> {
|
||||
loop {
|
||||
let byte = match self.cursor.read_next().ok() {
|
||||
None => {
|
||||
@@ -164,7 +157,7 @@ mod tests {
|
||||
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
||||
let rope = make_rope(data);
|
||||
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -205,7 +198,7 @@ mod tests {
|
||||
|
||||
let rope = make_rope(b"AAAAAAAAAAAAAAAAAAAA\x00");
|
||||
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 6, 0.9)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(out_reject.is_empty());
|
||||
}
|
||||
@@ -218,7 +211,7 @@ mod tests {
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(!out.is_empty());
|
||||
}
|
||||
@@ -226,7 +219,7 @@ mod tests {
|
||||
#[test]
fn yields_minimizer_value() {
    // Only checks that at least one routable super-kmer is produced for this input.
    let rope = make_rope(b"ACGTACGTACGTACGTACGT\x00");
    let iter = SuperKmerIter::new(&rope, K, M, 1, 0.0);
    let results: Vec<RoutableSuperKmer> = iter.collect();
    let produced = results.len();
    assert!(produced != 0);
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,9 @@ pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
|
||||
use obikrope::Rope;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Collect all super-kmers from a normalised rope chunk.
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<SuperKmer> {
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<RoutableSuperKmer> {
|
||||
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
||||
|
||||
use crate::encoding::{BYTE_LEN_MAX, encode_nuc};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
||||
pub const MAX_SUPERKMER_LEN: usize = 256;
|
||||
@@ -56,16 +56,15 @@ impl SuperKmerScratch {
|
||||
///
|
||||
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
|
||||
/// Resets the buffer to empty afterward.
|
||||
pub fn emit(&mut self) -> SuperKmer {
|
||||
pub fn emit(&mut self, min_pos: usize, m: usize) -> RoutableSuperKmer {
|
||||
let seql = self.len;
|
||||
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
|
||||
let n = (seql + 3) / 4;
|
||||
let seq: Box<[u8]> = self.buf[..n].into();
|
||||
self.buf[..n].fill(0);
|
||||
self.len = 0;
|
||||
SuperKmer::new(seql as u8, seq)
|
||||
RoutableSuperKmer::build(min_pos, m, seql as u8, seq)
|
||||
}
|
||||
|
||||
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
|
||||
pub fn reset(&mut self) {
|
||||
let n = (self.len + 3) / 4;
|
||||
|
||||
+33
-15
@@ -2,17 +2,25 @@ use obikseq::superkmer::SuperKmer;
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Serialise one SuperKmer into `w` (uncompressed; caller must wrap with a compressor).
|
||||
///
|
||||
/// Bits [7:0] of the header store `n_kmers = seql - k + 1` (kmer units, 1–255),
|
||||
/// not the raw nucleotide length. This removes the 0=256 wrapping convention.
|
||||
#[inline]
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer) -> io::Result<()> {
|
||||
w.write_all(&sk.header_bits().to_le_bytes())?;
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer, k: usize) -> io::Result<()> {
|
||||
let n_kmers = sk.len() - k + 1;
|
||||
let new_bits = (sk.header_bits() & !0xFF) | (n_kmers as u32);
|
||||
w.write_all(&new_bits.to_le_bytes())?;
|
||||
w.write_all(sk.seq_bytes())
|
||||
}
|
||||
|
||||
/// Deserialise one SuperKmer from `r`. Returns `None` on clean EOF.
|
||||
/// `seq_buf` is a reusable scratch buffer to avoid per-record allocation.
|
||||
/// Bits [7:0] of the on-disk header contain `n_kmers`; nucleotide length is
|
||||
/// reconstructed as `n_kmers + k - 1`.
|
||||
pub(crate) fn read_superkmer<R: Read>(
|
||||
r: &mut R,
|
||||
seq_buf: &mut Vec<u8>,
|
||||
k: usize,
|
||||
) -> io::Result<Option<SuperKmer>> {
|
||||
let mut hdr = [0u8; 4];
|
||||
match r.read_exact(&mut hdr) {
|
||||
@@ -21,12 +29,18 @@ pub(crate) fn read_superkmer<R: Read>(
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
let bits = u32::from_le_bytes(hdr);
|
||||
let seql_byte = (bits & 0xFF) as u8;
|
||||
let nt_len: usize = if seql_byte == 0 { 256 } else { seql_byte as usize };
|
||||
let n_kmers = (bits & 0xFF) as usize;
|
||||
let nt_len = n_kmers + k - 1;
|
||||
let byte_len = (nt_len + 3) / 4;
|
||||
seq_buf.resize(byte_len, 0);
|
||||
r.read_exact(seq_buf)?;
|
||||
Ok(Some(SuperKmer::from_header_bits(bits, seq_buf.as_slice().into())))
|
||||
// Reconstruct the in-memory seql byte (0 encodes 256, 1-255 direct).
|
||||
let seql_byte = if nt_len == 256 { 0u8 } else { nt_len as u8 };
|
||||
let mem_bits = (bits & !0xFF) | (seql_byte as u32);
|
||||
Ok(Some(SuperKmer::from_header_bits(
|
||||
mem_bits,
|
||||
seq_buf.as_slice().into(),
|
||||
)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -40,28 +54,31 @@ mod tests {
|
||||
|
||||
#[test]
fn roundtrip_single() {
    // One record written with a given k must read back with the same bases and length.
    let k = 4;
    let original = make_sk(b"ACGTACGT");
    let mut encoded = Vec::new();
    write_superkmer(&mut encoded, &original, k).unwrap();

    let mut scratch = Vec::new();
    let mut reader = Cursor::new(&encoded);
    let decoded = read_superkmer(&mut reader, &mut scratch, k)
        .unwrap()
        .unwrap();
    assert_eq!(original.to_ascii(), decoded.to_ascii());
    assert_eq!(original.len(), decoded.len());
}
|
||||
|
||||
#[test]
fn roundtrip_all_lengths() {
    // k=11 is the project minimum; test seql from k to 256.
    let k = 11;
    let bases: Vec<u8> = b"ACGT".iter().copied().cycle().take(256).collect();
    let lengths = (k..=k + 8).chain([255, 256]);
    for len in lengths {
        let original = make_sk(&bases[..len]);
        let mut encoded = Vec::new();
        write_superkmer(&mut encoded, &original, k).unwrap();

        let mut scratch = Vec::new();
        let mut reader = Cursor::new(&encoded);
        let decoded = read_superkmer(&mut reader, &mut scratch, k)
            .unwrap()
            .unwrap();
        assert_eq!(original.to_ascii(), decoded.to_ascii(), "len={len}");
    }
}
|
||||
@@ -71,24 +88,25 @@ mod tests {
|
||||
let buf: Vec<u8> = vec![];
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf, 4).unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
fn multiple_records() {
    // Several records written back to back must be read in order, followed by a clean EOF.
    let k = 4;
    let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];
    let mut encoded = Vec::new();
    seqs.iter()
        .for_each(|s| write_superkmer(&mut encoded, &make_sk(s), k).unwrap());

    let mut reader = Cursor::new(&encoded);
    let mut scratch = Vec::new();
    for s in seqs {
        let decoded = read_superkmer(&mut reader, &mut scratch, k)
            .unwrap()
            .unwrap();
        let expected = make_sk(s);
        assert_eq!(expected.to_ascii(), decoded.to_ascii());
    }
    let trailing = read_superkmer(&mut reader, &mut scratch, k).unwrap();
    assert!(trailing.is_none());
}
|
||||
}
|
||||
|
||||
+63
-35
@@ -3,8 +3,8 @@ use crate::error::SKResult;
|
||||
use crate::limits::max_concurrent_files;
|
||||
use crate::meta::SKFileMeta;
|
||||
use lru::LruCache;
|
||||
use niffler::send::compression::Format;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufWriter, Write};
|
||||
@@ -79,7 +79,11 @@ impl SKFilePool {
|
||||
/// Create a pool allowing at most `max_open` simultaneously open fds.
|
||||
pub fn new(max_open: usize) -> Self {
|
||||
let cap = NonZeroUsize::new(max_open.max(1)).unwrap();
|
||||
Self { max_open, entries: Vec::new(), open: LruCache::new(cap) }
|
||||
Self {
|
||||
max_open,
|
||||
entries: Vec::new(),
|
||||
open: LruCache::new(cap),
|
||||
}
|
||||
}
|
||||
|
||||
/// Derive pool size from the OS fd limit (75 %, clamped to `[16, MAX_POOL_SIZE]`).
|
||||
@@ -218,6 +222,7 @@ pub struct SKFileWriter {
|
||||
id: usize,
|
||||
pool: Arc<Mutex<SKFilePool>>,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
pending: Vec<u8>,
|
||||
flush_threshold: usize,
|
||||
logically_closed: bool,
|
||||
@@ -225,14 +230,15 @@ pub struct SKFileWriter {
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file (Zstd, level 3).
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, Format::Zstd, Level::Three)
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf, k: usize) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file with explicit format and level.
|
||||
pub fn create_token_with(
|
||||
pool: &SharedPool,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<SKFileWriter> {
|
||||
@@ -241,6 +247,7 @@ pub fn create_token_with(
|
||||
id,
|
||||
pool: Arc::clone(pool),
|
||||
path,
|
||||
k,
|
||||
pending: Vec::with_capacity(DEFAULT_FLUSH_THRESHOLD + 128),
|
||||
flush_threshold: DEFAULT_FLUSH_THRESHOLD,
|
||||
logically_closed: false,
|
||||
@@ -251,13 +258,18 @@ pub fn create_token_with(
|
||||
impl SKFileWriter {
|
||||
/// Create a standalone file writer (Zstd, level 3).
|
||||
/// The pool is created internally and is not accessible to the caller.
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
Self::create_with(path, Format::Zstd, Level::Three)
|
||||
pub fn create<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
Self::create_with(path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a standalone file writer with explicit format and level.
|
||||
pub fn create_with<P: AsRef<Path>>(path: P, format: Format, level: Level) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), format, level)
|
||||
pub fn create_with<P: AsRef<Path>>(
|
||||
path: P,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), k, format, level)
|
||||
}
|
||||
|
||||
/// `true` if the underlying fd is currently open in the pool.
|
||||
@@ -268,10 +280,10 @@ impl SKFileWriter {
|
||||
/// Accumulate one SuperKmer. Drains to fd when `pending ≥ flush_threshold`.
|
||||
pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -282,10 +294,10 @@ impl SKFileWriter {
|
||||
pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -339,7 +351,10 @@ impl SKFileWriter {
|
||||
}
|
||||
|
||||
if !self.pending.is_empty() {
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
self.pending.clear();
|
||||
}
|
||||
if let Some(mut w) = fd_guard.take() {
|
||||
@@ -400,7 +415,10 @@ impl SKFileWriter {
|
||||
fd_guard = fd_arc.lock().unwrap(); // acquire fd lock under pool lock
|
||||
// pool drops here → pool lock released, fd lock still held
|
||||
}
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
// fd_guard drops → entry fd lock released
|
||||
self.pending.clear();
|
||||
Ok(())
|
||||
@@ -424,6 +442,8 @@ mod tests {
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use tempfile::{NamedTempFile, TempDir};
|
||||
|
||||
const TEST_K: usize = 4;
|
||||
|
||||
fn make_sk(seed: usize) -> SuperKmer {
|
||||
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(seed + j) % 4]).collect();
|
||||
SuperKmer::from_ascii(&bases)
|
||||
@@ -443,7 +463,7 @@ mod tests {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(3);
|
||||
for i in 0..10 {
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap();
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap();
|
||||
}
|
||||
assert_eq!(p.lock().unwrap().open_count(), 0);
|
||||
}
|
||||
@@ -455,14 +475,18 @@ mod tests {
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> = (0..6)
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap())
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for t in tokens.iter_mut() {
|
||||
open_token(t, &sk);
|
||||
}
|
||||
|
||||
assert!(p.lock().unwrap().open_count() <= 3, "open={}", p.lock().unwrap().open_count());
|
||||
assert!(
|
||||
p.lock().unwrap().open_count() <= 3,
|
||||
"open={}",
|
||||
p.lock().unwrap().open_count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -471,8 +495,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 fd open, pool full
|
||||
open_token(&mut t1, &sk); // evicts t0, t1 fd open
|
||||
@@ -487,8 +511,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
t0.set_flush_threshold(1);
|
||||
t0.write(&sk).unwrap(); // t0 fd open, pool full
|
||||
@@ -504,7 +528,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for name in &["a.zst", "b.zst"] {
|
||||
let mut r = SKFileReader::open(dir.path().join(name)).unwrap();
|
||||
let mut r = SKFileReader::open(dir.path().join(name), TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1, "{name}: expected 1 record");
|
||||
}
|
||||
@@ -516,9 +540,9 @@ mod tests {
|
||||
let p = pool(2);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 open
|
||||
open_token(&mut t1, &sk); // t1 open, t0 LRU
|
||||
@@ -538,10 +562,14 @@ mod tests {
|
||||
fn close_all_produces_readable_files() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(8);
|
||||
let paths: Vec<_> = (0..4).map(|i| dir.path().join(format!("{i}.zst"))).collect();
|
||||
let paths: Vec<_> = (0..4)
|
||||
.map(|i| dir.path().join(format!("{i}.zst")))
|
||||
.collect();
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> =
|
||||
paths.iter().map(|path| create_token(&p, path.clone()).unwrap()).collect();
|
||||
let mut tokens: Vec<SKFileWriter> = paths
|
||||
.iter()
|
||||
.map(|path| create_token(&p, path.clone(), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for (i, t) in tokens.iter_mut().enumerate() {
|
||||
t.write(&make_sk(i)).unwrap();
|
||||
@@ -553,7 +581,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for path in &paths {
|
||||
let mut r = SKFileReader::open(path).unwrap();
|
||||
let mut r = SKFileReader::open(path, TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1);
|
||||
}
|
||||
@@ -566,11 +594,11 @@ mod tests {
|
||||
let sks: Vec<_> = (0..50).map(make_sk).collect();
|
||||
let path = dir.path().join("batch.zst");
|
||||
|
||||
let mut t = create_token(&p, path.clone()).unwrap();
|
||||
let mut t = create_token(&p, path.clone(), TEST_K).unwrap();
|
||||
t.write_batch(&sks).unwrap();
|
||||
t.close().unwrap();
|
||||
|
||||
let mut r = SKFileReader::open(&path).unwrap();
|
||||
let mut r = SKFileReader::open(&path, TEST_K).unwrap();
|
||||
let got = r.read_batch(100).unwrap();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -590,11 +618,11 @@ mod tests {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let sks: Vec<_> = (0..100).map(make_sk).collect();
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
w.close().unwrap();
|
||||
}
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got = r.read_batch(200).unwrap();
|
||||
assert_eq!(got.len(), 100);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -605,7 +633,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_close_prevents_write() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.close().unwrap();
|
||||
assert!(!w.is_open());
|
||||
assert!(w.write(&make_sk(0)).is_err());
|
||||
@@ -614,7 +642,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_is_physically_open() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
assert!(!w.is_physically_open()); // fd deferred until first drain
|
||||
w.set_flush_threshold(1);
|
||||
w.write(&make_sk(0)).unwrap(); // triggers drain → fd opened
|
||||
|
||||
@@ -15,6 +15,7 @@ use std::path::{Path, PathBuf};
|
||||
/// that it can fast-forward on next open.
|
||||
pub struct SKFileReader {
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
reader: Option<Box<dyn std::io::Read + Send>>,
|
||||
/// Reusable scratch buffer for the `seq` bytes of each record.
|
||||
seq_buf: Vec<u8>,
|
||||
@@ -24,11 +25,13 @@ pub struct SKFileReader {
|
||||
|
||||
impl SKFileReader {
|
||||
/// Open a file for reading. Format is auto-detected from magic bytes.
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
/// `k` is the kmer size of the partition; required to decode the on-disk n_kmers field.
|
||||
pub fn open<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
let path = path.as_ref().to_owned();
|
||||
let (reader, _fmt) = niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
|
||||
Ok(Self {
|
||||
path,
|
||||
k,
|
||||
reader: Some(reader),
|
||||
seq_buf: Vec::with_capacity(64),
|
||||
consumed: 0,
|
||||
@@ -43,7 +46,7 @@ impl SKFileReader {
|
||||
"read from physically closed SKFileReader",
|
||||
)
|
||||
})?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf)?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf, self.k)?;
|
||||
if result.is_some() {
|
||||
self.consumed += 1;
|
||||
}
|
||||
@@ -100,7 +103,7 @@ impl SKFileReader {
|
||||
let target = self.consumed;
|
||||
self.consumed = 0;
|
||||
for _ in 0..target {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf)? {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf, self.k)? {
|
||||
Some(_) => self.consumed += 1,
|
||||
None => break,
|
||||
}
|
||||
@@ -147,6 +150,8 @@ mod tests {
|
||||
use crate::pool::SKFileWriter;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
const TEST_K: usize = 4; // test sequences are 8 bases; k=4 gives n_kmers=5
|
||||
|
||||
fn make_sks(n: usize) -> Vec<SuperKmer> {
|
||||
(0..n)
|
||||
.map(|i| {
|
||||
@@ -162,11 +167,11 @@ mod tests {
|
||||
let sks = make_sks(50);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got: Vec<_> = r.iter().collect();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -180,11 +185,11 @@ mod tests {
|
||||
let sks = make_sks(20);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
// Read 10, then simulate pool eviction + re-access
|
||||
let first = r.read_batch(10).unwrap();
|
||||
r.close();
|
||||
|
||||
Reference in New Issue
Block a user