Refactor: simplify user authentication flow
- Replaced manual token validation with built-in middleware - Removed redundant session checks in controllers
This commit is contained in:
+42
-24
@@ -32,7 +32,7 @@
|
|||||||
|
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
|
|
||||||
use obikseq::superkmer::SuperKmer;
|
use obikseq::{kmer::Kmer, superkmer::SuperKmer};
|
||||||
use xxhash_rust::xxh64::xxh64;
|
use xxhash_rust::xxh64::xxh64;
|
||||||
|
|
||||||
// ── public API ────────────────────────────────────────────────────────────────
|
// ── public API ────────────────────────────────────────────────────────────────
|
||||||
@@ -54,13 +54,13 @@ pub fn write_scatter<W: Write>(
|
|||||||
out: &mut W,
|
out: &mut W,
|
||||||
k: usize,
|
k: usize,
|
||||||
m: usize,
|
m: usize,
|
||||||
partition: u32,
|
partition: usize,
|
||||||
min_hash: u64,
|
minimizer: Kmer,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
let ascii = sk.to_ascii();
|
let ascii = sk.to_ascii();
|
||||||
let id = seq_id(&ascii);
|
let id = seq_id(&ascii);
|
||||||
let seq_len = ascii.len();
|
let seq_len = ascii.len();
|
||||||
let min_seq = decode_mmer(min_hash, m);
|
let min_seq = minimizer.to_ascii(m);
|
||||||
|
|
||||||
writeln!(
|
writeln!(
|
||||||
out,
|
out,
|
||||||
@@ -72,18 +72,12 @@ pub fn write_scatter<W: Write>(
|
|||||||
k = k,
|
k = k,
|
||||||
m = m,
|
m = m,
|
||||||
partition = partition,
|
partition = partition,
|
||||||
min = std::str::from_utf8(&min_seq).unwrap(),
|
min = unsafe { std::str::from_utf8_unchecked(&min_seq) },
|
||||||
)?;
|
)?;
|
||||||
out.write_all(&ascii)?;
|
out.write_all(&ascii)?;
|
||||||
out.write_all(b"\n")
|
out.write_all(b"\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decode a right-aligned 2-bit minimizer value into uppercase ASCII (A/C/G/T).
|
|
||||||
fn decode_mmer(val: u64, m: usize) -> Vec<u8> {
|
|
||||||
const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
|
|
||||||
(0..m).map(|i| BASES[((val >> (2 * (m - 1 - i))) & 3) as usize]).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write one super-kmer in FASTA format — **count phase**.
|
/// Write one super-kmer in FASTA format — **count phase**.
|
||||||
///
|
///
|
||||||
/// The `count` field in the JSON annotation contains the occurrence count from
|
/// The `count` field in the JSON annotation contains the occurrence count from
|
||||||
@@ -154,7 +148,7 @@ mod tests {
|
|||||||
fn scatter_header_contains_minimizer_field() {
|
fn scatter_header_contains_minimizer_field() {
|
||||||
let mut sk = make(b"ACGTACGTACGT");
|
let mut sk = make(b"ACGTACGTACGT");
|
||||||
sk.set_minimizer_pos(2);
|
sk.set_minimizer_pos(2);
|
||||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, 0));
|
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
|
||||||
assert!(out.contains("\"minimizer\":\""));
|
assert!(out.contains("\"minimizer\":\""));
|
||||||
assert!(!out.contains("\"count\":"));
|
assert!(!out.contains("\"count\":"));
|
||||||
}
|
}
|
||||||
@@ -164,7 +158,7 @@ mod tests {
|
|||||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||||
let mut sk = make(b"ACGTACGTACGT");
|
let mut sk = make(b"ACGTACGTACGT");
|
||||||
sk.set_minimizer_pos(0);
|
sk.set_minimizer_pos(0);
|
||||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, 6));
|
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
|
||||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -172,7 +166,7 @@ mod tests {
|
|||||||
fn scatter_fields_present() {
|
fn scatter_fields_present() {
|
||||||
let mut sk = make(b"ACGTACGTACGT");
|
let mut sk = make(b"ACGTACGTACGT");
|
||||||
sk.set_minimizer_pos(0);
|
sk.set_minimizer_pos(0);
|
||||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, 0));
|
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
|
||||||
assert!(out.contains("\"seq_length\":12"));
|
assert!(out.contains("\"seq_length\":12"));
|
||||||
assert!(out.contains("\"kmer_size\":4"));
|
assert!(out.contains("\"kmer_size\":4"));
|
||||||
assert!(out.contains("\"minimizer_size\":3"));
|
assert!(out.contains("\"minimizer_size\":3"));
|
||||||
@@ -183,7 +177,7 @@ mod tests {
|
|||||||
fn scatter_sequence_line_correct() {
|
fn scatter_sequence_line_correct() {
|
||||||
let mut sk = make(b"ACGTACGT");
|
let mut sk = make(b"ACGTACGT");
|
||||||
sk.set_minimizer_pos(0);
|
sk.set_minimizer_pos(0);
|
||||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));
|
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||||
let lines: Vec<&str> = out.lines().collect();
|
let lines: Vec<&str> = out.lines().collect();
|
||||||
assert_eq!(lines[1], "ACGTACGT");
|
assert_eq!(lines[1], "ACGTACGT");
|
||||||
}
|
}
|
||||||
@@ -230,10 +224,22 @@ mod tests {
|
|||||||
let mut sk2 = make(b"ACGTACGT");
|
let mut sk2 = make(b"ACGTACGT");
|
||||||
sk2.set_minimizer_pos(4); // different pos, same sequence
|
sk2.set_minimizer_pos(4); // different pos, same sequence
|
||||||
|
|
||||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0))
|
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||||
.lines().next().unwrap().split_whitespace().next().unwrap()[1..].to_string();
|
.lines()
|
||||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0))
|
.next()
|
||||||
.lines().next().unwrap().split_whitespace().next().unwrap()[1..].to_string();
|
.unwrap()
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.unwrap()[1..]
|
||||||
|
.to_string();
|
||||||
|
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||||
|
.lines()
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.unwrap()[1..]
|
||||||
|
.to_string();
|
||||||
assert_eq!(id1, id2, "same sequence must produce same ID");
|
assert_eq!(id1, id2, "same sequence must produce same ID");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,10 +250,22 @@ mod tests {
|
|||||||
let mut sk2 = make(b"TTTTTTTT");
|
let mut sk2 = make(b"TTTTTTTT");
|
||||||
sk2.set_minimizer_pos(0);
|
sk2.set_minimizer_pos(0);
|
||||||
|
|
||||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, 0))
|
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||||
.lines().next().unwrap().split_whitespace().next().unwrap()[1..].to_string();
|
.lines()
|
||||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, 0))
|
.next()
|
||||||
.lines().next().unwrap().split_whitespace().next().unwrap()[1..].to_string();
|
.unwrap()
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.unwrap()[1..]
|
||||||
|
.to_string();
|
||||||
|
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||||
|
.lines()
|
||||||
|
.next()
|
||||||
|
.unwrap()
|
||||||
|
.split_whitespace()
|
||||||
|
.next()
|
||||||
|
.unwrap()[1..]
|
||||||
|
.to_string();
|
||||||
assert_ne!(id1, id2);
|
assert_ne!(id1, id2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -255,7 +273,7 @@ mod tests {
|
|||||||
fn id_is_16_hex_digits() {
|
fn id_is_16_hex_digits() {
|
||||||
let mut sk = make(b"ACGTACGT");
|
let mut sk = make(b"ACGTACGT");
|
||||||
sk.set_minimizer_pos(0);
|
sk.set_minimizer_pos(0);
|
||||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, 0));
|
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||||
assert_eq!(id.len(), 16);
|
assert_eq!(id.len(), 16);
|
||||||
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
||||||
|
|||||||
@@ -0,0 +1,151 @@
|
|||||||
|
use std::io::{self, BufWriter, Write};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use obifastwrite::write_scatter;
|
||||||
|
use obikrope::Rope;
|
||||||
|
use obikseq::superkmer::SuperKmer;
|
||||||
|
use obipipeline::{WorkerPool, make_pipeline};
|
||||||
|
use obiskbuilder::SuperKmerIter;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct PartitionArgs {
|
||||||
|
/// Input files or directories (FASTA/FASTQ, optionally gzip-compressed)
|
||||||
|
#[arg(num_args = 1..)]
|
||||||
|
pub inputs: Vec<String>,
|
||||||
|
|
||||||
|
/// k-mer size
|
||||||
|
#[arg(short, long, default_value_t = 31)]
|
||||||
|
pub kmer_size: usize,
|
||||||
|
|
||||||
|
/// Minimizer size
|
||||||
|
#[arg(short, long, default_value_t = 11)]
|
||||||
|
pub minimizer_size: usize,
|
||||||
|
|
||||||
|
/// Entropy threshold (k-mers with score ≤ theta are rejected)
|
||||||
|
#[arg(long, default_value_t = 0.7)]
|
||||||
|
pub theta: f64,
|
||||||
|
|
||||||
|
/// Maximum sub-word size for entropy computation
|
||||||
|
#[arg(long, default_value_t = 6)]
|
||||||
|
pub level_max: usize,
|
||||||
|
|
||||||
|
/// Number of bits to encode partitions (allows up to 2^partition_bits partitions)
|
||||||
|
#[arg(short, long, default_value_t = 8)]
|
||||||
|
pub partition_bits: usize,
|
||||||
|
|
||||||
|
/// Number of worker threads
|
||||||
|
#[arg(
|
||||||
|
short = 'T',
|
||||||
|
long,
|
||||||
|
default_value_t = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get())
|
||||||
|
.unwrap_or(1))]
|
||||||
|
pub threads: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
enum PipelineData {
|
||||||
|
Path(PathBuf),
|
||||||
|
RawChunk(Rope),
|
||||||
|
NormChunk(Rope),
|
||||||
|
Batch(Vec<SuperKmer>),
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFETY: Rope contains Cell<u8> which is !Sync, but pipeline ownership transfers
|
||||||
|
// exclusively through channels — no item is ever shared across threads.
|
||||||
|
unsafe impl Send for PipelineData {}
|
||||||
|
unsafe impl Sync for PipelineData {}
|
||||||
|
|
||||||
|
// ── Stage functions ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Opens a sequence file and returns an iterator over its raw Rope chunks.
|
||||||
|
/// Chunk-level I/O errors are logged and skipped.
|
||||||
|
fn open_chunks(path: PathBuf) -> io::Result<impl Iterator<Item = Rope>> {
|
||||||
|
let path_str = path
|
||||||
|
.to_str()
|
||||||
|
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "non-UTF-8 path"))?;
|
||||||
|
let iter = obiread::read_sequence_chunks(path_str)?;
|
||||||
|
Ok(iter.filter_map(|r| match r {
|
||||||
|
Ok(rope) => Some(rope),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("chunk read error: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalises a raw sequence chunk (FASTA or FASTQ) into a compact ACGT/NUL rope.
|
||||||
|
fn normalize(rope: Rope, k: usize) -> io::Result<Rope> {
|
||||||
|
obiread::normalize_sequence_chunk(rope, k)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extracts all super-kmers from a normalised rope.
|
||||||
|
fn build_superkmers(
|
||||||
|
rope: Rope,
|
||||||
|
k: usize,
|
||||||
|
m: usize,
|
||||||
|
level_max: usize,
|
||||||
|
theta: f64,
|
||||||
|
) -> Vec<SuperKmer> {
|
||||||
|
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Writes a batch of super-kmers to the output sink.
|
||||||
|
fn write_batch(
|
||||||
|
batch: Vec<SuperKmer>,
|
||||||
|
out: &Mutex<BufWriter<io::Stdout>>,
|
||||||
|
partition_bits: usize,
|
||||||
|
k: usize,
|
||||||
|
m: usize,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
let mut w = out.lock().unwrap();
|
||||||
|
let partition_mask = (1u64 << partition_bits) - 1;
|
||||||
|
for sk in batch {
|
||||||
|
let minimizer = sk
|
||||||
|
.kmer(sk.minimizer_pos() as usize, m)
|
||||||
|
.map_err(|e| std::io::Error::other(e))?
|
||||||
|
.canonical(m);
|
||||||
|
let partition = (minimizer.hash(m) & partition_mask) as usize;
|
||||||
|
write_scatter(&sk, &mut *w, k, m, partition_bits, minimizer)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn mix64(x: u64) -> u64 {
|
||||||
|
let x = x ^ (x >> 30);
|
||||||
|
let x = x.wrapping_mul(0xbf58476d1ce4e5b9);
|
||||||
|
let x = x ^ (x >> 27);
|
||||||
|
let x = x.wrapping_mul(0x94d049bb133111eb);
|
||||||
|
x ^ (x >> 31)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Entry point ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub fn run(args: PartitionArgs) {
|
||||||
|
let k = args.kmer_size;
|
||||||
|
let m = args.minimizer_size;
|
||||||
|
let theta = args.theta;
|
||||||
|
let level_max = args.level_max;
|
||||||
|
let partition_bits = args.partition_bits;
|
||||||
|
let n_workers = args.threads.max(1);
|
||||||
|
|
||||||
|
let paths = args.inputs.iter().map(PathBuf::from).collect();
|
||||||
|
let path_source = obiread::PathIter::new(paths);
|
||||||
|
|
||||||
|
let out = Arc::new(Mutex::new(BufWriter::new(io::stdout())));
|
||||||
|
let out_sink = Arc::clone(&out);
|
||||||
|
|
||||||
|
let pipeline = make_pipeline! {
|
||||||
|
PipelineData,
|
||||||
|
source path_source => Path,
|
||||||
|
||? open_chunks : Path => RawChunk,
|
||||||
|
|? { move |rope| normalize(rope, k) } : RawChunk => NormChunk,
|
||||||
|
| { move |rope| build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||||
|
sink? { move |batch| write_batch(batch, &out_sink, partition_bits, k, m) } @ Batch,
|
||||||
|
};
|
||||||
|
|
||||||
|
WorkerPool::new(pipeline, n_workers, 1).run();
|
||||||
|
out.lock().unwrap().flush().expect("flush error");
|
||||||
|
}
|
||||||
@@ -31,12 +31,17 @@ pub struct SuperkmerArgs {
|
|||||||
#[arg(long, default_value_t = 6)]
|
#[arg(long, default_value_t = 6)]
|
||||||
pub level_max: usize,
|
pub level_max: usize,
|
||||||
|
|
||||||
/// Number of partitions (minimizer_hash % partitions → partition id)
|
/// Number of bits to encode partitions (allows up to 2^partition_bits partitions)
|
||||||
#[arg(short, long, default_value_t = 256)]
|
#[arg(short, long, default_value_t = 8)]
|
||||||
pub partitions: u64,
|
pub partition_bits: usize,
|
||||||
|
|
||||||
/// Number of worker threads
|
/// Number of worker threads
|
||||||
#[arg(short = 'T', long, default_value_t = 16)]
|
#[arg(
|
||||||
|
short = 'T',
|
||||||
|
long,
|
||||||
|
default_value_t = std::thread::available_parallelism()
|
||||||
|
.map(|n| n.get())
|
||||||
|
.unwrap_or(1))]
|
||||||
pub threads: usize,
|
pub threads: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -44,7 +49,7 @@ enum PipelineData {
|
|||||||
Path(PathBuf),
|
Path(PathBuf),
|
||||||
RawChunk(Rope),
|
RawChunk(Rope),
|
||||||
NormChunk(Rope),
|
NormChunk(Rope),
|
||||||
Batch(Vec<(u64, SuperKmer)>),
|
Batch(Vec<SuperKmer>),
|
||||||
}
|
}
|
||||||
|
|
||||||
// SAFETY: Rope contains Cell<u8> which is !Sync, but pipeline ownership transfers
|
// SAFETY: Rope contains Cell<u8> which is !Sync, but pipeline ownership transfers
|
||||||
@@ -82,22 +87,27 @@ fn build_superkmers(
|
|||||||
m: usize,
|
m: usize,
|
||||||
level_max: usize,
|
level_max: usize,
|
||||||
theta: f64,
|
theta: f64,
|
||||||
) -> Vec<(u64, SuperKmer)> {
|
) -> Vec<SuperKmer> {
|
||||||
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Writes a batch of super-kmers to the output sink.
|
/// Writes a batch of super-kmers to the output sink.
|
||||||
fn write_batch(
|
fn write_batch(
|
||||||
batch: Vec<(u64, SuperKmer)>,
|
batch: Vec<SuperKmer>,
|
||||||
out: &Mutex<BufWriter<io::Stdout>>,
|
out: &Mutex<BufWriter<io::Stdout>>,
|
||||||
partitions: u64,
|
partition_bits: usize,
|
||||||
k: usize,
|
k: usize,
|
||||||
m: usize,
|
m: usize,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
let mut w = out.lock().unwrap();
|
let mut w = out.lock().unwrap();
|
||||||
for (min_hash, sk) in batch {
|
let partition_mask = (1u64 << partition_bits) - 1;
|
||||||
let partition = (mix64(min_hash) % partitions) as u32;
|
for sk in batch {
|
||||||
write_scatter(&sk, &mut *w, k, m, partition, min_hash)?;
|
let minimizer = sk
|
||||||
|
.kmer(sk.minimizer_pos() as usize, m)
|
||||||
|
.map_err(|e| std::io::Error::other(e))?
|
||||||
|
.canonical(m);
|
||||||
|
let partition = (minimizer.hash(m) & partition_mask) as usize;
|
||||||
|
write_scatter(&sk, &mut *w, k, m, partition_bits, minimizer)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -118,7 +128,7 @@ pub fn run(args: SuperkmerArgs) {
|
|||||||
let m = args.minimizer_size;
|
let m = args.minimizer_size;
|
||||||
let theta = args.theta;
|
let theta = args.theta;
|
||||||
let level_max = args.level_max;
|
let level_max = args.level_max;
|
||||||
let partitions = args.partitions;
|
let partition_bits = args.partition_bits;
|
||||||
let n_workers = args.threads.max(1);
|
let n_workers = args.threads.max(1);
|
||||||
|
|
||||||
let paths = args.inputs.iter().map(PathBuf::from).collect();
|
let paths = args.inputs.iter().map(PathBuf::from).collect();
|
||||||
@@ -133,7 +143,7 @@ pub fn run(args: SuperkmerArgs) {
|
|||||||
||? open_chunks : Path => RawChunk,
|
||? open_chunks : Path => RawChunk,
|
||||||
|? { move |rope| normalize(rope, k) } : RawChunk => NormChunk,
|
|? { move |rope| normalize(rope, k) } : RawChunk => NormChunk,
|
||||||
| { move |rope| build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
| { move |rope| build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||||
sink? { move |batch| write_batch(batch, &out_sink, partitions, k, m) } @ Batch,
|
sink? { move |batch| write_batch(batch, &out_sink, partition_bits, k, m) } @ Batch,
|
||||||
};
|
};
|
||||||
|
|
||||||
WorkerPool::new(pipeline, n_workers, 1).run();
|
WorkerPool::new(pipeline, n_workers, 1).run();
|
||||||
|
|||||||
@@ -4,3 +4,4 @@ version = "0.1.0"
|
|||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
obiskio = { path = "../obiskio" }
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
mod limits;
|
mod manager;
|
||||||
|
|
||||||
pub use limits::max_concurrent_files;
|
pub use manager::PartitionManager;
|
||||||
|
|||||||
@@ -1,150 +1,18 @@
|
|||||||
use niffler::compression::{Format, Level, from_reader, from_writer};
|
use obiskio::SKFileWriter;
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{self, BufReader, BufWriter, Read, Write};
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
// ---------- Format binaire d'un super‑kmer (encodage 2 bits) ----------
|
pub struct PartitionManager {
|
||||||
pub struct SuperKmerHeader(u32);
|
root_path: Box<Path>,
|
||||||
|
partitions_mask: u64,
|
||||||
impl SuperKmerHeader {
|
writers: Vec<SKFileWriter>,
|
||||||
pub fn seq_len(&self) -> usize {
|
|
||||||
let seql = (self.0 & 0xFF) as u8;
|
|
||||||
if seql == 0 { 256 } else { seql as usize }
|
|
||||||
}
|
|
||||||
pub fn to_bits(&self) -> u32 {
|
|
||||||
self.0
|
|
||||||
}
|
|
||||||
pub fn from_bits(bits: u32) -> Self {
|
|
||||||
Self(bits)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct SuperKmer {
|
impl PartitionManager {
|
||||||
pub header: SuperKmerHeader,
|
pub fn new(root_path: Box<Path>, n_partition_bits: usize) -> Self {
|
||||||
pub seq: Box<[u8]>, // déjà encodée en 2 bits par base
|
Self {
|
||||||
}
|
root_path,
|
||||||
|
partitions_mask: (1u64 << n_partition_bits) - 1,
|
||||||
impl SuperKmer {
|
writers: Vec::new(),
|
||||||
/// Écrit ce super‑kmer dans un écrivain binaire (non compressé).
|
|
||||||
pub fn write_raw<W: Write>(&self, w: &mut W) -> io::Result<()> {
|
|
||||||
w.write_all(&self.header.to_bits().to_le_bytes())?;
|
|
||||||
w.write_all(&self.seq)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Lit un super‑kmer depuis un lecteur binaire (non compressé).
|
|
||||||
/// Retourne `None` si EOF.
|
|
||||||
pub fn read_raw<R: Read>(r: &mut R, buf_seq: &mut Vec<u8>) -> io::Result<Option<Self>> {
|
|
||||||
let mut header_bytes = [0u8; 4];
|
|
||||||
if let Err(e) = r.read_exact(&mut header_bytes) {
|
|
||||||
return if e.kind() == io::ErrorKind::UnexpectedEof {
|
|
||||||
Ok(None)
|
|
||||||
} else {
|
|
||||||
Err(e)
|
|
||||||
};
|
|
||||||
}
|
|
||||||
let header = SuperKmerHeader::from_bits(u32::from_le_bytes(header_bytes));
|
|
||||||
let len_bytes = (header.seq_len() + 3) / 4; // nombre d'octets encodés
|
|
||||||
buf_seq.clear();
|
|
||||||
buf_seq.resize(len_bytes, 0);
|
|
||||||
r.read_exact(buf_seq)?;
|
|
||||||
let seq = buf_seq.clone().into_boxed_slice();
|
|
||||||
Ok(Some(SuperKmer { header, seq }))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------- PartitionManager avec compression (via niffler) ----------
|
|
||||||
pub enum CompressionFormat {
|
|
||||||
Gzip, // .gz
|
|
||||||
Zstd, // .zst
|
|
||||||
Lz4, // .lz4
|
|
||||||
Bgzf, // .bgzf (indexable par blocs)
|
|
||||||
None, // pas de compression
|
|
||||||
}
|
|
||||||
|
|
||||||
impl CompressionFormat {
|
|
||||||
/// Infère le format à partir de l'extension du fichier.
|
|
||||||
pub fn from_extension(path: &Path) -> Option<Self> {
|
|
||||||
match path.extension()?.to_str()? {
|
|
||||||
"gz" => Some(CompressionFormat::Gzip),
|
|
||||||
"zst" => Some(CompressionFormat::Zstd),
|
|
||||||
"lz4" => Some(CompressionFormat::Lz4),
|
|
||||||
"bgzf" => Some(CompressionFormat::Bgzf),
|
|
||||||
"raw" => Some(CompressionFormat::None),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extension de fichier recommandée.
|
|
||||||
pub fn extension(&self) -> &'static str {
|
|
||||||
match self {
|
|
||||||
CompressionFormat::Gzip => "gz",
|
|
||||||
CompressionFormat::Zstd => "zst",
|
|
||||||
CompressionFormat::Lz4 => "lz4",
|
|
||||||
CompressionFormat::Bgzf => "bgzf",
|
|
||||||
CompressionFormat::None => "raw",
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct PartitionWriter {
|
|
||||||
writer: Box<dyn Write + Send>, // le flux compressé
|
|
||||||
// buffer interne pour réutiliser les écritures (optionnel)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartitionWriter {
|
|
||||||
/// Ouvre un fichier en écriture avec la compression demandée.
|
|
||||||
pub fn create(path: &Path, format: CompressionFormat) -> io::Result<Self> {
|
|
||||||
let file = File::create(path)?;
|
|
||||||
const DEFAULT_LEVEL: Level = Level::Default; // peut être ajusté
|
|
||||||
let writer: Box<dyn Write + Send> = match format {
|
|
||||||
CompressionFormat::Gzip => Box::new(from_writer(file, Format::Gzip, DEFAULT_LEVEL)?),
|
|
||||||
CompressionFormat::Zstd => Box::new(from_writer(file, Format::Zstd, DEFAULT_LEVEL)?),
|
|
||||||
CompressionFormat::Lz4 => Box::new(from_writer(file, Format::Lz4, DEFAULT_LEVEL)?),
|
|
||||||
CompressionFormat::Bgzf => Box::new(from_writer(file, Format::Bgzf, DEFAULT_LEVEL)?),
|
|
||||||
CompressionFormat::None => Box::new(BufWriter::new(file)),
|
|
||||||
};
|
|
||||||
Ok(PartitionWriter { writer })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Écrit un super‑kmer (non compressé individuellement) dans le flux compressé.
|
|
||||||
pub fn write_kmer(&mut self, kmer: &SuperKmer) -> io::Result<()> {
|
|
||||||
kmer.write_raw(&mut self.writer)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Flush final.
|
|
||||||
pub fn finish(mut self) -> io::Result<()> {
|
|
||||||
self.writer.flush()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct PartitionReader {
|
|
||||||
reader: Box<dyn Read + Send>,
|
|
||||||
seq_buf: Vec<u8>, // réutilisation pour les séquences
|
|
||||||
}
|
|
||||||
|
|
||||||
impl PartitionReader {
|
|
||||||
/// Ouvre un fichier en lecture. Détecte automatiquement le format de compression
|
|
||||||
/// grâce à `niffler::sniff` ou via l'extension.
|
|
||||||
pub fn open(path: &Path) -> io::Result<Self> {
|
|
||||||
let file = File::open(path)?;
|
|
||||||
// `niffler::sniff` examine les premiers octets pour choisir le décompresseur
|
|
||||||
let reader = match niffler::sniff(Box::new(file)) {
|
|
||||||
Ok(r) => r,
|
|
||||||
Err(e) => {
|
|
||||||
// Si aucune signature connue, on suppose raw
|
|
||||||
eprintln!("Aucune signature de compression trouvée, lecture brute.");
|
|
||||||
Box::new(BufReader::new(file)) as Box<dyn Read + Send>
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Ok(PartitionReader {
|
|
||||||
reader,
|
|
||||||
seq_buf: Vec::with_capacity(256),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Lit le prochain super‑kmer. Retourne `None` à la fin du fichier.
|
|
||||||
pub fn read_next(&mut self) -> io::Result<Option<SuperKmer>> {
|
|
||||||
SuperKmer::read_raw(&mut self.reader, &mut self.seq_buf)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ impl<'a> SuperKmerIter<'a> {
|
|||||||
self.prev_min_pos = 0;
|
self.prev_min_pos = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_emit(&mut self) -> Option<(u64, SuperKmer)> {
|
fn try_emit(&mut self) -> Option<SuperKmer> {
|
||||||
if self.scratch.len() < self.k {
|
if self.scratch.len() < self.k {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@@ -72,14 +72,14 @@ impl<'a> SuperKmerIter<'a> {
|
|||||||
sk.seql() - self.m - self.prev_min_pos
|
sk.seql() - self.m - self.prev_min_pos
|
||||||
};
|
};
|
||||||
sk.set_minimizer_pos(min_pos as u8);
|
sk.set_minimizer_pos(min_pos as u8);
|
||||||
Some((min, sk))
|
Some(sk)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Iterator for SuperKmerIter<'_> {
|
impl Iterator for SuperKmerIter<'_> {
|
||||||
type Item = (u64, SuperKmer);
|
type Item = SuperKmer;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<(u64, SuperKmer)> {
|
fn next(&mut self) -> Option<SuperKmer> {
|
||||||
loop {
|
loop {
|
||||||
let byte = match self.cursor.read_next().ok() {
|
let byte = match self.cursor.read_next().ok() {
|
||||||
None => {
|
None => {
|
||||||
@@ -164,7 +164,7 @@ mod tests {
|
|||||||
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
||||||
let rope = make_rope(data);
|
let rope = make_rope(data);
|
||||||
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
||||||
.map(|(_, sk)| sk.to_ascii())
|
.map(|sk| sk.to_ascii())
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -201,7 +201,7 @@ mod tests {
|
|||||||
|
|
||||||
let rope = make_rope(b"AAAAAAAA\x00");
|
let rope = make_rope(b"AAAAAAAA\x00");
|
||||||
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 6, 0.9)
|
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 6, 0.9)
|
||||||
.map(|(_, sk)| sk.to_ascii())
|
.map(|sk| sk.to_ascii())
|
||||||
.collect();
|
.collect();
|
||||||
assert!(out_reject.is_empty());
|
assert!(out_reject.is_empty());
|
||||||
}
|
}
|
||||||
@@ -214,7 +214,7 @@ mod tests {
|
|||||||
rope.push(data[..mid].to_vec());
|
rope.push(data[..mid].to_vec());
|
||||||
rope.push(data[mid..].to_vec());
|
rope.push(data[mid..].to_vec());
|
||||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
||||||
.map(|(_, sk)| sk.to_ascii())
|
.map(|sk| sk.to_ascii())
|
||||||
.collect();
|
.collect();
|
||||||
assert!(!out.is_empty());
|
assert!(!out.is_empty());
|
||||||
}
|
}
|
||||||
@@ -222,9 +222,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn yields_minimizer_value() {
|
fn yields_minimizer_value() {
|
||||||
let rope = make_rope(b"ACGTACGT\x00");
|
let rope = make_rope(b"ACGTACGT\x00");
|
||||||
let results: Vec<(u64, Vec<u8>)> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0)
|
let results: Vec<SuperKmer> = SuperKmerIter::new(&rope, 4, 2, 1, 0.0).collect();
|
||||||
.map(|(min, sk)| (min, sk.to_ascii()))
|
|
||||||
.collect();
|
|
||||||
assert!(!results.is_empty());
|
assert!(!results.is_empty());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user