+ obiskio: add binary I/O with LRU pool and compression
- Add new obiskio crate for high-performance SuperKmer serialization/deserialization - Implement binary codec with 2-bit packed sequence encoding and raw header format (32 bits) - Add transparent compression support via niffler: Zstd, Gzip/Bgzf/Lz4 - Implement SKFilePool with LRU-based fd management, max-concurrent-fd limiting (75% of ulimit) - Add SKFileWriter with batched writes, configurable flush threshold (8 KiB default), and two-phase locking - Add SKFileReader with sequential access, LRU recovery via reopen_and_seek() + New obikpartitionner crate: basic header/seq handling for binary super-kmer format - Bump niffler from 2.7 to v3, add dependencies: allocator-api2, bitflags(>=1), errno/fastrand/rustix/tempfile/lru/hashbrown/bzip2/thiserror - Update workspace members to include obikpartitionner and obiskio
This commit is contained in:
@@ -0,0 +1,6 @@
|
||||
[package]
|
||||
name = "obikpartitionner"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
@@ -0,0 +1,3 @@
|
||||
mod limits;
|
||||
|
||||
pub use limits::max_concurrent_files;
|
||||
@@ -0,0 +1,150 @@
|
||||
use niffler::compression::{Format, Level, from_reader, from_writer};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Read, Write};
|
||||
use std::path::Path;
|
||||
|
||||
// ---------- Format binaire d'un super‑kmer (encodage 2 bits) ----------
|
||||
/// Raw 32-bit header stored in front of every binary super-kmer record.
///
/// Only the low 8 bits are interpreted by this type: they hold the sequence
/// length in bases, where the value `0` stands for 256. The upper 24 bits are
/// kept as-is and are only reachable through [`SuperKmerHeader::to_bits`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SuperKmerHeader(u32);

impl SuperKmerHeader {
    /// Returns the sequence length in bases, always in `1..=256`.
    ///
    /// The length lives in the low byte of the header; a stored `0` means 256
    /// so that the full range fits in 8 bits.
    pub fn seq_len(&self) -> usize {
        match (self.0 & 0xFF) as u8 {
            0 => 256,
            n => usize::from(n),
        }
    }

    /// Returns the header exactly as it is stored, as a raw `u32`.
    pub fn to_bits(&self) -> u32 {
        self.0
    }

    /// Wraps a raw `u32` as a header without any validation.
    pub fn from_bits(bits: u32) -> Self {
        Self(bits)
    }
}
|
||||
|
||||
pub struct SuperKmer {
|
||||
pub header: SuperKmerHeader,
|
||||
pub seq: Box<[u8]>, // déjà encodée en 2 bits par base
|
||||
}
|
||||
|
||||
impl SuperKmer {
|
||||
/// Écrit ce super‑kmer dans un écrivain binaire (non compressé).
|
||||
pub fn write_raw<W: Write>(&self, w: &mut W) -> io::Result<()> {
|
||||
w.write_all(&self.header.to_bits().to_le_bytes())?;
|
||||
w.write_all(&self.seq)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Lit un super‑kmer depuis un lecteur binaire (non compressé).
|
||||
/// Retourne `None` si EOF.
|
||||
pub fn read_raw<R: Read>(r: &mut R, buf_seq: &mut Vec<u8>) -> io::Result<Option<Self>> {
|
||||
let mut header_bytes = [0u8; 4];
|
||||
if let Err(e) = r.read_exact(&mut header_bytes) {
|
||||
return if e.kind() == io::ErrorKind::UnexpectedEof {
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(e)
|
||||
};
|
||||
}
|
||||
let header = SuperKmerHeader::from_bits(u32::from_le_bytes(header_bytes));
|
||||
let len_bytes = (header.seq_len() + 3) / 4; // nombre d'octets encodés
|
||||
buf_seq.clear();
|
||||
buf_seq.resize(len_bytes, 0);
|
||||
r.read_exact(buf_seq)?;
|
||||
let seq = buf_seq.clone().into_boxed_slice();
|
||||
Ok(Some(SuperKmer { header, seq }))
|
||||
}
|
||||
}
|
||||
|
||||
// ---------- PartitionManager avec compression (via niffler) ----------
|
||||
/// Compression applied to a partition file, identified by its file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionFormat {
    /// `.gz`
    Gzip,
    /// `.zst`
    Zstd,
    /// `.lz4`
    Lz4,
    /// `.bgzf` (block-indexable)
    Bgzf,
    /// `.raw`, no compression
    None,
}

impl CompressionFormat {
    /// Infers the format from the extension of `path`.
    ///
    /// Returns `None` when the path has no extension, when the extension is not
    /// valid UTF-8, or when it is not one of `gz`, `zst`, `lz4`, `bgzf`, `raw`.
    /// The comparison is case-sensitive.
    pub fn from_extension(path: &Path) -> Option<Self> {
        let ext = path.extension()?.to_str()?;
        match ext {
            "gz" => Some(Self::Gzip),
            "zst" => Some(Self::Zstd),
            "lz4" => Some(Self::Lz4),
            "bgzf" => Some(Self::Bgzf),
            "raw" => Some(Self::None),
            _ => None,
        }
    }

    /// Returns the recommended file extension (without the leading dot).
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Gzip => "gz",
            Self::Zstd => "zst",
            Self::Lz4 => "lz4",
            Self::Bgzf => "bgzf",
            Self::None => "raw",
        }
    }
}
|
||||
|
||||
pub struct PartitionWriter {
|
||||
writer: Box<dyn Write + Send>, // le flux compressé
|
||||
// buffer interne pour réutiliser les écritures (optionnel)
|
||||
}
|
||||
|
||||
impl PartitionWriter {
|
||||
/// Ouvre un fichier en écriture avec la compression demandée.
|
||||
pub fn create(path: &Path, format: CompressionFormat) -> io::Result<Self> {
|
||||
let file = File::create(path)?;
|
||||
const DEFAULT_LEVEL: Level = Level::Default; // peut être ajusté
|
||||
let writer: Box<dyn Write + Send> = match format {
|
||||
CompressionFormat::Gzip => Box::new(from_writer(file, Format::Gzip, DEFAULT_LEVEL)?),
|
||||
CompressionFormat::Zstd => Box::new(from_writer(file, Format::Zstd, DEFAULT_LEVEL)?),
|
||||
CompressionFormat::Lz4 => Box::new(from_writer(file, Format::Lz4, DEFAULT_LEVEL)?),
|
||||
CompressionFormat::Bgzf => Box::new(from_writer(file, Format::Bgzf, DEFAULT_LEVEL)?),
|
||||
CompressionFormat::None => Box::new(BufWriter::new(file)),
|
||||
};
|
||||
Ok(PartitionWriter { writer })
|
||||
}
|
||||
|
||||
/// Écrit un super‑kmer (non compressé individuellement) dans le flux compressé.
|
||||
pub fn write_kmer(&mut self, kmer: &SuperKmer) -> io::Result<()> {
|
||||
kmer.write_raw(&mut self.writer)
|
||||
}
|
||||
|
||||
/// Flush final.
|
||||
pub fn finish(mut self) -> io::Result<()> {
|
||||
self.writer.flush()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PartitionReader {
|
||||
reader: Box<dyn Read + Send>,
|
||||
seq_buf: Vec<u8>, // réutilisation pour les séquences
|
||||
}
|
||||
|
||||
impl PartitionReader {
|
||||
/// Ouvre un fichier en lecture. Détecte automatiquement le format de compression
|
||||
/// grâce à `niffler::sniff` ou via l'extension.
|
||||
pub fn open(path: &Path) -> io::Result<Self> {
|
||||
let file = File::open(path)?;
|
||||
// `niffler::sniff` examine les premiers octets pour choisir le décompresseur
|
||||
let reader = match niffler::sniff(Box::new(file)) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
// Si aucune signature connue, on suppose raw
|
||||
eprintln!("Aucune signature de compression trouvée, lecture brute.");
|
||||
Box::new(BufReader::new(file)) as Box<dyn Read + Send>
|
||||
}
|
||||
};
|
||||
Ok(PartitionReader {
|
||||
reader,
|
||||
seq_buf: Vec::with_capacity(256),
|
||||
})
|
||||
}
|
||||
|
||||
/// Lit le prochain super‑kmer. Retourne `None` à la fin du fichier.
|
||||
pub fn read_next(&mut self) -> io::Result<Option<SuperKmer>> {
|
||||
SuperKmer::read_raw(&mut self.reader, &mut self.seq_buf)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user