+ obiskio: add binary I/O with LRU pool and compression

- Add new obiskio crate for high-performance SuperKmer serialization/deserialization
- Implement binary codec with 2-bit packed sequence encoding and raw header format (32 bits)
- Add transparent compression support via niffler: Zstd, Gzip/Bgzf/Lz4
- Implement SKFilePool with LRU-based fd management, max-concurrent-fd limiting (75% of ulimit)
- Add SKFileWriter with batched writes, configurable flush threshold (8 KiB default), and two-phase locking
- Add SKFileReader with sequential access, LRU recovery via reopen_and_seek()
+ New obikpartitionner crate: basic header/seq handling for binary super-kmer format
- Bump niffler from 2.7 to v3, add dependencies: allocator-api2, bitflags(>=1), errno/fastrand/rustix/tempfile/lru/hashbrown/bzip2/thiserror
- Update workspace members to include obikpartitionner and obiskio
This commit is contained in:
Eric Coissac
2026-04-24 21:07:58 +02:00
parent d4e4289aff
commit c09d17401d
13 changed files with 1324 additions and 5 deletions
+6
View File
@@ -0,0 +1,6 @@
[package]
name = "obikpartitionner"
version = "0.1.0"
edition = "2024"
[dependencies]
+3
View File
@@ -0,0 +1,3 @@
mod limits;
pub use limits::max_concurrent_files;
+150
View File
@@ -0,0 +1,150 @@
use niffler::compression::{Format, Level, from_reader, from_writer};
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Read, Write};
use std::path::Path;
// ---------- Format binaire d'un superkmer (encodage 2 bits) ----------
/// Raw 32-bit header stored in front of every serialized super-kmer.
///
/// This type only interprets the low byte (the sequence length); the other
/// 24 bits are carried through untouched.
pub struct SuperKmerHeader(u32);

impl SuperKmerHeader {
    /// Sequence length in bases, decoded from the low 8 bits of the header.
    ///
    /// A stored value of `0` stands for 256, so the result is always in
    /// `1..=256`.
    pub fn seq_len(&self) -> usize {
        match self.0 & 0xFF {
            0 => 256,
            low => low as usize,
        }
    }

    /// Returns the header as its raw 32-bit value.
    pub fn to_bits(&self) -> u32 {
        let SuperKmerHeader(raw) = *self;
        raw
    }

    /// Wraps a raw 32-bit value as a header, without any validation.
    pub fn from_bits(bits: u32) -> Self {
        SuperKmerHeader(bits)
    }
}
/// A super-kmer in its serialized shape: a raw header plus the packed sequence.
pub struct SuperKmer {
    /// Raw header; its low byte gives the sequence length in bases.
    pub header: SuperKmerHeader,
    /// Sequence bytes, already packed at 2 bits per base (4 bases per byte).
    pub seq: Box<[u8]>,
}

impl SuperKmer {
    /// Writes this super-kmer to `w` as a 4-byte little-endian header followed
    /// by the packed sequence bytes. No compression is applied here.
    ///
    /// The sequence is written exactly as stored: its length is not checked
    /// against `header.seq_len()`, whereas [`SuperKmer::read_raw`] expects
    /// `ceil(seq_len / 4)` bytes after the header.
    ///
    /// # Errors
    /// Returns any error reported by the underlying writer.
    pub fn write_raw<W: Write>(&self, w: &mut W) -> io::Result<()> {
        w.write_all(&self.header.to_bits().to_le_bytes())?;
        w.write_all(&self.seq)
    }

    /// Reads the next super-kmer from `r`.
    ///
    /// `buf_seq` is a scratch buffer that is cleared and reused on every call;
    /// the returned value owns its own copy of the sequence bytes.
    ///
    /// Returns `Ok(None)` only when the stream ends exactly on a record
    /// boundary, i.e. no header byte at all is available.
    ///
    /// # Errors
    /// Returns `UnexpectedEof` when the stream stops in the middle of a header
    /// or of a sequence, and forwards any other error from the reader.
    pub fn read_raw<R: Read>(r: &mut R, buf_seq: &mut Vec<u8>) -> io::Result<Option<Self>> {
        let mut header_bytes = [0u8; 4];
        let mut filled = 0usize;
        // `read_exact` cannot tell "no bytes at all" from "some bytes, then EOF",
        // so the header is filled by hand to keep a truncated record from
        // being reported as a clean end of stream.
        while filled < header_bytes.len() {
            match r.read(&mut header_bytes[filled..]) {
                Ok(0) if filled == 0 => return Ok(None),
                Ok(0) => {
                    return Err(io::Error::new(
                        io::ErrorKind::UnexpectedEof,
                        "truncated super-kmer header",
                    ));
                }
                Ok(n) => filled += n,
                // Same retry policy as `read_exact`.
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }

        let header = SuperKmerHeader::from_bits(u32::from_le_bytes(header_bytes));
        // Four bases per byte, rounded up.
        let len_bytes = (header.seq_len() + 3) / 4;

        buf_seq.clear();
        buf_seq.resize(len_bytes, 0);
        r.read_exact(buf_seq)?;

        // Copy straight into an exactly-sized box; the scratch buffer keeps
        // its capacity for the next call.
        let seq: Box<[u8]> = Box::from(buf_seq.as_slice());
        Ok(Some(SuperKmer { header, seq }))
    }
}
// ---------- Partition writer / reader with compression (via niffler) ----------
/// Compression applied to a partition file.
///
/// The type is a plain tag, so it derives the usual value-type traits: callers
/// can copy it, compare it and print it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CompressionFormat {
    /// Gzip stream (`.gz`).
    Gzip,
    /// Zstandard stream (`.zst`).
    Zstd,
    /// LZ4 stream (`.lz4`).
    Lz4,
    /// Blocked gzip (`.bgzf`), indexable block by block.
    Bgzf,
    /// No compression (`.raw`).
    None,
}

impl CompressionFormat {
    /// Infers the format from the extension of `path`.
    ///
    /// Returns `None` when the path has no extension, when the extension is
    /// not valid UTF-8, or when it is not one of `gz`, `zst`, `lz4`, `bgzf`,
    /// `raw`. The comparison is case-sensitive.
    pub fn from_extension(path: &Path) -> Option<Self> {
        let ext = path.extension()?.to_str()?;
        let format = match ext {
            "gz" => CompressionFormat::Gzip,
            "zst" => CompressionFormat::Zstd,
            "lz4" => CompressionFormat::Lz4,
            "bgzf" => CompressionFormat::Bgzf,
            "raw" => CompressionFormat::None,
            _ => return None,
        };
        Some(format)
    }

    /// Recommended file extension (without the leading dot) for this format.
    ///
    /// This is the exact inverse of [`CompressionFormat::from_extension`].
    pub fn extension(&self) -> &'static str {
        match self {
            CompressionFormat::Gzip => "gz",
            CompressionFormat::Zstd => "zst",
            CompressionFormat::Lz4 => "lz4",
            CompressionFormat::Bgzf => "bgzf",
            CompressionFormat::None => "raw",
        }
    }
}
/// Sequential writer that appends super-kmers to a single partition file.
pub struct PartitionWriter {
    /// Destination stream: either a compressing writer or a plain buffered
    /// file, depending on the format chosen at creation time.
    writer: Box<dyn Write + Send>,
}

impl PartitionWriter {
    /// Creates (or truncates) the file at `path` and wraps it in the writer
    /// matching `format`.
    ///
    /// # Errors
    /// Fails when the file cannot be created or when the compressing writer
    /// cannot be set up.
    ///
    /// NOTE(review): `from_writer`, `Level::Default`, `Format::Lz4` and
    /// `Format::Bgzf` are taken as-is from the file's imports; confirm they
    /// exist in the niffler version the workspace pins.
    pub fn create(path: &Path, format: CompressionFormat) -> io::Result<Self> {
        // Compression level shared by every codec; adjust here if needed.
        const DEFAULT_LEVEL: Level = Level::Default;

        let file = File::create(path)?;

        // Pick the codec first; the uncompressed case needs no codec and
        // returns a buffered file directly.
        let codec = match format {
            CompressionFormat::None => {
                return Ok(PartitionWriter {
                    writer: Box::new(BufWriter::new(file)),
                });
            }
            CompressionFormat::Gzip => Format::Gzip,
            CompressionFormat::Zstd => Format::Zstd,
            CompressionFormat::Lz4 => Format::Lz4,
            CompressionFormat::Bgzf => Format::Bgzf,
        };

        let writer: Box<dyn Write + Send> = Box::new(from_writer(file, codec, DEFAULT_LEVEL)?);
        Ok(PartitionWriter { writer })
    }

    /// Appends one super-kmer to the stream. Records are not compressed
    /// individually; any compression is applied by the underlying writer.
    pub fn write_kmer(&mut self, kmer: &SuperKmer) -> io::Result<()> {
        let sink = &mut self.writer;
        kmer.write_raw(sink)
    }

    /// Flushes the stream and consumes the writer.
    pub fn finish(mut self) -> io::Result<()> {
        self.writer.flush()
    }
}
/// Sequential reader over one partition file.
pub struct PartitionReader {
    /// Source stream, already decompressed when a known format was detected.
    reader: Box<dyn Read + Send>,
    /// Scratch buffer reused across records for the packed sequence bytes.
    seq_buf: Vec<u8>,
}

impl PartitionReader {
    /// Opens `path` for reading, letting niffler detect the compression format
    /// from the first bytes of the file and install the matching decoder.
    ///
    /// If detection fails, the file is reopened and read as raw, uncompressed
    /// data, and a notice is printed on stderr.
    ///
    /// # Errors
    /// Fails when the file cannot be opened (initially or for the fallback).
    ///
    /// NOTE(review): the fallback is taken on *any* niffler error, as in the
    /// original best-effort logic; presumably the common case is a file too
    /// short to be sniffed. Confirm whether I/O errors should propagate.
    pub fn open(path: &Path) -> io::Result<Self> {
        let file = File::open(path)?;
        // `get_reader` (unlike `sniff`) returns a decoding stream, and the
        // `send` flavour keeps the `Send` bound required by the field type.
        // The file is buffered because records are read in very small pieces.
        let reader: Box<dyn Read + Send> =
            match niffler::send::get_reader(Box::new(BufReader::new(file))) {
                Ok((decoded, _format)) => decoded,
                Err(_) => {
                    eprintln!("Aucune signature de compression trouvée, lecture brute.");
                    // The first handle was moved into niffler and may have been
                    // partially consumed: reopen to restart from byte 0.
                    Box::new(BufReader::new(File::open(path)?))
                }
            };
        Ok(PartitionReader {
            reader,
            seq_buf: Vec::with_capacity(256),
        })
    }

    /// Reads the next super-kmer, or returns `Ok(None)` at the end of the file.
    pub fn read_next(&mut self) -> io::Result<Option<SuperKmer>> {
        SuperKmer::read_raw(&mut self.reader, &mut self.seq_buf)
    }
}