.gitignore: ignore zstandard-compressed files
- Add `*.zst` pattern to .gitignore
- Prevent tracking of Zstandard-compressed archives
This commit is contained in:
@@ -4,4 +4,8 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
niffler = "3.0.0"
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
mod manager;
|
||||
mod partition;
|
||||
|
||||
pub use manager::PartitionManager;
|
||||
pub use partition::KmerPartition;
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
use obiskio::SKFileWriter;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct PartitionManager {
|
||||
root_path: Box<Path>,
|
||||
partitions_mask: u64,
|
||||
writers: Vec<SKFileWriter>,
|
||||
}
|
||||
|
||||
impl PartitionManager {
|
||||
pub fn new(root_path: Box<Path>, n_partition_bits: usize) -> Self {
|
||||
Self {
|
||||
root_path,
|
||||
partitions_mask: (1u64 << n_partition_bits) - 1,
|
||||
writers: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use obiskio::{SKFilePool, SKFileWriter, SKResult, SharedPool, create_token_with};
|
||||
|
||||
const META_FILENAME: &str = "partition.meta";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PartitionMeta {
|
||||
n_bits: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
format: String,
|
||||
level: u32,
|
||||
}
|
||||
|
||||
pub struct KmerPartition {
|
||||
root_path: PathBuf,
|
||||
n_partitions: usize,
|
||||
partitions_mask: u64,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
pool: SharedPool,
|
||||
writers: Vec<Option<SKFileWriter>>,
|
||||
format: Format,
|
||||
level: Level,
|
||||
closed: bool,
|
||||
}
|
||||
|
||||
impl KmerPartition {
|
||||
pub fn create<P: AsRef<Path>>(
|
||||
path: P,
|
||||
n_bits: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
force: bool,
|
||||
) -> SKResult<Self> {
|
||||
Self::create_with(path, n_bits, kmer_size, minimizer_size, Format::Zstd, Level::Three, force)
|
||||
}
|
||||
|
||||
pub fn create_with<P: AsRef<Path>>(
|
||||
path: P,
|
||||
n_bits: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
force: bool,
|
||||
) -> SKResult<Self> {
|
||||
let root_path = path.as_ref().to_owned();
|
||||
if root_path.exists() {
|
||||
if force {
|
||||
fs::remove_dir_all(&root_path)?;
|
||||
} else {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::AlreadyExists,
|
||||
format!("{}: partition directory already exists", root_path.display()),
|
||||
)
|
||||
.into());
|
||||
}
|
||||
}
|
||||
fs::create_dir_all(&root_path)?;
|
||||
let n_partitions = 1usize << n_bits;
|
||||
let pool = Arc::new(Mutex::new(SKFilePool::from_system_limits()));
|
||||
let writers = (0..n_partitions).map(|_| None).collect();
|
||||
let partition = Self {
|
||||
root_path,
|
||||
n_partitions,
|
||||
partitions_mask: (1u64 << n_bits) - 1,
|
||||
kmer_size,
|
||||
minimizer_size,
|
||||
pool,
|
||||
writers,
|
||||
format,
|
||||
level,
|
||||
closed: false,
|
||||
};
|
||||
partition.write_meta(n_bits)?;
|
||||
Ok(partition)
|
||||
}
|
||||
|
||||
pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition = self.partition_of(sk)?;
|
||||
self.ensure_writer(partition)?.write(sk)
|
||||
}
|
||||
|
||||
pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
let partition = self.partition_of(sk)?;
|
||||
self.ensure_writer(partition)?.write(sk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for writer in self.writers.iter_mut().flatten() {
|
||||
writer.flush()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close(&mut self) -> SKResult<()> {
|
||||
if self.closed {
|
||||
return Ok(());
|
||||
}
|
||||
self.closed = true;
|
||||
for writer in self.writers.iter_mut().flatten() {
|
||||
writer.close()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn is_open(&self) -> bool {
|
||||
!self.closed
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.root_path
|
||||
}
|
||||
|
||||
// ── private ───────────────────────────────────────────────────────────────
|
||||
|
||||
fn check_not_closed(&self) -> SKResult<()> {
|
||||
if self.closed {
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::BrokenPipe,
|
||||
"write to closed KmerPartition",
|
||||
)
|
||||
.into())
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_of(&self, sk: &SuperKmer) -> SKResult<usize> {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, self.minimizer_size)
|
||||
.map_err(|e| io::Error::other(e))?
|
||||
.canonical(self.minimizer_size);
|
||||
Ok((minimizer.hash(self.minimizer_size) & self.partitions_mask) as usize)
|
||||
}
|
||||
|
||||
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
|
||||
let meta = PartitionMeta {
|
||||
n_bits,
|
||||
kmer_size: self.kmer_size,
|
||||
minimizer_size: self.minimizer_size,
|
||||
format: match self.format {
|
||||
Format::Gzip => "gzip",
|
||||
Format::Bzip => "bzip2",
|
||||
Format::Lzma => "lzma",
|
||||
Format::Zstd => "zstd",
|
||||
Format::No => "none",
|
||||
}
|
||||
.to_owned(),
|
||||
level: u32::from(self.level),
|
||||
};
|
||||
let f = fs::File::create(self.root_path.join(META_FILENAME))?;
|
||||
serde_json::to_writer_pretty(f, &meta)
|
||||
.map_err(|e| io::Error::other(e))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
|
||||
if self.writers[partition].is_none() {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let ext = match self.format {
|
||||
Format::Gzip => "skmer.gz",
|
||||
Format::Bzip => "skmer.bz2",
|
||||
Format::Lzma => "skmer.xz",
|
||||
Format::Zstd => "skmer.zst",
|
||||
Format::No => "skmer",
|
||||
};
|
||||
let file_path = dir.join(format!("raw.{ext}"));
|
||||
let writer = create_token_with(&self.pool, file_path, self.format, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for KmerPartition {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.close();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user