refactor: extract pipeline stages and centralize partition directory paths

Extracts the scatter, dereplicate/count, and index pipeline stages into a new `steps` module to improve modularity. Centralizes partition directory path construction by introducing a `part_dir()` helper, replacing manual string formatting across multiple command files. Adds `--with-counts` and `--keep-intermediate` CLI flags to the index command and fixes a typo in the `partition_dir` parameter name.
This commit is contained in:
Eric Coissac
2026-05-20 15:45:50 +02:00
parent cc2ed4bd31
commit f8cfb493b8
12 changed files with 319 additions and 277 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
mod kmer_sort;
mod partition;
pub use partition::KmerPartition;
pub use partition::{KmerPartition, PARTITIONS_SUBDIR};
+13 -12
View File
@@ -30,6 +30,7 @@ type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u
/// File name used for the partition metadata file.
// NOTE(review): presumably this file stores the serialized `PartitionMeta` — confirm against the reader/writer.
const META_FILENAME: &str = "partition.meta";
/// Extension appended to the per-partition super-k-mer file names
/// (used to build names such as `raw.{SK_EXT}` and `dereplicated.{SK_EXT}`).
const SK_EXT: &str = "skmer.zst";
/// Name of the sub-directory, directly under the root path, that contains
/// the individual `part_NNNNN` partition directories.
pub const PARTITIONS_SUBDIR: &str = "partitions";
#[derive(Serialize, Deserialize)]
struct PartitionMeta {
@@ -84,7 +85,7 @@ impl KmerPartition {
.into());
}
}
fs::create_dir_all(&root_path)?;
fs::create_dir_all(root_path.join(PARTITIONS_SUBDIR))?;
let n_partitions = 1usize << n_bits;
let writers = (0..n_partitions).map(|_| None).collect();
let partition = Self {
@@ -175,6 +176,11 @@ impl KmerPartition {
&self.root_path
}
/// Returns the directory path of partition `i`.
///
/// The result is `<root_path>/<PARTITIONS_SUBDIR>/part_<i>`, where `<i>` is
/// written in decimal and zero-padded to at least five digits. The path is
/// only computed here; nothing is created or checked on disk.
pub fn part_dir(&self, i: usize) -> PathBuf {
    // Start from the shared partitions sub-directory, then append the
    // per-partition component.
    let mut partition_path = self.root_path.join(PARTITIONS_SUBDIR);
    partition_path.push(format!("part_{i:05}"));
    partition_path
}
/// Returns the `kmer_size` value stored in this `KmerPartition`.
pub fn kmer_size(&self) -> usize {
self.kmer_size
}
@@ -208,7 +214,6 @@ impl KmerPartition {
/// more temporary file descriptors — all managed by the global fd pool.
pub fn dereplicate(&self) -> SKResult<()> {
let level = self.level;
let root = &self.root_path;
let sys = System::new_all();
// available_memory() can return 0 on macOS when the compressor page count exceeds
// free+inactive+purgeable pages (sysinfo saturating_sub). Fall back to half of total.
@@ -222,7 +227,7 @@ impl KmerPartition {
let results: Vec<SKResult<()>> = (0..self.n_partitions)
.into_par_iter()
.map(|i| {
let dir = root.join(format!("part_{:05}", i));
let dir = self.part_dir(i);
if !dir.exists() {
return Ok(());
}
@@ -249,8 +254,6 @@ impl KmerPartition {
/// Partitions are processed in parallel via Rayon (one task per thread).
/// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
pub fn count_kmer(&self) -> SKResult<()> {
let root = &self.root_path;
let sys = System::new_all();
let available = match sys.available_memory() {
0 => sys.total_memory() / 2,
@@ -271,7 +274,7 @@ impl KmerPartition {
let results: Vec<SKResult<()>> = (0..self.n_partitions)
.into_par_iter()
.map(|i| {
let dir = root.join(format!("part_{:05}", i));
let dir = self.part_dir(i);
let dedup_path = dir.join(format!("dereplicated.{SK_EXT}"));
if !dedup_path.exists() {
pb.inc(1);
@@ -296,9 +299,7 @@ impl KmerPartition {
let mut global_f1: u64 = 0;
for i in 0..self.n_partitions {
let path = root
.join(format!("part_{:05}", i))
.join("kmer_spectrum_raw.json");
let path = self.part_dir(i).join("kmer_spectrum_raw.json");
if !path.exists() {
continue;
}
@@ -320,7 +321,7 @@ impl KmerPartition {
.map(|(&c, &f)| (format!("{c:010}"), f))
.collect();
serde_json::to_writer_pretty(
fs::File::create(root.join("kmer_spectrum_raw.json"))?,
fs::File::create(self.root_path.join("kmer_spectrum_raw.json"))?,
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
)
.map_err(io::Error::other)?;
@@ -352,7 +353,7 @@ impl KmerPartition {
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
if self.writers[partition].is_none() {
let dir = self.root_path.join(format!("part_{:05}", partition));
let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
fs::create_dir_all(&dir)?;
let file_path = dir.join(format!("raw.{SK_EXT}"));
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
@@ -645,7 +646,7 @@ mod tests {
kp.close().unwrap();
kp.dereplicate().unwrap();
let part_dir = dir.path().join("part_00000");
let part_dir = dir.path().join(PARTITIONS_SUBDIR).join("part_00000");
let dedup_path = part_dir.join("dereplicated.skmer.zst");
if !dedup_path.exists() {
return (0, 0);