refactor: extract pipeline stages and centralize partition directory paths
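Extracts the scatter, dereplicate/count, and index pipeline stages into a new `steps` module to improve modularity. Centralizes partition directory path construction by introducing a `part_dir()` helper, replacing manual string formatting across multiple command files. Note that this also changes the on-disk layout: partition directories now live under a `partitions/` subdirectory of the root (`PARTITIONS_SUBDIR`), i.e. `<root>/partitions/part_NNNNN` instead of `<root>/part_NNNNN`. Adds `--with-counts` and `--keep-intermediate` CLI flags to the index command and fixes a typo in the `partition_dir` parameter name.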
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
mod kmer_sort;
|
||||
mod partition;
|
||||
|
||||
pub use partition::KmerPartition;
|
||||
pub use partition::{KmerPartition, PARTITIONS_SUBDIR};
|
||||
|
||||
@@ -30,6 +30,7 @@ type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u
|
||||
|
||||
const META_FILENAME: &str = "partition.meta";
|
||||
const SK_EXT: &str = "skmer.zst";
|
||||
pub const PARTITIONS_SUBDIR: &str = "partitions";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PartitionMeta {
|
||||
@@ -84,7 +85,7 @@ impl KmerPartition {
|
||||
.into());
|
||||
}
|
||||
}
|
||||
fs::create_dir_all(&root_path)?;
|
||||
fs::create_dir_all(root_path.join(PARTITIONS_SUBDIR))?;
|
||||
let n_partitions = 1usize << n_bits;
|
||||
let writers = (0..n_partitions).map(|_| None).collect();
|
||||
let partition = Self {
|
||||
@@ -175,6 +176,11 @@ impl KmerPartition {
|
||||
&self.root_path
|
||||
}
|
||||
|
||||
/// Returns the directory path of partition `i`:
/// `<root_path>/<PARTITIONS_SUBDIR>/part_NNNNN`, where `NNNNN` is `i`
/// zero-padded to at least five digits.
///
/// This only builds the path; it does not create or check the directory.
|
||||
pub fn part_dir(&self, i: usize) -> PathBuf {
|
||||
self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{i:05}"))
|
||||
}
|
||||
|
||||
/// Returns the `kmer_size` value stored on this `KmerPartition`.
pub fn kmer_size(&self) -> usize {
|
||||
self.kmer_size
|
||||
}
|
||||
@@ -208,7 +214,6 @@ impl KmerPartition {
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let level = self.level;
|
||||
let root = &self.root_path;
|
||||
let sys = System::new_all();
|
||||
// available_memory() can return 0 on macOS when the compressor page count exceeds
|
||||
// free+inactive+purgeable pages (sysinfo saturating_sub). Fall back to half of total.
|
||||
@@ -222,7 +227,7 @@ impl KmerPartition {
|
||||
let results: Vec<SKResult<()>> = (0..self.n_partitions)
|
||||
.into_par_iter()
|
||||
.map(|i| {
|
||||
let dir = root.join(format!("part_{:05}", i));
|
||||
let dir = self.part_dir(i);
|
||||
if !dir.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -249,8 +254,6 @@ impl KmerPartition {
|
||||
/// Partitions are processed in parallel via Rayon (one task per thread).
|
||||
/// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
|
||||
pub fn count_kmer(&self) -> SKResult<()> {
|
||||
let root = &self.root_path;
|
||||
|
||||
let sys = System::new_all();
|
||||
let available = match sys.available_memory() {
|
||||
0 => sys.total_memory() / 2,
|
||||
@@ -271,7 +274,7 @@ impl KmerPartition {
|
||||
let results: Vec<SKResult<()>> = (0..self.n_partitions)
|
||||
.into_par_iter()
|
||||
.map(|i| {
|
||||
let dir = root.join(format!("part_{:05}", i));
|
||||
let dir = self.part_dir(i);
|
||||
let dedup_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
if !dedup_path.exists() {
|
||||
pb.inc(1);
|
||||
@@ -296,9 +299,7 @@ impl KmerPartition {
|
||||
let mut global_f1: u64 = 0;
|
||||
|
||||
for i in 0..self.n_partitions {
|
||||
let path = root
|
||||
.join(format!("part_{:05}", i))
|
||||
.join("kmer_spectrum_raw.json");
|
||||
let path = self.part_dir(i).join("kmer_spectrum_raw.json");
|
||||
if !path.exists() {
|
||||
continue;
|
||||
}
|
||||
@@ -320,7 +321,7 @@ impl KmerPartition {
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(root.join("kmer_spectrum_raw.json"))?,
|
||||
fs::File::create(self.root_path.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
|
||||
)
|
||||
.map_err(io::Error::other)?;
|
||||
@@ -352,7 +353,7 @@ impl KmerPartition {
|
||||
|
||||
fn ensure_writer(&mut self, partition: usize) -> SKResult<&mut SKFileWriter> {
|
||||
if self.writers[partition].is_none() {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
let dir = self.root_path.join(PARTITIONS_SUBDIR).join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
@@ -645,7 +646,7 @@ mod tests {
|
||||
kp.close().unwrap();
|
||||
kp.dereplicate().unwrap();
|
||||
|
||||
let part_dir = dir.path().join("part_00000");
|
||||
let part_dir = dir.path().join(PARTITIONS_SUBDIR).join("part_00000");
|
||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !dedup_path.exists() {
|
||||
return (0, 0);
|
||||
|
||||
Reference in New Issue
Block a user