refactor: replace single spectrum file with per-partition outputs
Replace the single `kmer_spectrum_raw.json` output with per-partition JSON files in a `spectrums/` directory. Add a `keep_intermediate` parameter to control intermediate file cleanup, and introduce a `write_spectrum` helper for serialization. Update the completion sentinel to `count.done` and align state documentation accordingly.
This commit is contained in:
@@ -2,4 +2,4 @@ mod index_layer;
|
||||
mod kmer_sort;
|
||||
mod partition;
|
||||
|
||||
pub use partition::{KmerPartition, PARTITIONS_SUBDIR};
|
||||
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
||||
|
||||
@@ -27,6 +27,12 @@ use crate::kmer_sort::{chunk_size_from_ram, sort_unique_kmers};
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
/// Aggregated kmer spectrum returned by `KmerPartition::count_kmer`.
///
/// NOTE(review): `f0` / `f1` are presumably the distinct and total kmer counts;
/// the code that writes the per-partition files is not visible here — confirm.
pub struct KmerSpectrum {
|
||||
/// Sum of the `f0` values read from the per-partition spectrum files.
pub f0: u64,
|
||||
/// Sum of the `f1` values read from the per-partition spectrum files.
pub f1: u64,
|
||||
/// Totals summed over every partition's `spectrum` object, keyed by the
/// entry key parsed as `u32` (presumably abundance -> number of kmers — confirm).
pub counts: BTreeMap<u32, u64>,
|
||||
}
|
||||
|
||||
const SK_EXT: &str = "skmer.zst";
|
||||
pub const PARTITIONS_SUBDIR: &str = "partitions";
|
||||
|
||||
@@ -238,11 +244,13 @@ impl KmerPartition {
|
||||
/// 3. Writes a flat binary count file (`counts1.bin`, one `u32` per slot,
|
||||
/// memory-mapped) accumulating kmer abundances from the superkmer counts.
|
||||
/// 4. Persists the MPHF to `mphf1.bin` for downstream use.
|
||||
/// 5. Writes a global `kmer_spectrum_raw.json` at the partition root.
|
||||
///
|
||||
/// Returns the aggregated `KmerSpectrum`. Per-partition spectrum files are
|
||||
/// deleted after aggregation unless `keep_partial` is true.
|
||||
///
|
||||
/// Partitions are processed in parallel via Rayon (one task per thread).
|
||||
/// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
|
||||
pub fn count_kmer(&self) -> SKResult<()> {
|
||||
pub fn count_kmer(&self, keep_partial: bool) -> SKResult<KmerSpectrum> {
|
||||
let sys = System::new_all();
|
||||
let available = match sys.available_memory() {
|
||||
0 => sys.total_memory() / 2,
|
||||
@@ -282,10 +290,10 @@ impl KmerPartition {
|
||||
r?;
|
||||
}
|
||||
|
||||
// Aggregate per-partition spectra into a global one at the root.
|
||||
let mut global_spectrum: BTreeMap<u32, u64> = BTreeMap::new();
|
||||
let mut global_f0: u64 = 0;
|
||||
let mut global_f1: u64 = 0;
|
||||
// Aggregate per-partition spectra.
|
||||
let mut counts: BTreeMap<u32, u64> = BTreeMap::new();
|
||||
let mut f0: u64 = 0;
|
||||
let mut f1: u64 = 0;
|
||||
|
||||
for i in 0..self.n_partitions {
|
||||
let path = self.part_dir(i).join("kmer_spectrum_raw.json");
|
||||
@@ -294,28 +302,21 @@ impl KmerPartition {
|
||||
}
|
||||
let v: serde_json::Value =
|
||||
serde_json::from_str(&fs::read_to_string(&path)?).map_err(io::Error::other)?;
|
||||
global_f0 += v["f0"].as_u64().unwrap_or(0);
|
||||
global_f1 += v["f1"].as_u64().unwrap_or(0);
|
||||
f0 += v["f0"].as_u64().unwrap_or(0);
|
||||
f1 += v["f1"].as_u64().unwrap_or(0);
|
||||
if let Some(obj) = v["spectrum"].as_object() {
|
||||
for (c_str, freq) in obj {
|
||||
if let (Ok(c), Some(f)) = (c_str.parse::<u32>(), freq.as_u64()) {
|
||||
*global_spectrum.entry(c).or_insert(0) += f;
|
||||
*counts.entry(c).or_insert(0) += f;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !keep_partial {
|
||||
let _ = fs::remove_file(&path);
|
||||
}
|
||||
}
|
||||
|
||||
let global_spectrum_map: BTreeMap<String, u64> = global_spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(self.root_path.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
|
||||
)
|
||||
.map_err(io::Error::other)?;
|
||||
|
||||
Ok(())
|
||||
Ok(KmerSpectrum { f0, f1, counts })
|
||||
}
|
||||
|
||||
// ── private ───────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user