refactor: replace single spectrum file with per-partition outputs

Replace the single `kmer_spectrum_raw.json` output with per-partition JSON files in a `spectrums/` directory. Add a `keep_intermediate` parameter to control intermediate file cleanup, and introduce a `write_spectrum` helper for serialization. Update the completion sentinel to `count.done` and align state documentation accordingly.
This commit is contained in:
Eric Coissac
2026-05-20 21:06:27 +02:00
parent c5bcb7b8fa
commit 7d1b62ddf3
5 changed files with 53 additions and 30 deletions
+28 -6
View File
@@ -1,17 +1,18 @@
use std::collections::BTreeMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
use indicatif::{ProgressBar, ProgressStyle};
use obikpartitionner::KmerPartition;
use obikpartitionner::{KmerPartition, KmerSpectrum};
use obisys::{Reporter, Stage};
use rayon::prelude::*;
use tracing::info;
use crate::error::{OKIError, OKIResult};
use crate::meta::{IndexConfig, IndexMeta};
use crate::state::{IndexState, SENTINEL_INDEXED, SENTINEL_SCATTERED};
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
pub struct KmerIndex {
root_path: PathBuf,
@@ -96,16 +97,37 @@ impl KmerIndex {
/// Dereplicate all partitions then compute kmer counts.
///
/// Writes `kmer_spectrum_raw.json` at the index root upon completion
/// (this file doubles as the `Counted` sentinel).
pub fn dereplicate_and_count(&self, rep: &mut Reporter) -> OKIResult<()> {
/// Writes `spectrums/{label}.json` and touches `count.done` upon completion.
/// Per-partition spectrum files are removed unless `keep_intermediate` is true.
pub fn dereplicate_and_count(&self, keep_intermediate: bool, rep: &mut Reporter) -> OKIResult<()> {
// Stage 1: dereplicate the partitions; the stopped stage is pushed to the reporter.
let t = Stage::start("dereplicate");
self.partition.dereplicate()?;
rep.push(t.stop());
// Stage 2: count k-mers, recorded as a separate "count_kmer" stage.
let t = Stage::start("count_kmer");
// NOTE(review): the argument-less call below looks like the pre-change line kept by the
// diff rendering; the call taking `keep_intermediate` right after it appears to be the
// current one — confirm against the actual file.
self.partition.count_kmer()?;
let spectrum = self.partition.count_kmer(keep_intermediate)?;
rep.push(t.stop());
// Persist the returned spectrum before marking the step as complete, so the sentinel
// is only created once the spectrum file has been written without error.
self.write_spectrum(&spectrum)?;
// `touch` is defined elsewhere; presumably it creates the (empty) sentinel file — verify.
touch(&self.root_path.join(SENTINEL_COUNTED))?;
Ok(())
}
/// Serialize a k-mer spectrum to `spectrums/{label}.json` under the index root.
///
/// `label` is the first entry of `self.meta.genomes`, or `"unknown"` when that
/// list is empty. The `spectrums/` directory is created if it does not exist,
/// and an existing file with the same name is truncated.
///
/// The JSON document holds three keys: `f0`, `f1` and `spectrum`. The latter
/// maps every key of `sp.counts`, zero-padded to a width of 10, to its
/// associated value.
///
/// # Errors
///
/// Returns an error if the directory or the file cannot be created, if JSON
/// serialization fails (`OKIError::Json`), or if the final flush fails.
fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
    let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown");
    let spectrums_dir = self.root_path.join("spectrums");
    fs::create_dir_all(&spectrums_dir)?;
    let path = spectrums_dir.join(format!("{label}.json"));

    // Fixed-width, zero-padded keys make the map's lexicographic order match
    // numeric order (as long as the keys fit in 10 digits).
    let spectrum_map: BTreeMap<String, u64> = sp
        .counts
        .iter()
        .map(|(&c, &f)| (format!("{c:010}"), f))
        .collect();

    // The serializer emits many small writes; buffer them instead of hitting
    // the file descriptor for each one.
    let mut writer = std::io::BufWriter::new(fs::File::create(&path)?);
    serde_json::to_writer_pretty(
        &mut writer,
        &serde_json::json!({ "f0": sp.f0, "f1": sp.f1, "spectrum": spectrum_map }),
    )
    .map_err(OKIError::Json)?;
    // Flush explicitly: dropping a `BufWriter` silently discards write errors,
    // which would let the caller mark the step as done with a truncated file.
    std::io::Write::flush(&mut writer)?;
    Ok(())
}
+2 -2
View File
@@ -3,7 +3,7 @@ use std::path::Path;
use crate::meta::META_FILENAME;
pub const SENTINEL_SCATTERED: &str = "scatter.done";
pub const SENTINEL_COUNTED: &str = "kmer_spectrum_raw.json";
pub const SENTINEL_COUNTED: &str = "count.done";
pub const SENTINEL_INDEXED: &str = "index.done";
/// Progression state of a `KmerIndex`.
@@ -17,7 +17,7 @@ pub enum IndexState {
Empty,
/// `scatter.done` sentinel present — all super-kmers have been routed.
Scattered,
/// `kmer_spectrum_raw.json` present — dereplicate + count complete.
/// `count.done` sentinel present — dereplicate + count complete.
Counted,
/// `index.done` sentinel present — layered MPHF index fully built.
Indexed,