refactor: rename compute_degrees to compute_degrees_and_mark_starts

Renames `compute_degrees` to `compute_degrees_and_mark_starts` across the De Bruijn graph and partitioner layers to consolidate degree calculation and start-node flagging. Introduces safe neighbor iteration methods and a debug validation block to verify graph consistency. Refactors unitig extraction to use sequential execution with a `Mutex` for safe error propagation. Fixes malformed and duplicated method calls, adds auto-generation of missing `meta.json` files, and ensures persistent matrix builders are explicitly closed to finalize metadata.
This commit is contained in:
Eric Coissac
2026-06-05 19:32:30 +02:00
parent 27088ab810
commit 5c2f48535f
6 changed files with 252 additions and 384 deletions
+62 -30
View File
@@ -2,15 +2,15 @@ use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use obicompactvec::{PersistentBitMatrixBuilder,
PersistentBitVecBuilder,
PersistentCompactIntMatrixBuilder,
PersistentCompactIntVecBuilder};
use obicompactvec::{
PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder,
PersistentCompactIntVecBuilder,
};
use obidebruinj::GraphDeBruijn;
use obikseq::CanonicalKmer;
use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta;
use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
use obiskio::{SKError, SKResult, UnitigFileReader};
use crate::filter::{KmerFilter, passes_all};
use crate::merge_layer::{MergeMode, SrcLayerData};
@@ -21,7 +21,10 @@ const INDEX_SUBDIR: &str = "index";
fn olm_to_sk(e: OLMError) -> SKError {
match e {
OLMError::Io(e) => SKError::Io(e),
other => SKError::InvalidData { context: "rebuild", detail: other.to_string() },
other => SKError::InvalidData {
context: "rebuild",
detail: other.to_string(),
},
}
}
@@ -34,7 +37,10 @@ fn col_path_int(dir: &Path, col: usize) -> PathBuf {
}
fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
fs::write(dir.join("meta.json"), format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"))
fs::write(
dir.join("meta.json"),
format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"),
)
}
// ── ColBuilder ────────────────────────────────────────────────────────────────
@@ -54,8 +60,8 @@ impl ColBuilder {
fn close(self) -> SKResult<()> {
match self {
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
ColBuilder::Bit(b) => b.close().map_err(SKError::Io),
ColBuilder::Int(b) => b.close().map_err(SKError::Io),
}
}
}
@@ -65,10 +71,16 @@ impl ColBuilder {
fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
match PartitionMeta::load(dir) {
Ok(m) => Ok(m),
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) =>
{
let mut n = 0usize;
while dir.join(format!("layer_{n}")).exists() { n += 1; }
let m = PartitionMeta { n_layers: n, mode: IndexMode::default() };
while dir.join(format!("layer_{n}")).exists() {
n += 1;
}
let m = PartitionMeta {
n_layers: n,
mode: IndexMode::default(),
};
m.save(dir).map_err(olm_to_sk)?;
Ok(m)
}
@@ -90,10 +102,12 @@ fn iter_src_layers(
let src_meta = load_meta(src_index_dir)?;
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() { continue; }
let unitigs_path = src_layer_dir.join("unitigs.bin");
if !unitigs_path.exists() {
continue;
}
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
@@ -146,7 +160,7 @@ impl KmerPartition {
}
let n_new = g.len();
g.compute_degrees();
g.compute_degrees_and_mark_starts();
// ── Build MPHF in dst layer_0 ─────────────────────────────────────────
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
@@ -167,26 +181,37 @@ impl KmerPartition {
// ── Prepare matrix builders (one column per genome) ───────────────────
let data_dir = match mode {
MergeMode::Presence => dst_layer_dir.join("presence"),
MergeMode::Count => dst_layer_dir.join("counts"),
MergeMode::Count => dst_layer_dir.join("counts"),
};
fs::create_dir_all(&data_dir)?;
let mut builders: Vec<ColBuilder> = match mode {
MergeMode::Presence => {
PersistentBitMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
Ok(ColBuilder::Bit(b))
}).collect::<SKResult<_>>()?
.map_err(SKError::Io)?
.close()
.map_err(SKError::Io)?;
(0..n_genomes)
.map(|g| -> SKResult<ColBuilder> {
let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?;
Ok(ColBuilder::Bit(b))
})
.collect::<SKResult<_>>()?
}
MergeMode::Count => {
PersistentCompactIntMatrixBuilder::new(n_new, &data_dir)
.map_err(SKError::Io)?.close().map_err(SKError::Io)?;
(0..n_genomes).map(|g| -> SKResult<ColBuilder> {
let b = PersistentCompactIntVecBuilder::new(n_new, &col_path_int(&data_dir, g))?;
Ok(ColBuilder::Int(b))
}).collect::<SKResult<_>>()?
.map_err(SKError::Io)?
.close()
.map_err(SKError::Io)?;
(0..n_genomes)
.map(|g| -> SKResult<ColBuilder> {
let b = PersistentCompactIntVecBuilder::new(
n_new,
&col_path_int(&data_dir, g),
)?;
Ok(ColBuilder::Int(b))
})
.collect::<SKResult<_>>()?
}
};
@@ -200,10 +225,17 @@ impl KmerPartition {
})?;
// ── Close builders, write metadata ────────────────────────────────────
for b in builders { b.close()?; }
for b in builders {
b.close()?;
}
write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?;
PartitionMeta { n_layers: 1, mode: IndexMode::Exact }.save(&dst_index_dir).map_err(olm_to_sk)?;
PartitionMeta {
n_layers: 1,
mode: IndexMode::Exact,
}
.save(&dst_index_dir)
.map_err(olm_to_sk)?;
Ok(())
}