Push pnxswqpxlyso #13

Merged
coissac merged 5 commits from push-pnxswqpxlyso into main 2026-06-01 12:45:46 +00:00
5 changed files with 51 additions and 31 deletions
Showing only changes of commit 0350ca855b - Show all commits
+2 -2
View File
@@ -96,7 +96,7 @@ impl KmerIndex {
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
meta.genomes = all_genomes;
meta.config.with_counts = mode == MergeMode::Count;
meta.config.evidence = evidence;
meta.config.evidence = evidence.clone();
meta.write(output)?;
// In presence/absence mode, purge counts/ directories inherited from
@@ -147,7 +147,7 @@ impl KmerIndex {
.filter_map(|i| {
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err();
pb.inc(1);
result
})
+21 -25
View File
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentCompactIntVecBuilder};
use obikseq::CanonicalKmer;
use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError};
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError};
use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition;
@@ -47,22 +47,22 @@ impl ColBuilder {
pub(crate) enum SrcLayerData {
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
SetMembership,
Presence(MphfLayer, PersistentBitMatrix),
Count(MphfLayer, PersistentCompactIntMatrix),
Presence(MphfOnly, PersistentBitMatrix),
Count(MphfOnly, PersistentCompactIntMatrix),
}
impl SrcLayerData {
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> {
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts");
match merge_mode {
MergeMode::Presence => {
if presence_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat))
} else if counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat))
} else {
@@ -71,7 +71,7 @@ impl SrcLayerData {
}
MergeMode::Count => {
if counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat))
} else {
@@ -82,22 +82,16 @@ impl SrcLayerData {
}
/// Return one value per source genome for `kmer`.
/// The caller guarantees `kmer` is in the source MPHF domain.
#[inline]
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
match self {
SrcLayerData::SetMembership => vec![1u32; n_genomes],
SrcLayerData::Presence(mphf, mat) => {
if let Some(slot) = mphf.find(kmer) {
mat.row(slot).iter().map(|&b| b as u32).collect()
} else {
vec![0u32; n_genomes]
}
mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
}
SrcLayerData::Count(mphf, mat) => {
if let Some(slot) = mphf.find(kmer) {
mat.row(slot).iter().copied().collect()
} else {
vec![0u32; n_genomes]
}
mat.row(mphf.index(kmer)).iter().copied().collect()
}
}
}
@@ -161,6 +155,7 @@ impl KmerPartition {
mode: MergeMode,
n_dst_genomes: usize,
block_bits: u8,
evidence: &IndexMode,
) -> SKResult<()> {
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
if !dst_index_dir.exists() {
@@ -208,7 +203,7 @@ impl KmerPartition {
let new_layer_idx = n_dst_layers;
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
if any_new {
let n_new = if any_new {
g.compute_degrees();
fs::create_dir_all(&new_layer_dir)?;
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
@@ -216,16 +211,18 @@ impl KmerPartition {
uw.write(&unitig)?;
}
uw.close()?;
Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
}
Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
g.len()
} else {
0
};
drop(g);
let new_mphf = if any_new {
Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?)
Some(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?)
} else {
None
};
let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
// ── Prepare matrix directories for the new layer ──────────────────────
// Absent columns (dst genomes) are written via append_column (all-zero/false).
@@ -303,7 +300,7 @@ impl KmerPartition {
for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let values = src_data.lookup(kmer, *src_n);
@@ -312,13 +309,12 @@ impl KmerPartition {
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
} else if let Some(ref mphf) = new_mphf {
if let Some(slot) = mphf.find(kmer) {
let slot = mphf.index(kmer);
new_src_builders[builder_idx].set_val(slot, value);
}
}
}
}
}
col_offset += src_n;
}
+2 -2
View File
@@ -117,7 +117,7 @@ impl KmerPartition {
if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes);
@@ -182,7 +182,7 @@ impl KmerPartition {
if !unitigs_path.exists() { continue; }
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let row = src_data.lookup(kmer, n_genomes);
+1 -1
View File
@@ -12,4 +12,4 @@ pub use layer::{Hit, Layer, LayerData};
pub use layered_store::LayeredStore;
pub use map::LayeredMap;
pub use meta::{IndexMode, PartitionMeta};
pub use mphf_layer::MphfLayer;
pub use mphf_layer::{MphfLayer, MphfOnly};
+24
View File
@@ -129,7 +129,31 @@ impl MphfLayer {
}
pub fn n(&self) -> usize { self.n }
}
// ── MphfOnly ──────────────────────────────────────────────────────────────────
/// Thin newtype around the MPHF alone: only the MPHF file of a layer directory
/// is loaded, not the evidence or the unitigs.
///
/// Intended for callers that can guarantee every queried kmer belongs to the
/// MPHF domain (for example when walking the source's own unitigs during a
/// merge); no membership check is performed by this type.
pub struct MphfOnly(Mphf);
impl MphfOnly {
    /// Load the MPHF stored as `MPHF_FILE` inside `dir`.
    ///
    /// # Errors
    ///
    /// Returns [`OLMError::InvalidLayer`] carrying the loader's error message
    /// when the MPHF file cannot be loaded.
    pub fn open(dir: &Path) -> OLMResult<Self> {
        let mphf_path = dir.join(MPHF_FILE);
        let loaded: Result<Mphf, _> = Mphf::load_full(&mphf_path);
        match loaded {
            Ok(inner) => Ok(Self(inner)),
            Err(e) => Err(OLMError::InvalidLayer(e.to_string())),
        }
    }
    /// Slot assigned to `kmer` by the MPHF.
    ///
    /// The result is only meaningful when `kmer` is in the MPHF domain; the
    /// caller is responsible for upholding that precondition.
    #[inline]
    pub fn index(&self, kmer: CanonicalKmer) -> usize {
        let Self(inner) = self;
        let raw = kmer.raw();
        inner.index(&raw)
    }
}
impl MphfLayer {
// ── Build helpers ─────────────────────────────────────────────────────────
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {