diff --git a/README.md b/README.md index 1dc4d21..34d557a 100644 --- a/README.md +++ b/README.md @@ -79,9 +79,12 @@ Non-ACGT characters act as hard breaks between k-mer segments in all formats. ## Quick start ```sh -# Build an exact index for two genomes -obikmer index --kmer-size 31 --label genome_a genome_a.fa --output index/ -obikmer index --kmer-size 31 --label genome_b genome_b.fa --output index/ +# Build an exact index for each genome independently +obikmer index --kmer-size 31 --label genome_a genome_a.fa --output index_a/ +obikmer index --kmer-size 31 --label genome_b genome_b.fa --output index_b/ + +# Merge into a single multi-genome index +obikmer merge --output index/ index_a/ index_b/ # Convert to approximate index (z=5, 8-bit fingerprints) obikmer reindex --approx -z 5 --evidence-bits 8 index/ diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index 4f9cd04..68d8db7 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -63,8 +63,15 @@ impl KmerIndex { } } - // ── Validate evidence compatibility ─────────────────────────────────── - let evidence = validate_evidence_compat(sources)?; + // ── Choose base: largest source in the output evidence mode ─────────── + let base_idx = choose_base(sources, mode); + let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len()); + ordered.push(sources[base_idx]); + for (i, &src) in sources.iter().enumerate() { + if i != base_idx { ordered.push(src); } + } + let sources: &[&KmerIndex] = &ordered; + let evidence = sources[0].meta.config.evidence.clone(); // ── Compute final genome labels (rename duplicates if requested) ─────── let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?; @@ -96,7 +103,7 @@ impl KmerIndex { let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?; meta.genomes = all_genomes; meta.config.with_counts = mode == MergeMode::Count; - meta.config.evidence = evidence; + meta.config.evidence = evidence.clone(); meta.write(output)?; // In presence/absence mode, purge counts/ directories inherited from @@ -147,7 +154,7 @@ impl KmerIndex { .filter_map(|i| { let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect(); - let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err(); + let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err(); pb.inc(1); result }) @@ -265,34 +272,51 @@ fn partition_bar(n: u64) -> ProgressBar { pb } -/// Check that all sources share the same evidence kind. -/// -/// Rules: -/// - all `Exact` → OK, returns `Exact` -/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }` -/// - mixed exact/approx or different approx params → `IncompatibleEvidence` -fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult { - let ref_ev = &sources[0].meta.config.evidence; - for src in &sources[1..] { - let ev = &src.meta.config.evidence; - let compat = match (ref_ev, ev) { - (IndexMode::Exact, IndexMode::Exact) => true, - (IndexMode::Approx { b: b1, z: z1 }, - IndexMode::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2, - (IndexMode::Hybrid { b: b1, z: z1 }, - IndexMode::Hybrid { b: b2, z: z2 }) => b1 == b2 && z1 == z2, - _ => false, - }; - if !compat { - return Err(OKIError::IncompatibleEvidence(format!( - "source {:?} has evidence {:?}, expected {:?} — \ - convert all sources to the same evidence kind first \ - (use the `reindex` command)", - src.root_path.display(), ev, ref_ev, - ))); +/// A source is "trivial" if its presence/count values carry no approximation: +/// single-genome presence index (SetMembership — all values are 1 by construction). +fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool { + src.meta.genomes.len() == 1 && mode == MergeMode::Presence +} + +/// Sum of all `unitigs.bin` sizes across every partition and layer. +/// Used as a proxy for the number of indexed smers. +fn index_unitig_size(src: &KmerIndex) -> u64 { + let n = src.partition.n_partitions(); + let mut total = 0u64; + for i in 0..n { + let index_dir = src.partition.part_dir(i).join("index"); + let mut l = 0usize; + loop { + let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin"); + if !p.exists() { break; } + if let Ok(m) = std::fs::metadata(&p) { total += m.len(); } + l += 1; } } - Ok(ref_ev.clone()) + total +} + +/// Choose the index to use as bootstrap base. +/// +/// Rule — mieux-disant: if any non-trivial source uses approximate evidence +/// (Approx or Hybrid), the output must also be approximate; the base must +/// therefore come from an approximate source so its layers carry the right +/// evidence files. Among qualifying candidates, the largest (by unitig size) +/// is chosen to minimise the number of new smers in the merge layer. +fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize { + let needs_approx = sources.iter().any(|src| { + !is_trivial(src, mode) + && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. }) + }); + + sources.iter().enumerate() + .filter(|(_, src)| { + !needs_approx + || matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. }) + }) + .max_by_key(|(_, src)| index_unitig_size(src)) + .map(|(i, _)| i) + .unwrap() } fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> { diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index 6107f0b..b8673a2 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntVecBuilder}; use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; -use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError}; +use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError}; use obilayeredmap::meta::PartitionMeta; use crate::partition::KmerPartition; @@ -47,22 +47,22 @@ impl ColBuilder { pub(crate) enum SrcLayerData { /// Pure set-membership layer (no data matrix): every kmer is present in all genomes. SetMembership, - Presence(MphfLayer, PersistentBitMatrix), - Count(MphfLayer, PersistentCompactIntMatrix), + Presence(MphfOnly, PersistentBitMatrix), + Count(MphfOnly, PersistentCompactIntMatrix), } impl SrcLayerData { - pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult { + pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult { let presence_dir = layer_dir.join("presence"); let counts_dir = layer_dir.join("counts"); match merge_mode { MergeMode::Presence => { if presence_dir.exists() { - let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; + let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?; let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Presence(mphf, mat)) } else if counts_dir.exists() { - let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; + let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -71,7 +71,7 @@ impl SrcLayerData { } MergeMode::Count => { if counts_dir.exists() { - let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; + let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -82,22 +82,16 @@ impl SrcLayerData { } /// Return one value per source genome for `kmer`. + /// The caller guarantees `kmer` is in the source MPHF domain. + #[inline] pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec { match self { SrcLayerData::SetMembership => vec![1u32; n_genomes], SrcLayerData::Presence(mphf, mat) => { - if let Some(slot) = mphf.find(kmer) { - mat.row(slot).iter().map(|&b| b as u32).collect() - } else { - vec![0u32; n_genomes] - } + mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect() } SrcLayerData::Count(mphf, mat) => { - if let Some(slot) = mphf.find(kmer) { - mat.row(slot).iter().copied().collect() - } else { - vec![0u32; n_genomes] - } + mat.row(mphf.index(kmer)).iter().copied().collect() } } } @@ -161,6 +155,7 @@ impl KmerPartition { mode: MergeMode, n_dst_genomes: usize, block_bits: u8, + evidence: &IndexMode, ) -> SKResult<()> { let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR); if !dst_index_dir.exists() { @@ -208,7 +203,7 @@ impl KmerPartition { let new_layer_idx = n_dst_layers; let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}")); - if any_new { + let n_new = if any_new { g.compute_degrees(); fs::create_dir_all(&new_layer_dir)?; let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?; @@ -216,16 +211,18 @@ impl KmerPartition { uw.write(&unitig)?; } uw.close()?; - Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?; - } + Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?; + g.len() + } else { + 0 + }; drop(g); let new_mphf = if any_new { - Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?) + Some(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?) } else { None }; - let n_new = new_mphf.as_ref().map_or(0, |m| m.n()); // ── Prepare matrix directories for the new layer ────────────────────── // Absent columns (dst genomes) are written via append_column (all-zero/false). @@ -303,7 +300,7 @@ impl KmerPartition { for l in 0..src_meta.n_layers { let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?; - let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { let values = src_data.lookup(kmer, *src_n); @@ -312,9 +309,8 @@ impl KmerPartition { if let Some((dst_layer, hit)) = dst_map.query(kmer) { exist_builders[dst_layer][builder_idx].set_val(hit.slot, value); } else if let Some(ref mphf) = new_mphf { - if let Some(slot) = mphf.find(kmer) { - new_src_builders[builder_idx].set_val(slot, value); - } + let slot = mphf.index(kmer); + new_src_builders[builder_idx].set_val(slot, value); } } } diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index 907a580..ed6732a 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -117,7 +117,7 @@ impl KmerPartition { if !unitigs_path.exists() { continue; } let reader = UnitigFileReader::open_sequential(&unitigs_path)?; - let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { let row = src_data.lookup(kmer, n_genomes); @@ -182,7 +182,7 @@ impl KmerPartition { if !unitigs_path.exists() { continue; } let reader = UnitigFileReader::open_sequential(&unitigs_path)?; - let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { let row = src_data.lookup(kmer, n_genomes); diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs index 9091661..7dba41c 100644 --- a/src/obikseq/src/kmer.rs +++ b/src/obikseq/src/kmer.rs @@ -1,7 +1,8 @@ -//! Compact 2-bit kmer stored as a left-aligned u64. +//! Compact 2-bit kmer stored as a left-aligned [`RawKmer`]. //! -//! Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i. -//! The low 64−2·len bits are always zero. +//! Nucleotide 0 occupies bits KMER_BITS−1 and KMER_BITS−2, nucleotide i occupies +//! bits KMER_BITS−1−2i and KMER_BITS−2−2i. +//! The low `KMER_BITS − 2·len` bits are always zero. //! //! The length is not stored in the struct — it is supplied by the [`KmerLength`] //! type parameter. Two public marker types cover the common cases: @@ -15,6 +16,17 @@ //! Tests that need a fixed length independent of the global singletons can use //! [`ConstLen`]. +/// Underlying storage type for a raw (2-bit-per-base, left-aligned) kmer. +/// +/// All kmer bit-width constants (`KMER_BITS`) and shift calculations are derived +/// from this type. To extend the maximum supported k beyond 32, change this alias +/// to `u128` and update `KMER_BITS` — callers that use `RawKmer` directly will +/// then pick up the wider type automatically. +pub type RawKmer = u64; + +/// Bit width of [`RawKmer`]: the number of bits available for nucleotide encoding. +pub const KMER_BITS: usize = 64; + use serde::Serialize; use std::io::{self, Write}; use std::marker::PhantomData; @@ -109,35 +121,35 @@ impl Annotation for KmerAnnotation {} // ── KmerOf ──────────────────────────────────────────────────────────────────── -/// A DNA kmer of length `L::len()` encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first). -/// The low `64 − 2·L::len()` bits are always zero. +/// A DNA kmer of length `L::len()` encoded as a left-aligned [`RawKmer`] (2 bits/nucleotide, MSB-first). +/// The low `KMER_BITS − 2·L::len()` bits are always zero. #[repr(transparent)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct KmerOf(u64, PhantomData); +pub struct KmerOf(RawKmer, PhantomData); impl KmerOf { - /// Wrap a raw left-aligned u64 value as a kmer. + /// Wrap a raw left-aligned [`RawKmer`] value as a kmer. #[inline] - pub const fn from_raw(raw: u64) -> Self { + pub const fn from_raw(raw: RawKmer) -> Self { KmerOf(raw, PhantomData) } - /// Wrap a raw right-aligned u64 value, shifting it into left-aligned position. + /// Wrap a raw right-aligned [`RawKmer`] value, shifting it into left-aligned position. #[inline] - pub fn from_raw_right(raw: u64) -> Self { - KmerOf(raw << (64 - 2 * L::len()), PhantomData) + pub fn from_raw_right(raw: RawKmer) -> Self { + KmerOf(raw << (KMER_BITS - 2 * L::len()), PhantomData) } - /// Return the raw left-aligned u64 value. + /// Return the raw left-aligned [`RawKmer`] value. #[inline] - pub fn raw(&self) -> u64 { + pub fn raw(&self) -> RawKmer { self.0 } - /// Return the raw right-aligned u64 value. + /// Return the raw right-aligned [`RawKmer`] value. #[inline] - pub fn raw_right(&self) -> u64 { - self.0 >> (64 - 2 * L::len()) + pub fn raw_right(&self) -> RawKmer { + self.0 >> (KMER_BITS - 2 * L::len()) } /// Encode the first `L::len()` nucleotides of an ASCII slice into a kmer. @@ -153,11 +165,11 @@ impl KmerOf { seql: ascii.len(), }); } - let mut val = 0u64; + let mut val: RawKmer = 0; for i in 0..k { - val = (val << 2) | encode_base(ascii[i]) as u64; + val = (val << 2) | encode_base(ascii[i]) as RawKmer; } - Ok(KmerOf(val << (64 - 2 * k), PhantomData)) + Ok(KmerOf(val << (KMER_BITS - 2 * k), PhantomData)) } /// Decode into a freshly allocated ASCII `Vec`. @@ -193,27 +205,27 @@ impl KmerOf { let x = x.swap_bytes(); let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); - KmerOf(x << (64 - 2 * k), PhantomData) + KmerOf(x << (KMER_BITS - 2 * k), PhantomData) } /// Slide the window one base to the right: drop nucleotide 0, append `nuc` at position `L::len()-1`. pub fn push_right(self, nuc: u8) -> Self { let k = L::len(); - let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1))); - KmerOf(shifted | ((nuc as u64 & 3) << (64 - 2 * k)), PhantomData) + let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1))); + KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2 * k)), PhantomData) } /// Slide the window one base to the left: drop nucleotide `L::len()-1`, prepend `nuc` at position 0. pub fn push_left(self, nuc: u8) -> Self { let k = L::len(); - let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k)); - KmerOf(shifted | ((nuc as u64 & 3) << 62), PhantomData) + let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k)); + KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2)), PhantomData) } /// Returns `true` if the last `L::len()-1` nucleotides of `self` equal the first `L::len()-1` of `other`. pub fn is_overlapping(self, other: Self) -> bool { let k = L::len(); - let mask = !0u64 << (64 - 2 * (k - 1)); + let mask = RawKmer::MAX << (KMER_BITS - 2 * (k - 1)); (self.0 << 2 & mask) == (other.0 & mask) } } @@ -231,7 +243,7 @@ impl Sequence for KmerOf { #[inline] fn nucleotide(&self, i: usize) -> u8 { - ((self.0 >> (62 - 2 * i)) & 0b11) as u8 + ((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8 } #[inline] @@ -255,21 +267,21 @@ impl Sequence for KmerOf { /// [`CanonicalKmerOf::from_raw_unchecked`] (trusted paths such as deserialisation). #[repr(transparent)] #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct CanonicalKmerOf(u64, PhantomData); +pub struct CanonicalKmerOf(RawKmer, PhantomData); impl CanonicalKmerOf { - /// Wrap a raw left-aligned u64 without verifying the canonical invariant. + /// Wrap a raw left-aligned [`RawKmer`] without verifying the canonical invariant. /// /// # Safety (logical) /// The caller must guarantee `raw == min(raw, revcomp(raw))`. #[inline] - pub fn from_raw_unchecked(raw: u64) -> Self { + pub fn from_raw_unchecked(raw: RawKmer) -> Self { CanonicalKmerOf(raw, PhantomData) } - /// Return the raw left-aligned u64 value. + /// Return the raw left-aligned [`RawKmer`] value. #[inline] - pub fn raw(&self) -> u64 { + pub fn raw(&self) -> RawKmer { self.0 } @@ -307,25 +319,25 @@ impl CanonicalKmerOf { /// Return the four left canonical neighbours (each already canonical). pub fn left_canonical_neighbors(&self) -> [CanonicalKmerOf; 4] { let k = L::len(); - let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k)); + let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k)); [ KmerOf::(shifted, PhantomData).canonical(), - KmerOf::(shifted | (1u64 << 62), PhantomData).canonical(), - KmerOf::(shifted | (2u64 << 62), PhantomData).canonical(), - KmerOf::(shifted | (3u64 << 62), PhantomData).canonical(), + KmerOf::(shifted | ((1 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(), + KmerOf::(shifted | ((2 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(), + KmerOf::(shifted | ((3 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(), ] } /// Return the four right canonical neighbours (each already canonical). pub fn right_canonical_neighbors(&self) -> [CanonicalKmerOf; 4] { let k = L::len(); - let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1))); - let shift = 64 - 2 * k; + let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1))); + let shift = KMER_BITS - 2 * k; [ KmerOf::(shifted, PhantomData).canonical(), - KmerOf::(shifted | (1u64 << shift), PhantomData).canonical(), - KmerOf::(shifted | (2u64 << shift), PhantomData).canonical(), - KmerOf::(shifted | (3u64 << shift), PhantomData).canonical(), + KmerOf::(shifted | ((1 as RawKmer) << shift), PhantomData).canonical(), + KmerOf::(shifted | ((2 as RawKmer) << shift), PhantomData).canonical(), + KmerOf::(shifted | ((3 as RawKmer) << shift), PhantomData).canonical(), ] } @@ -349,7 +361,7 @@ impl Sequence for CanonicalKmerOf { #[inline] fn nucleotide(&self, i: usize) -> u8 { - ((self.0 >> (62 - 2 * i)) & 0b11) as u8 + ((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8 } fn canonical(&self) -> Self::Canonical { @@ -386,7 +398,7 @@ pub type Minimizer = CanonicalKmerOf; /// 64-bit representation directly, which is useful when the canonical /// invariant is not required or has already been handled. #[inline] -pub fn hash_kmer(raw: u64) -> u64 { +pub fn hash_kmer(raw: RawKmer) -> u64 { mix64(raw ^ 0x9e3779b97f4a7c15) } diff --git a/src/obikseq/src/lib.rs b/src/obikseq/src/lib.rs index 79625d1..626f90a 100644 --- a/src/obikseq/src/lib.rs +++ b/src/obikseq/src/lib.rs @@ -20,7 +20,7 @@ pub mod superkmer; pub mod unitig; pub use annotations::Annotation; -pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer}; +pub use kmer::{CanonicalKmer, Kmer, Minimizer, RawKmer, KMER_BITS, hash_kmer}; pub use packed_seq::MAX_KMERS_PER_CHUNK; pub use params::{k, m, set_k, set_m}; pub use routable::RoutableSuperKmer; diff --git a/src/obikseq/src/packed_seq.rs b/src/obikseq/src/packed_seq.rs index 601e667..f0888d4 100644 --- a/src/obikseq/src/packed_seq.rs +++ b/src/obikseq/src/packed_seq.rs @@ -18,7 +18,7 @@ use bitvec::prelude::*; use crate::Sequence; use crate::encoding::{DEC4, encode_base}; -use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer}; +use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer, RawKmer, KMER_BITS}; use crate::params::k; use crate::revcomp_lookup::REVCOMP4; @@ -198,8 +198,8 @@ impl PackedSeq { return Err(KmerError::OutOfBounds { position: i, k: len, seql }); } let bits = self.seq.view_bits::(); - let raw: u64 = bits[i * 2..(i + len) * 2].load_be(); - Ok(KmerOf::from_raw(raw << (64 - 2 * len))) + let raw: RawKmer = bits[i * 2..(i + len) * 2].load_be(); + Ok(KmerOf::from_raw(raw << (KMER_BITS - 2 * len))) } /// Extract the kmer of length `params::k()` at nucleotide position `i`. Zero allocation. @@ -322,9 +322,9 @@ impl AsRef for PackedSeq { /// sequence into the iterator, so no reference escapes the closure. pub struct PackedSeqKmerIter { seq: S, - mask: u64, + mask: RawKmer, lshift: usize, - current: u64, + current: RawKmer, pos: usize, max_pos: usize, } @@ -341,8 +341,8 @@ impl> PackedSeqKmerIter { let ps = seq.as_ref(); let seql = ps.seql(); let klen = k(); - let lshift = 64 - klen * 2; - let mask = ((!0u128) << (lshift + 2)) as u64; + let lshift = KMER_BITS - klen * 2; + let mask = ((!0u128) << (lshift + 2)) as RawKmer; let current = if seql >= klen { ps.extract::(0).map(|km| km.raw()).unwrap_or(0) } else { @@ -362,7 +362,7 @@ impl> Iterator for PackedSeqKmerIter { let result = Kmer::from_raw(self.current); if self.pos < self.max_pos { let inner_shift = 6 - 2 * (self.pos & 3); - let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as u64; + let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as RawKmer; self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift); } self.pos += 1; diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs index 9b275a0..65ae2e2 100644 --- a/src/obilayeredmap/src/lib.rs +++ b/src/obilayeredmap/src/lib.rs @@ -12,4 +12,4 @@ pub use layer::{Hit, Layer, LayerData}; pub use layered_store::LayeredStore; pub use map::LayeredMap; pub use meta::{IndexMode, PartitionMeta}; -pub use mphf_layer::MphfLayer; +pub use mphf_layer::{MphfLayer, MphfOnly}; diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index 877e564..ac6f33d 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -118,7 +118,7 @@ impl MphfLayer { } LayerEvidence::Approx { unitigs_path, .. } => { let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?; - for stored in reader.iter_canonical_kmers() { + for (stored, _, _) in reader.iter_indexed_canonical_kmers() { if self.mphf.index(&stored.raw()) == slot { return if stored == kmer { Some(slot) } else { None }; } @@ -129,7 +129,31 @@ impl MphfLayer { } pub fn n(&self) -> usize { self.n } +} +// ── MphfOnly ────────────────────────────────────────────────────────────────── + +/// Lightweight wrapper that loads only the MPHF file, without evidence or unitigs. +/// +/// Use this when the caller guarantees that all queried kmers are in the MPHF +/// domain (e.g. when iterating the source's own unitigs during merge). +pub struct MphfOnly(Mphf); + +impl MphfOnly { + pub fn open(dir: &Path) -> OLMResult { + let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + Ok(Self(mphf)) + } + + /// Return the slot for `kmer`. Only valid when `kmer` is in the MPHF domain. + #[inline] + pub fn index(&self, kmer: CanonicalKmer) -> usize { + self.0.index(&kmer.raw()) + } +} + +impl MphfLayer { // ── Build helpers ───────────────────────────────────────────────────────── pub fn unitig_writer(dir: &Path) -> OLMResult { @@ -196,7 +220,7 @@ impl MphfLayer { let mut fw = FingerprintVecWriter::new(n, b); - for kmer in unitigs.iter_canonical_kmers() { + for (kmer, _, _) in unitigs.iter_indexed_canonical_kmers() { let slot = mphf.index(&kmer.raw()); if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); @@ -281,7 +305,7 @@ impl MphfLayer { IndexMode::Approx { b, .. } => { let mut fw = FingerprintVecWriter::new(n, *b); - for kmer in unitigs2.iter_canonical_kmers() { + for (kmer, _, _) in unitigs2.iter_indexed_canonical_kmers() { let slot = mphf.index(&kmer.raw()); if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } let byte = slot / 8; let bit = 1u8 << (slot % 8); diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs index 308fb0d..8a7233e 100644 --- a/src/obilayeredmap/src/tests/layer.rs +++ b/src/obilayeredmap/src/tests/layer.rs @@ -2,7 +2,7 @@ use super::*; use obicompactvec::PersistentCompactIntMatrix; use obikseq::{set_k, Kmer, Sequence as _, Unitig}; use obiskio::DEFAULT_BLOCK_BITS; -use crate::meta::EvidenceKind; +use crate::meta::IndexMode; use tempfile::tempdir; fn write_unitigs(dir: &Path, seqs: &[&[u8]]) { @@ -15,7 +15,8 @@ fn write_unitigs(dir: &Path, seqs: &[&[u8]]) { fn all_canonical_kmers(dir: &Path) -> Vec { UnitigFileReader::open_sequential(&dir.join(UNITIGS_FILE)).unwrap() - .iter_canonical_kmers() + .iter_indexed_canonical_kmers() + .map(|(kmer, _, _)| kmer) .collect() } @@ -25,8 +26,8 @@ fn build_and_query_all_kmers_found() { let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); let kmers = all_canonical_kmers(dir.path()); - Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap(); - let layer = Layer::<()>::open(dir.path()).unwrap(); + Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap(); + let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap(); for kmer in kmers { assert!(layer.query(kmer).is_some(), "kmer should be present"); } @@ -43,10 +44,10 @@ fn counts_are_stored_and_retrieved() { Layer::::build( dir.path(), DEFAULT_BLOCK_BITS, - &EvidenceKind::Exact, + &IndexMode::Exact, |kmer| count_map.get(&kmer).copied().unwrap_or(0), ).unwrap(); - let layer = Layer::::open(dir.path()).unwrap(); + let layer = Layer::::open(dir.path(), &IndexMode::Exact).unwrap(); for kmer in &kmers { let hit = layer.query(*kmer).expect("kmer must be present"); assert_eq!(hit.data[0], count_map[kmer]); @@ -58,8 +59,8 @@ fn query_absent_returns_none() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap(); - let layer = Layer::<()>::open(dir.path()).unwrap(); + Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap(); + let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap(); let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical(); assert!(layer.query(absent).is_none()); } @@ -69,9 +70,9 @@ fn open_after_build_is_consistent() { set_k(4); let dir = tempdir().unwrap(); write_unitigs(dir.path(), &[b"AAAACGT"]); - let n = Layer::::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, |_| 7).unwrap(); + let n = Layer::::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact, |_| 7).unwrap(); assert_eq!(n, 4); - let layer = Layer::::open(dir.path()).unwrap(); + let layer = Layer::::open(dir.path(), &IndexMode::Exact).unwrap(); let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical(); let hit = layer.query(kmer).expect("AAAA must be present"); assert_eq!(hit.data[0], 7); diff --git a/src/obilayeredmap/src/tests/map.rs b/src/obilayeredmap/src/tests/map.rs index 30f9530..d8e2ce8 100644 --- a/src/obilayeredmap/src/tests/map.rs +++ b/src/obilayeredmap/src/tests/map.rs @@ -1,6 +1,7 @@ use super::*; use obicompactvec::PersistentCompactIntMatrix; use obikseq::{set_k, Sequence as _, Unitig}; +use crate::meta::IndexMode; use tempfile::tempdir; fn push_unitigs_and_layer( @@ -24,7 +25,7 @@ fn canonical(ascii: &[u8]) -> CanonicalKmer { fn create_empty_map() { set_k(4); let dir = tempdir().unwrap(); - let map = LayeredMap::<()>::create(dir.path()).unwrap(); + let map = LayeredMap::<()>::create(dir.path(), IndexMode::Exact).unwrap(); assert_eq!(map.n_layers(), 0); } @@ -33,7 +34,7 @@ fn open_reloads_layer_count() { set_k(4); let dir = tempdir().unwrap(); { - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path(), IndexMode::Exact).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); } let map = LayeredMap::::open(dir.path()).unwrap(); @@ -44,7 +45,7 @@ fn open_reloads_layer_count() { fn query_finds_kmer_in_layer_zero() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path(), IndexMode::Exact).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3); let kmer = canonical(b"AAAC"); let (layer_idx, hit) = map.query(kmer).expect("kmer must be found"); @@ -56,7 +57,7 @@ fn query_finds_kmer_in_layer_zero() { fn query_finds_kmer_in_correct_layer() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path(), IndexMode::Exact).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2); assert_eq!(map.n_layers(), 2); @@ -74,7 +75,7 @@ fn query_finds_kmer_in_correct_layer() { fn query_absent_returns_none() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path(), IndexMode::Exact).unwrap(); push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1); let absent = canonical(b"CCCC"); assert!(map.query(absent).is_none()); @@ -84,7 +85,7 @@ fn query_absent_returns_none() { fn push_layer_from_map_convenience() { set_k(4); let dir = tempdir().unwrap(); - let mut map = LayeredMap::::create(dir.path()).unwrap(); + let mut map = LayeredMap::::create(dir.path(), IndexMode::Exact).unwrap(); let mut w = map.next_layer_writer().unwrap(); w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap(); w.close().unwrap(); diff --git a/src/obiskio/src/tests/unitig_index.rs b/src/obiskio/src/tests/unitig_index.rs index 5553357..f4f1ffb 100644 --- a/src/obiskio/src/tests/unitig_index.rs +++ b/src/obiskio/src/tests/unitig_index.rs @@ -181,30 +181,6 @@ fn iter_kmers_two_chunks_order() { } } -// ── iter_canonical_kmers ────────────────────────────────────────────────────── - -#[test] -fn iter_canonical_kmers_all_canonical() { - set_k(4); - let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); - for kmer in r.iter_canonical_kmers() { - // canonical of a canonical kmer is itself - assert_eq!(kmer.raw(), kmer.canonical().raw()); - } -} - -#[test] -fn iter_canonical_kmers_matches_iter_kmers() { - set_k(4); - let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); - let canonical: Vec = r.iter_canonical_kmers().collect(); - let raw: Vec = r.iter_kmers().collect(); - assert_eq!(canonical.len(), raw.len()); - for (ck, rk) in canonical.iter().zip(raw.iter()) { - assert_eq!(ck.raw(), rk.canonical().raw()); - } -} - // ── iter_indexed_canonical_kmers ────────────────────────────────────────────── #[test] diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index fd46892..3320500 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -370,11 +370,6 @@ impl UnitigFileReader { .flat_map(|(_, u)| u.into_kmers()) } - pub fn iter_canonical_kmers(&self) -> impl Iterator + '_ { - self.iter_chunks_sequential() - .flat_map(|(_, u)| u.into_canonical_kmers()) - } - pub fn iter_indexed_canonical_kmers( &self, ) -> impl Iterator + '_ {