refactor: switch indexing to IndexMode and update metadata

Replace EvidenceKind with IndexMode (Exact, Approx, Hybrid) across layer construction and query dispatch. Update PartitionMeta and LayerMeta serialization to centralize index-wide configuration. Add flexible push_layer overloads to LayeredMap for dynamic index expansion without full rebuilds. Improve UnitigFileReader to gracefully fall back to sequential scanning when indexes are missing, eliminating panics.
This commit is contained in:
Eric Coissac
2026-05-26 10:04:25 +02:00
parent 1d880fdc5f
commit 7501b6e854
9 changed files with 284 additions and 315 deletions
+81 -50
View File
@@ -198,11 +198,16 @@ pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> {
// ── Reader ────────────────────────────────────────────────────────────────────
/// Read-only random-access view of a unitig file.
/// Memory-mapped view of a unitig file, with optional direct-access index.
///
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
/// reads. Sequential iteration: O(n) via a running-offset cursor.
/// Three constructors select the operating mode:
/// - [`open`](Self::open) — smart default: direct access if `.idx` exists, sequential otherwise.
/// - [`open_sequential`](Self::open_sequential) — always sequential, ignores `.idx`.
/// - [`open_direct_access`](Self::open_direct_access) — requires `.idx`, errors if absent.
///
/// All positional methods (`chunk_start`, `verify_canonical_kmer`, …) work in
/// both modes. Without `.idx` they fall back to an O(i) sequential scan —
/// correct but slower.
pub struct UnitigFileReader {
mmap: Mmap,
block_offsets: Vec<u32>,
@@ -214,8 +219,52 @@ pub struct UnitigFileReader {
}
impl UnitigFileReader {
/// Open with `.idx` — enables both sequential iteration and random access.
/// Open `path` in the best mode available on disk.
///
/// Delegates to [`Self::open_direct_access`] when the companion index file
/// (located via `idx_path`) exists, and to [`Self::open_sequential`] when it
/// does not.
///
/// # Errors
/// Propagates whatever error the selected constructor returns.
pub fn open(path: &Path) -> SKResult<Self> {
    let has_index = idx_path(path).exists();
    if !has_index {
        // No companion index on disk: use the scan-only constructor.
        return Self::open_sequential(path);
    }
    Self::open_direct_access(path)
}
/// Always sequential — never reads `.idx` even if present.
///
/// Scans the binary file once to count chunks and k-mers.
/// Positional access (`chunk_start`, `verify_canonical_kmer`) falls back to
/// O(i) sequential scan.
pub fn open_sequential(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let k = obikseq::params::k();
let mut offset = 0usize;
let mut n_unitigs = 0usize;
let mut n_kmers = 0usize;
while offset < mmap.len() {
let seql_minus_k = mmap[offset] as usize;
n_kmers += seql_minus_k + 1;
offset += 1 + (seql_minus_k + k + 3) / 4;
n_unitigs += 1;
}
Ok(Self {
mmap,
block_offsets: Vec::new(),
n_unitigs,
n_kmers,
k,
block_bits: DEFAULT_BLOCK_BITS,
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
})
}
/// Requires `.idx` — errors if the companion index file is absent.
///
/// Enables O(1 << block_bits) positional access to any chunk.
/// Use only when direct access is architecturally required (query-time
/// verification on an exact-evidence layer).
pub fn open_direct_access(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
@@ -231,58 +280,38 @@ impl UnitigFileReader {
})
}
/// Open without `.idx` — sequential iteration only, no random access.
///
/// Performs a single pass over the memory-mapped file to count chunks and
/// k-mers. Intended for callers that only need [`Self::iter_kmers`],
/// [`Self::iter_canonical_kmers`], or
/// [`Self::iter_indexed_canonical_kmers`].
///
/// # Errors
/// Returns `SKError::Io` if the file cannot be opened or memory-mapped.
pub fn open_sequential(path: &Path) -> SKResult<Self> {
    let file = File::open(path).map_err(SKError::Io)?;
    let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
    let k = obikseq::params::k();

    let total_bytes = mmap.len();
    let (mut cursor, mut n_unitigs, mut n_kmers) = (0usize, 0usize, 0usize);
    loop {
        if cursor >= total_bytes {
            break;
        }
        // Header byte stores `seq_len - k`; a record holds one more k-mer
        // than that value.
        let extra = mmap[cursor] as usize;
        n_kmers += extra + 1;
        // Skip the header byte plus the payload bytes.
        cursor += 1 + (extra + k + 3) / 4;
        n_unitigs += 1;
    }

    let block_bits = DEFAULT_BLOCK_BITS;
    let mask = (1usize << DEFAULT_BLOCK_BITS) - 1;
    Ok(Self {
        mmap,
        // Empty table: `has_direct_access()` reports false.
        block_offsets: Vec::new(),
        n_unitigs,
        n_kmers,
        k,
        block_bits,
        mask,
    })
}
/// Number of unitig records (chunks) in the file.
pub fn len(&self) -> usize {
    self.n_unitigs
}

/// `true` when the file holds no unitig record.
pub fn is_empty(&self) -> bool {
    self.len() == 0
}

/// Total number of k-mers across all records.
pub fn n_kmers(&self) -> usize {
    self.n_kmers
}

/// Number of low bits of a chunk index that address a position inside an
/// index block (the block number is `i >> block_bits`).
pub fn block_bits(&self) -> u8 {
    self.block_bits
}

/// `true` when a block offset table is loaded, i.e. the table is non-empty.
pub fn has_direct_access(&self) -> bool {
    let table_is_empty = self.block_offsets.is_empty();
    !table_is_empty
}
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
/// Byte offset of record `i` in the mmap.
///
/// Fast path (O(1 << block_bits)) when `.idx` is loaded; degraded O(i)
/// sequential scan otherwise.
#[inline]
fn chunk_start(&self, i: usize) -> usize {
assert!(!self.block_offsets.is_empty(),
"random access requires UnitigFileReader::open(); use open_sequential() for iteration only");
if self.block_bits == 0 {
return self.block_offsets[i] as usize;
if !self.block_offsets.is_empty() {
if self.block_bits == 0 {
return self.block_offsets[i] as usize;
}
let block = i >> self.block_bits;
let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
} else {
let mut offset = 0usize;
for _ in 0..i {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
}
let block = i >> self.block_bits;
let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
}
/// Nucleotide length of chunk `i`.
@@ -307,7 +336,9 @@ impl UnitigFileReader {
extract_kmer_raw(&self.mmap[offset + 1..], j, self.k)
}
/// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical).
/// `true` iff the k-mer at position `j` of chunk `i` matches `query`.
///
/// Works in both modes; O(i) scan when `.idx` is absent.
#[inline]
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()