refactor: switch indexing to IndexMode and update metadata
Replace EvidenceKind with IndexMode (Exact, Approx, Hybrid) across layer construction and query dispatch. Update PartitionMeta and LayerMeta serialization to centralize index-wide configuration. Add flexible push_layer overloads to LayeredMap for dynamic index expansion without full rebuilds. Improve UnitigFileReader to gracefully fall back to sequential scanning when indexes are missing, eliminating panics.
This commit is contained in:
@@ -198,11 +198,16 @@ pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> {
|
||||
|
||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Read-only random-access view of a unitig file.
|
||||
/// Memory-mapped view of a unitig file, with optional direct-access index.
|
||||
///
|
||||
/// The sequence file is memory-mapped; the block offset table is loaded into RAM
|
||||
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap
|
||||
/// reads. Sequential iteration: O(n) via a running-offset cursor.
|
||||
/// Three constructors select the operating mode:
|
||||
/// - [`open`](Self::open) — smart default: direct access if `.idx` exists, sequential otherwise.
|
||||
/// - [`open_sequential`](Self::open_sequential) — always sequential, ignores `.idx`.
|
||||
/// - [`open_direct_access`](Self::open_direct_access) — requires `.idx`, errors if absent.
|
||||
///
|
||||
/// All positional methods (`chunk_start`, `verify_canonical_kmer`, …) work in
|
||||
/// both modes. Without `.idx` they fall back to an O(i) sequential scan —
|
||||
/// correct but slower.
|
||||
pub struct UnitigFileReader {
|
||||
mmap: Mmap,
|
||||
block_offsets: Vec<u32>,
|
||||
@@ -214,8 +219,52 @@ pub struct UnitigFileReader {
|
||||
}
|
||||
|
||||
impl UnitigFileReader {
|
||||
/// Open with `.idx` — enables both sequential iteration and random access.
|
||||
/// Smart default: opens with direct access if `.idx` is present, sequential otherwise.
|
||||
pub fn open(path: &Path) -> SKResult<Self> {
|
||||
if idx_path(path).exists() {
|
||||
Self::open_direct_access(path)
|
||||
} else {
|
||||
Self::open_sequential(path)
|
||||
}
|
||||
}
|
||||
|
||||
/// Always sequential — never reads `.idx` even if present.
|
||||
///
|
||||
/// Scans the binary file once to count chunks and k-mers.
|
||||
/// Positional access (`chunk_start`, `verify_canonical_kmer`) falls back to
|
||||
/// O(i) sequential scan.
|
||||
pub fn open_sequential(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let k = obikseq::params::k();
|
||||
|
||||
let mut offset = 0usize;
|
||||
let mut n_unitigs = 0usize;
|
||||
let mut n_kmers = 0usize;
|
||||
while offset < mmap.len() {
|
||||
let seql_minus_k = mmap[offset] as usize;
|
||||
n_kmers += seql_minus_k + 1;
|
||||
offset += 1 + (seql_minus_k + k + 3) / 4;
|
||||
n_unitigs += 1;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
mmap,
|
||||
block_offsets: Vec::new(),
|
||||
n_unitigs,
|
||||
n_kmers,
|
||||
k,
|
||||
block_bits: DEFAULT_BLOCK_BITS,
|
||||
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
/// Requires `.idx` — errors if the companion index file is absent.
|
||||
///
|
||||
/// Enables O(1 << block_bits) positional access to any chunk.
|
||||
/// Use only when direct access is architecturally required (query-time
|
||||
/// verification on an exact-evidence layer).
|
||||
pub fn open_direct_access(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
|
||||
@@ -231,58 +280,38 @@ impl UnitigFileReader {
|
||||
})
|
||||
}
|
||||
|
||||
/// Open without `.idx` — sequential iteration only, no random access.
|
||||
///
|
||||
/// Scans the binary file once to count chunks and k-mers. Use when only
|
||||
/// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or
|
||||
/// [`Self::iter_indexed_canonical_kmers`] are needed.
|
||||
pub fn open_sequential(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let k = obikseq::params::k();
|
||||
|
||||
let mut offset = 0usize;
|
||||
let mut n_unitigs = 0usize;
|
||||
let mut n_kmers = 0usize;
|
||||
while offset < mmap.len() {
|
||||
let seql_minus_k = mmap[offset] as usize;
|
||||
n_kmers += seql_minus_k + 1;
|
||||
offset += 1 + (seql_minus_k + k + 3) / 4;
|
||||
n_unitigs += 1;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
mmap,
|
||||
block_offsets: Vec::new(), // empty → random access disabled
|
||||
n_unitigs,
|
||||
n_kmers,
|
||||
k,
|
||||
block_bits: DEFAULT_BLOCK_BITS,
|
||||
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n_unitigs }
|
||||
pub fn is_empty(&self) -> bool { self.n_unitigs == 0 }
|
||||
pub fn n_kmers(&self) -> usize { self.n_kmers }
|
||||
pub fn block_bits(&self) -> u8 { self.block_bits }
|
||||
pub fn has_direct_access(&self) -> bool { !self.block_offsets.is_empty() }
|
||||
|
||||
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
|
||||
/// Byte offset of record `i` in the mmap.
|
||||
///
|
||||
/// Fast path (O(1 << block_bits)) when `.idx` is loaded; degraded O(i)
|
||||
/// sequential scan otherwise.
|
||||
#[inline]
|
||||
fn chunk_start(&self, i: usize) -> usize {
|
||||
assert!(!self.block_offsets.is_empty(),
|
||||
"random access requires UnitigFileReader::open(); use open_sequential() for iteration only");
|
||||
if self.block_bits == 0 {
|
||||
return self.block_offsets[i] as usize;
|
||||
if !self.block_offsets.is_empty() {
|
||||
if self.block_bits == 0 {
|
||||
return self.block_offsets[i] as usize;
|
||||
}
|
||||
let block = i >> self.block_bits;
|
||||
let rem = i & self.mask;
|
||||
let mut offset = self.block_offsets[block] as usize;
|
||||
for _ in 0..rem {
|
||||
let seql_minus_k = self.mmap[offset] as usize;
|
||||
offset += 1 + (seql_minus_k + self.k + 3) / 4;
|
||||
}
|
||||
offset
|
||||
} else {
|
||||
let mut offset = 0usize;
|
||||
for _ in 0..i {
|
||||
let seql_minus_k = self.mmap[offset] as usize;
|
||||
offset += 1 + (seql_minus_k + self.k + 3) / 4;
|
||||
}
|
||||
offset
|
||||
}
|
||||
let block = i >> self.block_bits;
|
||||
let rem = i & self.mask;
|
||||
let mut offset = self.block_offsets[block] as usize;
|
||||
for _ in 0..rem {
|
||||
let seql_minus_k = self.mmap[offset] as usize;
|
||||
offset += 1 + (seql_minus_k + self.k + 3) / 4;
|
||||
}
|
||||
offset
|
||||
}
|
||||
|
||||
/// Nucleotide length of chunk `i`.
|
||||
@@ -307,7 +336,9 @@ impl UnitigFileReader {
|
||||
extract_kmer_raw(&self.mmap[offset + 1..], j, self.k)
|
||||
}
|
||||
|
||||
/// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical).
|
||||
/// `true` iff the k-mer at position `j` of chunk `i` matches `query`.
|
||||
///
|
||||
/// Works in both modes; O(i) scan when `.idx` is absent.
|
||||
#[inline]
|
||||
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
|
||||
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
|
||||
|
||||
Reference in New Issue
Block a user