refactor: decouple unitig index generation and add exact evidence
Decouple index generation by introducing `build_unitig_idx()` for retroactive `.idx` creation and optional immediate writing on close. Add `open_sequential()` for index-less iteration while enforcing index requirements for random access. Refactor the MPHF layer to pre-generate the unitig index for parallel random access, integrate `rayon` for k-mer processing, and enforce mapping integrity via duplicate slot validation. Additionally, implement `build_exact_evidence()` to reconstruct evidence from existing artifacts, and update tests to leverage the new index generation and simplified k-mer iteration helpers.
This commit is contained in:
@@ -39,7 +39,6 @@ fn idx_path(path: &Path) -> PathBuf {
|
||||
/// Unitigs with more than [`MAX_KMERS_PER_CHUNK`] k-mers are transparently split
|
||||
/// into overlapping chunks (k−1 nucleotide overlap) so no k-mer is lost.
|
||||
pub struct UnitigFileWriter {
|
||||
path: PathBuf,
|
||||
file: BufWriter<File>,
|
||||
block_offsets: Vec<u32>,
|
||||
chunk_count: usize,
|
||||
@@ -64,7 +63,6 @@ impl UnitigFileWriter {
|
||||
assert!(block_bits <= 31, "block_bits must be ≤ 31");
|
||||
let file = File::create(path).map_err(SKError::Io)?;
|
||||
Ok(Self {
|
||||
path: path.to_owned(),
|
||||
file: BufWriter::new(file),
|
||||
block_offsets: Vec::new(),
|
||||
chunk_count: 0,
|
||||
@@ -122,19 +120,14 @@ impl UnitigFileWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush and close the binary sequence file.
|
||||
///
|
||||
/// The companion `.idx` file is **not** written here; call
|
||||
/// [`build_unitig_idx`] separately when exact evidence is needed.
|
||||
pub fn close(mut self) -> SKResult<()> {
|
||||
self.file.flush().map_err(SKError::Io)?;
|
||||
drop(self.file);
|
||||
|
||||
self.block_offsets.push(self.next_offset);
|
||||
|
||||
write_idx(
|
||||
&idx_path(&self.path),
|
||||
self.chunk_count as u32,
|
||||
self.n_kmers as u64,
|
||||
self.block_bits,
|
||||
&self.block_offsets,
|
||||
)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns the current value of this writer's `chunk_count` field.
pub fn len(&self) -> usize {
    self.chunk_count
}
|
||||
@@ -160,6 +153,48 @@ fn write_idx(
|
||||
w.flush().map_err(SKError::Io)
|
||||
}
|
||||
|
||||
/// Scan an existing `unitigs.bin` file and write its companion `.idx`.
|
||||
///
|
||||
/// Called by the exact-evidence construction route after the sequence file is
|
||||
/// closed. `block_bits` controls index granularity (1 << block_bits chunks per
|
||||
/// offset entry); use [`DEFAULT_BLOCK_BITS`] for the default.
|
||||
pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> {
|
||||
assert!(block_bits <= 31, "block_bits must be ≤ 31");
|
||||
|
||||
let file = File::open(unitigs_path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
|
||||
let k = obikseq::params::k();
|
||||
let block_size = 1usize << block_bits;
|
||||
let mask = block_size - 1;
|
||||
|
||||
let mut block_offsets: Vec<u32> = Vec::new();
|
||||
let mut offset = 0usize;
|
||||
let mut chunk_count = 0usize;
|
||||
let mut n_kmers = 0usize;
|
||||
|
||||
while offset < mmap.len() {
|
||||
if chunk_count & mask == 0 {
|
||||
block_offsets.push(offset as u32);
|
||||
}
|
||||
let seql_minus_k = mmap[offset] as usize;
|
||||
let byte_len = (seql_minus_k + k + 3) / 4;
|
||||
n_kmers += seql_minus_k + 1;
|
||||
offset += 1 + byte_len;
|
||||
chunk_count += 1;
|
||||
}
|
||||
|
||||
block_offsets.push(offset as u32); // sentinel
|
||||
|
||||
write_idx(
|
||||
&idx_path(unitigs_path),
|
||||
chunk_count as u32,
|
||||
n_kmers as u64,
|
||||
block_bits,
|
||||
&block_offsets,
|
||||
)
|
||||
}
|
||||
|
||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Read-only random-access view of a unitig file.
|
||||
@@ -178,6 +213,7 @@ pub struct UnitigFileReader {
|
||||
}
|
||||
|
||||
impl UnitigFileReader {
|
||||
/// Open with `.idx` — enables both sequential iteration and random access.
|
||||
pub fn open(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
@@ -194,6 +230,37 @@ impl UnitigFileReader {
|
||||
})
|
||||
}
|
||||
|
||||
/// Open without `.idx` — sequential iteration only, no random access.
|
||||
///
|
||||
/// Scans the binary file once to count chunks and k-mers. Use when only
|
||||
/// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or
|
||||
/// [`Self::iter_indexed_canonical_kmers`] are needed.
|
||||
pub fn open_sequential(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||
let k = obikseq::params::k();
|
||||
|
||||
let mut offset = 0usize;
|
||||
let mut n_unitigs = 0usize;
|
||||
let mut n_kmers = 0usize;
|
||||
while offset < mmap.len() {
|
||||
let seql_minus_k = mmap[offset] as usize;
|
||||
n_kmers += seql_minus_k + 1;
|
||||
offset += 1 + (seql_minus_k + k + 3) / 4;
|
||||
n_unitigs += 1;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
mmap,
|
||||
block_offsets: Vec::new(), // empty → random access disabled
|
||||
n_unitigs,
|
||||
n_kmers,
|
||||
k,
|
||||
block_bits: DEFAULT_BLOCK_BITS,
|
||||
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the value of this reader's `n_unitigs` field.
pub fn len(&self) -> usize {
    self.n_unitigs
}
|
||||
/// Returns `true` when this reader's `n_unitigs` field is zero.
pub fn is_empty(&self) -> bool {
    self.n_unitigs == 0
}
|
||||
/// Returns the value of this reader's `n_kmers` field.
pub fn n_kmers(&self) -> usize {
    self.n_kmers
}
|
||||
@@ -202,6 +269,8 @@ impl UnitigFileReader {
|
||||
/// Byte offset of the START of record `i` (the seql byte) in the mmap.
|
||||
#[inline]
|
||||
fn chunk_start(&self, i: usize) -> usize {
|
||||
assert!(!self.block_offsets.is_empty(),
|
||||
"random access requires UnitigFileReader::open(); use open_sequential() for iteration only");
|
||||
let block = i >> self.block_bits;
|
||||
let rem = i & self.mask;
|
||||
let mut offset = self.block_offsets[block] as usize;
|
||||
|
||||
Reference in New Issue
Block a user