feat: add configurable block sizes and in-place reindex command
Propagate configurable block size (`block_bits`) through index and layer construction to control unitig chunking and optimize memory/performance trade-offs. Introduce an in-place `reindex` command and library method to convert indices between exact and approximate evidence formats. Add validation to reject merging indexes with mismatched evidence types, and update parallel kmer counting to use `AtomicUsize` for thread-safe aggregation. Includes CLI argument parsing, metadata persistence, and updated tests.
This commit is contained in:
@@ -18,6 +18,8 @@ pub enum OKIError {
|
|||||||
DuplicateGenomeLabel(String),
|
DuplicateGenomeLabel(String),
|
||||||
/// Operation not valid for this index configuration.
|
/// Operation not valid for this index configuration.
|
||||||
InvalidInput(String),
|
InvalidInput(String),
|
||||||
|
/// Sources mix exact and approximate evidence, or use incompatible approx parameters.
|
||||||
|
IncompatibleEvidence(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type OKIResult<T> = Result<T, OKIError>;
|
pub type OKIResult<T> = Result<T, OKIError>;
|
||||||
@@ -33,6 +35,7 @@ impl fmt::Display for OKIError {
|
|||||||
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
OKIError::MismatchedMode => write!(f, "count mode requires all sources to have with_counts=true"),
|
||||||
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
|
OKIError::DuplicateGenomeLabel(l) => write!(f, "duplicate genome label across sources: {l}"),
|
||||||
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
|
OKIError::InvalidInput(m) => write!(f, "invalid input: {m}"),
|
||||||
|
OKIError::IncompatibleEvidence(m) => write!(f, "incompatible evidence: {m}"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -145,6 +145,7 @@ impl KmerIndex {
|
|||||||
let t = Stage::start("index");
|
let t = Stage::start("index");
|
||||||
let with_counts = self.meta.config.with_counts;
|
let with_counts = self.meta.config.with_counts;
|
||||||
let evidence = self.meta.config.evidence.clone();
|
let evidence = self.meta.config.evidence.clone();
|
||||||
|
let block_bits = self.meta.config.block_bits;
|
||||||
let total_kmers = AtomicUsize::new(0);
|
let total_kmers = AtomicUsize::new(0);
|
||||||
|
|
||||||
let pb = Arc::new(Mutex::new(
|
let pb = Arc::new(Mutex::new(
|
||||||
@@ -154,7 +155,7 @@ impl KmerIndex {
|
|||||||
));
|
));
|
||||||
|
|
||||||
(0..n).into_par_iter().for_each(|i| {
|
(0..n).into_par_iter().for_each(|i| {
|
||||||
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) {
|
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) {
|
||||||
Ok(0) => {}
|
Ok(0) => {}
|
||||||
Ok(n_kmers) => {
|
Ok(n_kmers) => {
|
||||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ mod dump;
|
|||||||
mod index;
|
mod index;
|
||||||
mod merge;
|
mod merge;
|
||||||
mod rebuild;
|
mod rebuild;
|
||||||
|
mod reindex;
|
||||||
|
|
||||||
pub use error::{OKIError, OKIResult};
|
pub use error::{OKIError, OKIResult};
|
||||||
pub use distance::{DistanceMetric, DistanceOutput};
|
pub use distance::{DistanceMetric, DistanceOutput};
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ use obisys::{Reporter, Stage};
|
|||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
|
use obilayeredmap::EvidenceKind;
|
||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::index::KmerIndex;
|
use crate::index::KmerIndex;
|
||||||
use crate::meta::{GenomeInfo, IndexMeta};
|
use crate::meta::{GenomeInfo, IndexMeta};
|
||||||
@@ -61,6 +63,9 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Validate evidence compatibility ───────────────────────────────────
|
||||||
|
let evidence = validate_evidence_compat(sources)?;
|
||||||
|
|
||||||
// ── Compute final genome labels (rename duplicates if requested) ───────
|
// ── Compute final genome labels (rename duplicates if requested) ───────
|
||||||
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
||||||
|
|
||||||
@@ -91,6 +96,7 @@ impl KmerIndex {
|
|||||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||||
meta.genomes = all_genomes;
|
meta.genomes = all_genomes;
|
||||||
meta.config.with_counts = mode == MergeMode::Count;
|
meta.config.with_counts = mode == MergeMode::Count;
|
||||||
|
meta.config.evidence = evidence;
|
||||||
meta.write(output)?;
|
meta.write(output)?;
|
||||||
|
|
||||||
// In presence/absence mode, purge counts/ directories inherited from
|
// In presence/absence mode, purge counts/ directories inherited from
|
||||||
@@ -134,13 +140,14 @@ impl KmerIndex {
|
|||||||
let pb = partition_bar(n_partitions as u64);
|
let pb = partition_bar(n_partitions as u64);
|
||||||
|
|
||||||
let dst_partition = &dst.partition;
|
let dst_partition = &dst.partition;
|
||||||
|
let block_bits = dst.meta.config.block_bits;
|
||||||
|
|
||||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.filter_map(|i| {
|
.filter_map(|i| {
|
||||||
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
||||||
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
||||||
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err();
|
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
result
|
result
|
||||||
})
|
})
|
||||||
@@ -258,6 +265,34 @@ fn partition_bar(n: u64) -> ProgressBar {
|
|||||||
pb
|
pb
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check that all sources share the same evidence kind.
|
||||||
|
///
|
||||||
|
/// Rules:
|
||||||
|
/// - all `Exact` → OK, returns `Exact`
|
||||||
|
/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }`
|
||||||
|
/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
|
||||||
|
fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<EvidenceKind> {
|
||||||
|
let ref_ev = &sources[0].meta.config.evidence;
|
||||||
|
for src in &sources[1..] {
|
||||||
|
let ev = &src.meta.config.evidence;
|
||||||
|
let compat = match (ref_ev, ev) {
|
||||||
|
(EvidenceKind::Exact, EvidenceKind::Exact) => true,
|
||||||
|
(EvidenceKind::Approx { b: b1, z: z1 },
|
||||||
|
EvidenceKind::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
|
||||||
|
_ => false,
|
||||||
|
};
|
||||||
|
if !compat {
|
||||||
|
return Err(OKIError::IncompatibleEvidence(format!(
|
||||||
|
"source {:?} has evidence {:?}, expected {:?} — \
|
||||||
|
convert all sources to the same evidence kind first \
|
||||||
|
(use the `reindex` command)",
|
||||||
|
src.root_path.display(), ev, ref_ev,
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(ref_ev.clone())
|
||||||
|
}
|
||||||
|
|
||||||
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
||||||
fs::create_dir_all(dst)?;
|
fs::create_dir_all(dst)?;
|
||||||
for entry in fs::read_dir(src)? {
|
for entry in fs::read_dir(src)? {
|
||||||
|
|||||||
@@ -31,6 +31,11 @@ pub struct IndexConfig {
|
|||||||
pub with_counts: bool,
|
pub with_counts: bool,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub evidence: EvidenceKind,
|
pub evidence: EvidenceKind,
|
||||||
|
/// Block size for the unitig index as a power-of-two exponent.
|
||||||
|
/// The `.idx` block covers 2^block_bits consecutive unitigs.
|
||||||
|
/// 0 = one entry per unitig (O(1) access, largest `.idx`).
|
||||||
|
#[serde(default)]
|
||||||
|
pub block_bits: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
@@ -88,12 +88,13 @@ impl KmerIndex {
|
|||||||
pb.enable_steady_tick(Duration::from_millis(100));
|
pb.enable_steady_tick(Duration::from_millis(100));
|
||||||
|
|
||||||
let src_partition = &src.partition;
|
let src_partition = &src.partition;
|
||||||
|
let block_bits = meta.config.block_bits;
|
||||||
|
|
||||||
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
let errors: Vec<obiskio::SKError> = (0..n_partitions)
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.filter_map(|i| {
|
.filter_map(|i| {
|
||||||
let result = dst_partition
|
let result = dst_partition
|
||||||
.rebuild_partition(src_partition, i, filters, mode, n_genomes)
|
.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits)
|
||||||
.err();
|
.err();
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
result
|
result
|
||||||
|
|||||||
@@ -0,0 +1,126 @@
|
|||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
|
use obilayeredmap::{EvidenceKind, layer::Layer};
|
||||||
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
use obisys::{Reporter, Stage};
|
||||||
|
use rayon::prelude::*;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
use crate::error::{OKIError, OKIResult};
|
||||||
|
use crate::index::KmerIndex;
|
||||||
|
use crate::state::IndexState;
|
||||||
|
|
||||||
|
const EVIDENCE_FILE: &str = "evidence.bin";
|
||||||
|
const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
||||||
|
const UNITIG_IDX_FILE: &str = "unitigs.bin.idx";
|
||||||
|
|
||||||
|
fn olm_to_oki(e: obilayeredmap::OLMError) -> OKIError {
|
||||||
|
OKIError::InvalidInput(e.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KmerIndex {
|
||||||
|
/// Convert every layer's evidence bundle to `target` in-place.
|
||||||
|
///
|
||||||
|
/// - `Exact` → builds `evidence.bin` + `unitigs.bin.idx`, removes `fingerprint.bin`
|
||||||
|
/// - `Approx` → builds `fingerprint.bin`, removes `evidence.bin` + `unitigs.bin.idx`
|
||||||
|
///
|
||||||
|
/// The MPHF (`mphf.bin`) and unitigs (`unitigs.bin`) are never touched.
|
||||||
|
/// `index.meta` is updated with the new evidence kind on success.
|
||||||
|
pub fn reindex(
|
||||||
|
&mut self,
|
||||||
|
target: EvidenceKind,
|
||||||
|
block_bits: u8,
|
||||||
|
rep: &mut Reporter,
|
||||||
|
) -> OKIResult<()> {
|
||||||
|
if self.state() != IndexState::Indexed {
|
||||||
|
return Err(OKIError::NotIndexed(self.root_path.clone()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let n = self.partition.n_partitions();
|
||||||
|
info!(
|
||||||
|
"reindex {} partition(s): {:?} → {:?}",
|
||||||
|
n, self.meta.config.evidence, target,
|
||||||
|
);
|
||||||
|
|
||||||
|
let t = Stage::start("reindex");
|
||||||
|
let pb = ProgressBar::new(n as u64).with_style(
|
||||||
|
ProgressStyle::with_template(
|
||||||
|
"reindex — [{bar:20}] {pos}/{len} | {msg}",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.tick_strings(&["⠋","⠙","⠹","⠸","⠼","⠴","⠦","⠧","⠇","⠏"]),
|
||||||
|
);
|
||||||
|
pb.enable_steady_tick(Duration::from_millis(80));
|
||||||
|
|
||||||
|
let errors: Vec<String> = (0..n)
|
||||||
|
.into_par_iter()
|
||||||
|
.filter_map(|i| {
|
||||||
|
let res = reindex_partition(
|
||||||
|
&self.partition.part_dir(i).join("index"),
|
||||||
|
&target,
|
||||||
|
block_bits,
|
||||||
|
);
|
||||||
|
pb.inc(1);
|
||||||
|
res.err().map(|e| format!("partition {i}: {e}"))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
pb.finish_and_clear();
|
||||||
|
|
||||||
|
if let Some(e) = errors.into_iter().next() {
|
||||||
|
return Err(OKIError::InvalidInput(e));
|
||||||
|
}
|
||||||
|
|
||||||
|
self.meta.config.evidence = target;
|
||||||
|
if matches!(self.meta.config.evidence, EvidenceKind::Exact) {
|
||||||
|
self.meta.config.block_bits = block_bits;
|
||||||
|
}
|
||||||
|
self.meta.write(&self.root_path)?;
|
||||||
|
rep.push(t.stop());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process all layers of one partition's index directory.
|
||||||
|
fn reindex_partition(index_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> {
|
||||||
|
if !index_dir.exists() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let pm = PartitionMeta::load(index_dir).map_err(olm_to_oki)?;
|
||||||
|
for layer_idx in 0..pm.n_layers {
|
||||||
|
let layer_dir = index_dir.join(format!("layer_{layer_idx}"));
|
||||||
|
reindex_layer(&layer_dir, target, block_bits)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn reindex_layer(layer_dir: &Path, target: &EvidenceKind, block_bits: u8) -> OKIResult<()> {
|
||||||
|
Layer::<()>::build_evidence(layer_dir, target, block_bits).map_err(olm_to_oki)?;
|
||||||
|
remove_stale_evidence(layer_dir, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remove_stale_evidence(layer_dir: &Path, target: &EvidenceKind) -> OKIResult<()> {
|
||||||
|
match target {
|
||||||
|
EvidenceKind::Exact => {
|
||||||
|
// fingerprint.bin is no longer valid
|
||||||
|
remove_if_exists(&layer_dir.join(FINGERPRINT_FILE));
|
||||||
|
}
|
||||||
|
EvidenceKind::Approx { .. } => {
|
||||||
|
// exact bundle is no longer valid
|
||||||
|
remove_if_exists(&layer_dir.join(EVIDENCE_FILE));
|
||||||
|
remove_if_exists(&layer_dir.join(UNITIG_IDX_FILE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Best-effort file removal.
///
/// A missing file is not an error. Any other failure is reported as a warning
/// on stderr and otherwise ignored.
fn remove_if_exists(path: &Path) {
    match fs::remove_file(path) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => eprintln!("warning: could not remove {}: {e}", path.display()),
    }
}
|
||||||
@@ -55,6 +55,12 @@ pub fn partitions_to_bits(n: usize) -> usize {
|
|||||||
n.max(1).next_power_of_two().trailing_zeros() as usize
|
n.max(1).next_power_of_two().trailing_zeros() as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert a block size (number of unitigs per block) to its `block_bits` exponent.
/// `block_size=1` → `block_bits=0` (one entry per unitig, O(1) random access).
///
/// Sizes that are not a power of two are rounded up to the next one, and `0`
/// is treated as `1`. Sizes above the largest power of two representable in a
/// `usize` cannot be rounded up; they are clamped to `usize::BITS - 1` instead
/// of overflowing (`next_power_of_two` panics in debug builds and wraps to 0
/// in release builds in that case).
pub fn block_size_to_bits(n: usize) -> u8 {
    const MAX_BITS: u32 = usize::BITS - 1;
    let bits = match n.max(1).checked_next_power_of_two() {
        Some(pow2) => pow2.trailing_zeros(),
        None => MAX_BITS,
    };
    // `bits` is at most `usize::BITS - 1`, so the narrowing cast is lossless.
    bits as u8
}
|
||||||
|
|
||||||
impl CommonArgs {
|
impl CommonArgs {
|
||||||
pub fn seqfile_paths(&self) -> obiread::PathIter {
|
pub fn seqfile_paths(&self) -> obiread::PathIter {
|
||||||
let paths = self.inputs.iter().map(PathBuf::from).collect();
|
let paths = self.inputs.iter().map(PathBuf::from).collect();
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ use obikseq::{set_k, set_m};
|
|||||||
use obisys::Reporter;
|
use obisys::Reporter;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::cli::{CommonArgs, partitions_to_bits};
|
use crate::cli::{CommonArgs, block_size_to_bits, partitions_to_bits};
|
||||||
use crate::steps::scatter;
|
use crate::steps::scatter;
|
||||||
|
|
||||||
#[derive(Args)]
|
#[derive(Args)]
|
||||||
@@ -68,6 +68,11 @@ pub struct IndexArgs {
|
|||||||
#[arg(long, default_value = None)]
|
#[arg(long, default_value = None)]
|
||||||
pub fp: Option<f64>,
|
pub fp: Option<f64>,
|
||||||
|
|
||||||
|
/// Block size for exact evidence `.idx` (number of unitigs per block).
|
||||||
|
/// Must be a power of two; rounded up if not. Default 1 = O(1) random access.
|
||||||
|
#[arg(long, default_value_t = 1)]
|
||||||
|
pub block_size: usize,
|
||||||
|
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
pub common: CommonArgs,
|
pub common: CommonArgs,
|
||||||
}
|
}
|
||||||
@@ -179,12 +184,14 @@ pub fn run(args: IndexArgs) {
|
|||||||
if effective != args.common.partitions {
|
if effective != args.common.partitions {
|
||||||
info!("partitions: {} → {} (next power of 2)", args.common.partitions, effective);
|
info!("partitions: {} → {} (next power of 2)", args.common.partitions, effective);
|
||||||
}
|
}
|
||||||
|
let block_bits = block_size_to_bits(args.block_size);
|
||||||
let config = IndexConfig {
|
let config = IndexConfig {
|
||||||
kmer_size: args.common.kmer_size,
|
kmer_size: args.common.kmer_size,
|
||||||
minimizer_size: args.common.minimizer_size,
|
minimizer_size: args.common.minimizer_size,
|
||||||
n_bits,
|
n_bits,
|
||||||
with_counts: args.with_counts,
|
with_counts: args.with_counts,
|
||||||
evidence: evidence.clone(),
|
evidence: evidence.clone(),
|
||||||
|
block_bits,
|
||||||
};
|
};
|
||||||
let genome_info = args.label.as_ref().map(|label| {
|
let genome_info = args.label.as_ref().map(|label| {
|
||||||
let mut info = GenomeInfo::new(label.clone());
|
let mut info = GenomeInfo::new(label.clone());
|
||||||
|
|||||||
@@ -6,5 +6,6 @@ pub mod index;
|
|||||||
pub mod merge;
|
pub mod merge;
|
||||||
pub mod query;
|
pub mod query;
|
||||||
pub mod rebuild;
|
pub mod rebuild;
|
||||||
|
pub mod reindex;
|
||||||
pub mod superkmer;
|
pub mod superkmer;
|
||||||
pub mod unitig;
|
pub mod unitig;
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use obikindex::KmerIndex;
|
||||||
|
use obilayeredmap::EvidenceKind;
|
||||||
|
use obisys::Reporter;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
use crate::cli::block_size_to_bits;
|
||||||
|
use super::index::resolve_approx_params;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct ReindexArgs {
|
||||||
|
/// Index directory to convert (modified in-place)
|
||||||
|
pub index: PathBuf,
|
||||||
|
|
||||||
|
/// Convert to approximate evidence (default: convert to exact).
|
||||||
|
/// Requires --evidence-bits and/or -z and/or --fp.
|
||||||
|
#[arg(long, default_value_t = false)]
|
||||||
|
pub approx: bool,
|
||||||
|
|
||||||
|
/// Findere z parameter (≥1).
|
||||||
|
#[arg(short = 'z', long, default_value = None)]
|
||||||
|
pub findere_z: Option<u8>,
|
||||||
|
|
||||||
|
/// Fingerprint bits per slot (b).
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub evidence_bits: Option<u8>,
|
||||||
|
|
||||||
|
/// Target false-positive rate per z-window.
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub fp: Option<f64>,
|
||||||
|
|
||||||
|
/// Block size for exact evidence `.idx` (number of unitigs per block).
|
||||||
|
/// Ignored when converting to approximate evidence.
|
||||||
|
#[arg(long, default_value_t = 1)]
|
||||||
|
pub block_size: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: ReindexArgs) {
|
||||||
|
let target = if args.approx {
|
||||||
|
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||||
|
info!("target: approximate evidence — b={b}, z={z}, fp={fp:.2e}");
|
||||||
|
EvidenceKind::Approx { b, z }
|
||||||
|
} else {
|
||||||
|
info!("target: exact evidence");
|
||||||
|
EvidenceKind::Exact
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"current evidence: {:?}",
|
||||||
|
idx.meta().config.evidence,
|
||||||
|
);
|
||||||
|
|
||||||
|
let block_bits = block_size_to_bits(args.block_size);
|
||||||
|
let mut rep = Reporter::new();
|
||||||
|
idx.reindex(target, block_bits, &mut rep).unwrap_or_else(|e| {
|
||||||
|
eprintln!("reindex error: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
rep.print();
|
||||||
|
}
|
||||||
@@ -35,13 +35,13 @@ pub fn run(args: UnitigArgs) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
let reader = UnitigFileReader::open(&path).unwrap_or_else(|e| {
|
// open_sequential: works with and without .idx (approx or exact index)
|
||||||
|
let reader = UnitigFileReader::open_sequential(&path).unwrap_or_else(|e| {
|
||||||
eprintln!("error opening unitigs (partition {i}): {e}");
|
eprintln!("error opening unitigs (partition {i}): {e}");
|
||||||
std::process::exit(1)
|
std::process::exit(1)
|
||||||
});
|
});
|
||||||
|
|
||||||
for j in 0..reader.len() {
|
for (j, unitig) in reader.iter_unitigs() {
|
||||||
let unitig = reader.unitig(j);
|
|
||||||
let mut out = stdout.lock().unwrap();
|
let mut out = stdout.lock().unwrap();
|
||||||
write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| {
|
write_unitig(&unitig, k, i, j, &mut *out).unwrap_or_else(|e| {
|
||||||
eprintln!("write error: {e}");
|
eprintln!("write error: {e}");
|
||||||
|
|||||||
@@ -34,6 +34,8 @@ enum Commands {
|
|||||||
Unitig(cmd::unitig::UnitigArgs),
|
Unitig(cmd::unitig::UnitigArgs),
|
||||||
/// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing
|
/// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing
|
||||||
Estimate(cmd::estimate::EstimateArgs),
|
Estimate(cmd::estimate::EstimateArgs),
|
||||||
|
/// Convert an index's evidence in-place: exact ↔ approx
|
||||||
|
Reindex(cmd::reindex::ReindexArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -65,6 +67,7 @@ fn main() {
|
|||||||
Commands::Distance(args) => cmd::distance::run(args),
|
Commands::Distance(args) => cmd::distance::run(args),
|
||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
Commands::Estimate(args) => cmd::estimate::run(args),
|
Commands::Estimate(args) => cmd::estimate::run(args),
|
||||||
|
Commands::Reindex(args) => cmd::reindex::run(args),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "profiling")]
|
#[cfg(feature = "profiling")]
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ impl KmerPartition {
|
|||||||
max_ab: Option<u32>,
|
max_ab: Option<u32>,
|
||||||
with_counts: bool,
|
with_counts: bool,
|
||||||
evidence: &EvidenceKind,
|
evidence: &EvidenceKind,
|
||||||
|
block_bits: u8,
|
||||||
) -> Result<usize, SKError> {
|
) -> Result<usize, SKError> {
|
||||||
let part_dir = self.part_dir(i);
|
let part_dir = self.part_dir(i);
|
||||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||||
@@ -109,7 +110,7 @@ impl KmerPartition {
|
|||||||
uw.close()?;
|
uw.close()?;
|
||||||
|
|
||||||
if with_counts {
|
if with_counts {
|
||||||
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, |kmer| {
|
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, block_bits, |kmer| {
|
||||||
match (&mphf1_opt, &counts1_opt) {
|
match (&mphf1_opt, &counts1_opt) {
|
||||||
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
|
||||||
_ => 1,
|
_ => 1,
|
||||||
@@ -117,7 +118,7 @@ impl KmerPartition {
|
|||||||
})
|
})
|
||||||
.map_err(olm_to_sk)?;
|
.map_err(olm_to_sk)?;
|
||||||
} else {
|
} else {
|
||||||
Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
|
Layer::<()>::build(&layer_dir, block_bits).map_err(olm_to_sk)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// For approximate evidence: replace the exact evidence bundle with a
|
// For approximate evidence: replace the exact evidence bundle with a
|
||||||
|
|||||||
@@ -161,6 +161,7 @@ impl KmerPartition {
|
|||||||
sources: &[(&KmerPartition, usize)],
|
sources: &[(&KmerPartition, usize)],
|
||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
n_dst_genomes: usize,
|
n_dst_genomes: usize,
|
||||||
|
block_bits: u8,
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||||
if !dst_index_dir.exists() {
|
if !dst_index_dir.exists() {
|
||||||
@@ -216,7 +217,7 @@ impl KmerPartition {
|
|||||||
uw.write(&unitig)?;
|
uw.write(&unitig)?;
|
||||||
}
|
}
|
||||||
uw.close()?;
|
uw.close()?;
|
||||||
Layer::<()>::build(&new_layer_dir).map_err(olm_to_sk)?;
|
Layer::<()>::build(&new_layer_dir, block_bits).map_err(olm_to_sk)?;
|
||||||
}
|
}
|
||||||
drop(g);
|
drop(g);
|
||||||
|
|
||||||
|
|||||||
@@ -96,6 +96,7 @@ impl KmerPartition {
|
|||||||
filters: &[Box<dyn KmerFilter>],
|
filters: &[Box<dyn KmerFilter>],
|
||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
n_genomes: usize,
|
n_genomes: usize,
|
||||||
|
block_bits: u8,
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
|
let src_index_dir = src.part_dir(i).join(INDEX_SUBDIR);
|
||||||
if !src_index_dir.exists() {
|
if !src_index_dir.exists() {
|
||||||
@@ -145,7 +146,7 @@ impl KmerPartition {
|
|||||||
uw.close()?;
|
uw.close()?;
|
||||||
drop(g);
|
drop(g);
|
||||||
|
|
||||||
Layer::<()>::build(&dst_layer_dir).map_err(olm_to_sk)?;
|
Layer::<()>::build(&dst_layer_dir, block_bits).map_err(olm_to_sk)?;
|
||||||
let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
|
let dst_mphf = MphfLayer::open(&dst_layer_dir).map_err(olm_to_sk)?;
|
||||||
|
|
||||||
// ── Prepare matrix builders (one column per genome) ───────────────────
|
// ── Prepare matrix builders (one column per genome) ───────────────────
|
||||||
|
|||||||
@@ -80,39 +80,33 @@ impl<D: LayerData> Layer<D> {
|
|||||||
|
|
||||||
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
|
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
|
||||||
/// `mphf.bin` already present in `layer_dir`.
|
/// `mphf.bin` already present in `layer_dir`.
|
||||||
///
|
/// `block_bits` controls the `.idx` block size (2^block_bits chunks/block).
|
||||||
/// See [`MphfLayer::build_exact_evidence`] for the full contract.
|
pub fn build_exact_evidence(layer_dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||||
pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult<usize> {
|
MphfLayer::build_exact_evidence(layer_dir, block_bits)
|
||||||
MphfLayer::build_exact_evidence(layer_dir)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
||||||
/// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere
|
/// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere
|
||||||
/// consecutive k-mer parameter (≥1).
|
/// consecutive k-mer parameter (≥1).
|
||||||
///
|
|
||||||
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
|
|
||||||
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||||
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
/// Dispatch to `build_exact_evidence` or `build_approx_evidence`.
|
||||||
/// `kind`.
|
/// `block_bits` is forwarded to exact evidence only.
|
||||||
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||||
MphfLayer::build_evidence(layer_dir, kind)
|
MphfLayer::build_evidence(layer_dir, kind, block_bits)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||||
|
|
||||||
impl Layer<()> {
|
impl Layer<()> {
|
||||||
pub fn build(out_dir: &Path) -> OLMResult<usize> {
|
pub fn build(out_dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||||
MphfLayer::build(out_dir, &mut |_, _| Ok(()))
|
MphfLayer::build(out_dir, block_bits, &mut |_, _| Ok(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a presence matrix for a set-membership layer (first merge).
|
/// Create a presence matrix for a set-membership layer (first merge).
|
||||||
///
|
|
||||||
/// All `n_kmers` slots are set to `true`: every kmer in this layer belongs
|
|
||||||
/// to genome_0, so genome_0 is present at every slot.
|
|
||||||
pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> {
|
pub fn init_presence_matrix(layer_dir: &Path, n_kmers: usize) -> OLMResult<()> {
|
||||||
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
let presence_dir = layer_dir.join(PRESENCE_DIR);
|
||||||
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?;
|
||||||
@@ -126,16 +120,20 @@ impl Layer<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Mode 2 — count matrix (1 column per layer) ────────────────────────────────
|
// ── Mode 2 — count matrix ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
impl Layer<PersistentCompactIntMatrix> {
|
impl Layer<PersistentCompactIntMatrix> {
|
||||||
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
pub fn build(
|
||||||
|
out_dir: &Path,
|
||||||
|
block_bits: u8,
|
||||||
|
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||||
|
) -> OLMResult<usize> {
|
||||||
let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers();
|
let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers();
|
||||||
let counts_dir = out_dir.join(COUNTS_DIR);
|
let counts_dir = out_dir.join(COUNTS_DIR);
|
||||||
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
|
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
|
||||||
.map_err(OLMError::Io)?;
|
.map_err(OLMError::Io)?;
|
||||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
||||||
let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| {
|
let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| {
|
||||||
col.set(slot, count_of(kmer));
|
col.set(slot, count_of(kmer));
|
||||||
Ok(())
|
Ok(())
|
||||||
})?;
|
})?;
|
||||||
@@ -146,16 +144,16 @@ impl Layer<PersistentCompactIntMatrix> {
|
|||||||
|
|
||||||
pub fn build_from_map(
|
pub fn build_from_map(
|
||||||
out_dir: &Path,
|
out_dir: &Path,
|
||||||
|
block_bits: u8,
|
||||||
counts: &HashMap<CanonicalKmer, u32>,
|
counts: &HashMap<CanonicalKmer, u32>,
|
||||||
) -> OLMResult<usize> {
|
) -> OLMResult<usize> {
|
||||||
Self::build(out_dir, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
Self::build(out_dir, block_bits, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Mode 2 — count matrix column append ──────────────────────────────────────
|
// ── Mode 2 — count matrix column append ──────────────────────────────────────
|
||||||
|
|
||||||
impl Layer<PersistentCompactIntMatrix> {
|
impl Layer<PersistentCompactIntMatrix> {
|
||||||
/// Append a genome column to an existing count matrix.
|
|
||||||
pub fn append_genome_column(
|
pub fn append_genome_column(
|
||||||
layer_dir: &Path,
|
layer_dir: &Path,
|
||||||
value_of: impl Fn(usize) -> u32,
|
value_of: impl Fn(usize) -> u32,
|
||||||
@@ -165,10 +163,9 @@ impl Layer<PersistentCompactIntMatrix> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Mode 3 — presence/absence matrix (1 column per genome) ───────────────────
|
// ── Mode 3 — presence/absence matrix ─────────────────────────────────────────
|
||||||
|
|
||||||
impl Layer<PersistentBitMatrix> {
|
impl Layer<PersistentBitMatrix> {
|
||||||
/// Append a genome column to an existing presence matrix.
|
|
||||||
pub fn append_genome_column(
|
pub fn append_genome_column(
|
||||||
layer_dir: &Path,
|
layer_dir: &Path,
|
||||||
value_of: impl Fn(usize) -> bool,
|
value_of: impl Fn(usize) -> bool,
|
||||||
@@ -179,6 +176,7 @@ impl Layer<PersistentBitMatrix> {
|
|||||||
|
|
||||||
pub fn build_presence(
|
pub fn build_presence(
|
||||||
out_dir: &Path,
|
out_dir: &Path,
|
||||||
|
block_bits: u8,
|
||||||
n_genomes: usize,
|
n_genomes: usize,
|
||||||
present_in: impl Fn(CanonicalKmer, usize) -> bool,
|
present_in: impl Fn(CanonicalKmer, usize) -> bool,
|
||||||
) -> OLMResult<usize> {
|
) -> OLMResult<usize> {
|
||||||
@@ -188,7 +186,7 @@ impl Layer<PersistentBitMatrix> {
|
|||||||
let mut cols: Vec<_> = (0..n_genomes)
|
let mut cols: Vec<_> = (0..n_genomes)
|
||||||
.map(|_| mb.add_col().map_err(OLMError::Io))
|
.map(|_| mb.add_col().map_err(OLMError::Io))
|
||||||
.collect::<OLMResult<_>>()?;
|
.collect::<OLMResult<_>>()?;
|
||||||
let n_built = MphfLayer::build(out_dir, &mut |slot, kmer| {
|
let n_built = MphfLayer::build(out_dir, block_bits, &mut |slot, kmer| {
|
||||||
for (g, col) in cols.iter_mut().enumerate() {
|
for (g, col) in cols.iter_mut().enumerate() {
|
||||||
col.set(slot, present_in(kmer, g));
|
col.set(slot, present_in(kmer, g));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
|
|||||||
|
|
||||||
use obicompactvec::PersistentCompactIntMatrix;
|
use obicompactvec::PersistentCompactIntMatrix;
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::UnitigFileWriter;
|
use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS};
|
||||||
|
|
||||||
use crate::error::OLMResult;
|
use crate::error::OLMResult;
|
||||||
use crate::layer::{Hit, Layer, LayerData};
|
use crate::layer::{Hit, Layer, LayerData};
|
||||||
@@ -90,7 +90,7 @@ impl LayeredMap<()> {
|
|||||||
pub fn push_layer(&mut self) -> OLMResult<usize> {
|
pub fn push_layer(&mut self) -> OLMResult<usize> {
|
||||||
let i = self.layers.len();
|
let i = self.layers.len();
|
||||||
let dir = layer_dir(&self.root, i);
|
let dir = layer_dir(&self.root, i);
|
||||||
Layer::<()>::build(&dir)?;
|
Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS)?;
|
||||||
self.append_layer()?;
|
self.append_layer()?;
|
||||||
Ok(i)
|
Ok(i)
|
||||||
}
|
}
|
||||||
@@ -102,7 +102,7 @@ impl LayeredMap<PersistentCompactIntMatrix> {
|
|||||||
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||||
let i = self.layers.len();
|
let i = self.layers.len();
|
||||||
let dir = layer_dir(&self.root, i);
|
let dir = layer_dir(&self.root, i);
|
||||||
Layer::<PersistentCompactIntMatrix>::build(&dir, count_of)?;
|
Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, count_of)?;
|
||||||
self.append_layer()?;
|
self.append_layer()?;
|
||||||
Ok(i)
|
Ok(i)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use std::path::Path;
|
|||||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||||
use epserde::prelude::*;
|
use epserde::prelude::*;
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS};
|
use obiskio::{UnitigFileReader, UnitigFileWriter, build_unitig_idx};
|
||||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||||
|
|
||||||
use crate::error::{OLMError, OLMResult};
|
use crate::error::{OLMError, OLMResult};
|
||||||
@@ -110,25 +110,26 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||||
/// `kind`.
|
/// `kind`. `block_bits` is forwarded to exact evidence only.
|
||||||
pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||||
match kind {
|
match kind {
|
||||||
EvidenceKind::Exact => Self::build_exact_evidence(dir),
|
EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits),
|
||||||
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
||||||
///
|
///
|
||||||
|
/// `block_bits` controls the `.idx` block size (2^block_bits chunks per block).
|
||||||
/// Uses sequential iteration — no `.idx` required on entry.
|
/// Uses sequential iteration — no `.idx` required on entry.
|
||||||
pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
|
pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||||
let unitig_path = dir.join(UNITIGS_FILE);
|
let unitig_path = dir.join(UNITIGS_FILE);
|
||||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||||
let n = unitigs.n_kmers();
|
let n = unitigs.n_kmers();
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, block_bits)?;
|
||||||
LayerMeta::exact().save(dir)?;
|
LayerMeta::exact().save(dir)?;
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
@@ -154,7 +155,7 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, block_bits)?;
|
||||||
LayerMeta::exact().save(dir)?;
|
LayerMeta::exact().save(dir)?;
|
||||||
Ok(n)
|
Ok(n)
|
||||||
}
|
}
|
||||||
@@ -202,13 +203,14 @@ impl MphfLayer {
|
|||||||
/// population. Returns the number of kmers indexed.
|
/// population. Returns the number of kmers indexed.
|
||||||
pub(crate) fn build(
|
pub(crate) fn build(
|
||||||
dir: &Path,
|
dir: &Path,
|
||||||
|
block_bits: u8,
|
||||||
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
||||||
) -> OLMResult<usize> {
|
) -> OLMResult<usize> {
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
let unitig_path = dir.join(UNITIGS_FILE);
|
let unitig_path = dir.join(UNITIGS_FILE);
|
||||||
|
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, block_bits)?;
|
||||||
|
|
||||||
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
||||||
let n = unitigs.n_kmers();
|
let n = unitigs.n_kmers();
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
use obicompactvec::PersistentCompactIntMatrix;
|
use obicompactvec::PersistentCompactIntMatrix;
|
||||||
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||||
|
use obiskio::DEFAULT_BLOCK_BITS;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
||||||
@@ -23,7 +24,7 @@ fn build_and_query_all_kmers_found() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
let kmers = all_canonical_kmers(dir.path());
|
let kmers = all_canonical_kmers(dir.path());
|
||||||
Layer::<()>::build(dir.path()).unwrap();
|
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap();
|
||||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||||
for kmer in kmers {
|
for kmer in kmers {
|
||||||
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
||||||
@@ -40,6 +41,7 @@ fn counts_are_stored_and_retrieved() {
|
|||||||
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
|
kmers.iter().enumerate().map(|(i, &k)| (k, i as u32 + 1)).collect();
|
||||||
Layer::<PersistentCompactIntMatrix>::build(
|
Layer::<PersistentCompactIntMatrix>::build(
|
||||||
dir.path(),
|
dir.path(),
|
||||||
|
DEFAULT_BLOCK_BITS,
|
||||||
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
||||||
).unwrap();
|
).unwrap();
|
||||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||||
@@ -54,7 +56,7 @@ fn query_absent_returns_none() {
|
|||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
Layer::<()>::build(dir.path()).unwrap();
|
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS).unwrap();
|
||||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
let layer = Layer::<()>::open(dir.path()).unwrap();
|
||||||
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
||||||
assert!(layer.query(absent).is_none());
|
assert!(layer.query(absent).is_none());
|
||||||
@@ -65,7 +67,7 @@ fn open_after_build_is_consistent() {
|
|||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), |_| 7).unwrap();
|
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, |_| 7).unwrap();
|
||||||
assert_eq!(n, 4);
|
assert_eq!(n, 4);
|
||||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||||
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ use crate::error::{SKError, SKResult};
|
|||||||
const MAGIC: [u8; 4] = *b"UIX3";
|
const MAGIC: [u8; 4] = *b"UIX3";
|
||||||
|
|
||||||
/// Default block granularity used by [`UnitigFileWriter::create`].
|
/// Default block granularity used by [`UnitigFileWriter::create`].
|
||||||
pub const DEFAULT_BLOCK_BITS: u8 = 6;
|
pub const DEFAULT_BLOCK_BITS: u8 = 0;
|
||||||
|
|
||||||
fn idx_path(path: &Path) -> PathBuf {
|
fn idx_path(path: &Path) -> PathBuf {
|
||||||
crate::append_path_suffix(path, ".idx")
|
crate::append_path_suffix(path, ".idx")
|
||||||
@@ -325,6 +325,11 @@ impl UnitigFileReader {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterate all unitigs sequentially. Works without `.idx` (sequential open).
|
||||||
|
pub fn iter_unitigs(&self) -> impl Iterator<Item = (usize, Unitig)> + '_ {
|
||||||
|
self.iter_chunks_sequential()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
|
pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
|
||||||
self.iter_chunks_sequential()
|
self.iter_chunks_sequential()
|
||||||
.flat_map(|(_, u)| u.into_kmers())
|
.flat_map(|(_, u)| u.into_kmers())
|
||||||
|
|||||||
Reference in New Issue
Block a user