Push pnxswqpxlyso #13
@@ -79,9 +79,12 @@ Non-ACGT characters act as hard breaks between k-mer segments in all formats.
|
|||||||
## Quick start
|
## Quick start
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Build an exact index for two genomes
|
# Build an exact index for each genome independently
|
||||||
obikmer index --kmer-size 31 --label genome_a genome_a.fa --output index/
|
obikmer index --kmer-size 31 --label genome_a genome_a.fa --output index_a/
|
||||||
obikmer index --kmer-size 31 --label genome_b genome_b.fa --output index/
|
obikmer index --kmer-size 31 --label genome_b genome_b.fa --output index_b/
|
||||||
|
|
||||||
|
# Merge into a single multi-genome index
|
||||||
|
obikmer merge --output index/ index_a/ index_b/
|
||||||
|
|
||||||
# Convert to approximate index (z=5, 8-bit fingerprints)
|
# Convert to approximate index (z=5, 8-bit fingerprints)
|
||||||
obikmer reindex --approx -z 5 --evidence-bits 8 index/
|
obikmer reindex --approx -z 5 --evidence-bits 8 index/
|
||||||
|
|||||||
+55
-31
@@ -63,8 +63,15 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Validate evidence compatibility ───────────────────────────────────
|
// ── Choose base: largest source in the output evidence mode ───────────
|
||||||
let evidence = validate_evidence_compat(sources)?;
|
let base_idx = choose_base(sources, mode);
|
||||||
|
let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
|
||||||
|
ordered.push(sources[base_idx]);
|
||||||
|
for (i, &src) in sources.iter().enumerate() {
|
||||||
|
if i != base_idx { ordered.push(src); }
|
||||||
|
}
|
||||||
|
let sources: &[&KmerIndex] = &ordered;
|
||||||
|
let evidence = sources[0].meta.config.evidence.clone();
|
||||||
|
|
||||||
// ── Compute final genome labels (rename duplicates if requested) ───────
|
// ── Compute final genome labels (rename duplicates if requested) ───────
|
||||||
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;
|
||||||
@@ -96,7 +103,7 @@ impl KmerIndex {
|
|||||||
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
|
||||||
meta.genomes = all_genomes;
|
meta.genomes = all_genomes;
|
||||||
meta.config.with_counts = mode == MergeMode::Count;
|
meta.config.with_counts = mode == MergeMode::Count;
|
||||||
meta.config.evidence = evidence;
|
meta.config.evidence = evidence.clone();
|
||||||
meta.write(output)?;
|
meta.write(output)?;
|
||||||
|
|
||||||
// In presence/absence mode, purge counts/ directories inherited from
|
// In presence/absence mode, purge counts/ directories inherited from
|
||||||
@@ -147,7 +154,7 @@ impl KmerIndex {
|
|||||||
.filter_map(|i| {
|
.filter_map(|i| {
|
||||||
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
||||||
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
||||||
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits).err();
|
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err();
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
result
|
result
|
||||||
})
|
})
|
||||||
@@ -265,34 +272,51 @@ fn partition_bar(n: u64) -> ProgressBar {
|
|||||||
pb
|
pb
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check that all sources share the same evidence kind.
|
/// A source is "trivial" if its presence/count values carry no approximation:
|
||||||
|
/// single-genome presence index (SetMembership — all values are 1 by construction).
|
||||||
|
fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool {
|
||||||
|
src.meta.genomes.len() == 1 && mode == MergeMode::Presence
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sum of all `unitigs.bin` sizes across every partition and layer.
|
||||||
|
/// Used as a proxy for the number of indexed smers.
|
||||||
|
fn index_unitig_size(src: &KmerIndex) -> u64 {
|
||||||
|
let n = src.partition.n_partitions();
|
||||||
|
let mut total = 0u64;
|
||||||
|
for i in 0..n {
|
||||||
|
let index_dir = src.partition.part_dir(i).join("index");
|
||||||
|
let mut l = 0usize;
|
||||||
|
loop {
|
||||||
|
let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin");
|
||||||
|
if !p.exists() { break; }
|
||||||
|
if let Ok(m) = std::fs::metadata(&p) { total += m.len(); }
|
||||||
|
l += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Choose the index to use as bootstrap base.
|
||||||
///
|
///
|
||||||
/// Rules:
|
/// Rule — mieux-disant: if any non-trivial source uses approximate evidence
|
||||||
/// - all `Exact` → OK, returns `Exact`
|
/// (Approx or Hybrid), the output must also be approximate; the base must
|
||||||
/// - all `Approx { b, z }` same params → OK, returns `Approx { b, z }`
|
/// therefore come from an approximate source so its layers carry the right
|
||||||
/// - mixed exact/approx or different approx params → `IncompatibleEvidence`
|
/// evidence files. Among qualifying candidates, the largest (by unitig size)
|
||||||
fn validate_evidence_compat(sources: &[&KmerIndex]) -> OKIResult<IndexMode> {
|
/// is chosen to minimise the number of new smers in the merge layer.
|
||||||
let ref_ev = &sources[0].meta.config.evidence;
|
fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
|
||||||
for src in &sources[1..] {
|
let needs_approx = sources.iter().any(|src| {
|
||||||
let ev = &src.meta.config.evidence;
|
!is_trivial(src, mode)
|
||||||
let compat = match (ref_ev, ev) {
|
&& matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
||||||
(IndexMode::Exact, IndexMode::Exact) => true,
|
});
|
||||||
(IndexMode::Approx { b: b1, z: z1 },
|
|
||||||
IndexMode::Approx { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
|
sources.iter().enumerate()
|
||||||
(IndexMode::Hybrid { b: b1, z: z1 },
|
.filter(|(_, src)| {
|
||||||
IndexMode::Hybrid { b: b2, z: z2 }) => b1 == b2 && z1 == z2,
|
!needs_approx
|
||||||
_ => false,
|
|| matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
||||||
};
|
})
|
||||||
if !compat {
|
.max_by_key(|(_, src)| index_unitig_size(src))
|
||||||
return Err(OKIError::IncompatibleEvidence(format!(
|
.map(|(i, _)| i)
|
||||||
"source {:?} has evidence {:?}, expected {:?} — \
|
.unwrap()
|
||||||
convert all sources to the same evidence kind first \
|
|
||||||
(use the `reindex` command)",
|
|
||||||
src.root_path.display(), ev, ref_ev,
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ref_ev.clone())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
|
|||||||
PersistentCompactIntVecBuilder};
|
PersistentCompactIntVecBuilder};
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||||
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError};
|
use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfOnly, OLMError};
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
|
||||||
use crate::partition::KmerPartition;
|
use crate::partition::KmerPartition;
|
||||||
@@ -47,22 +47,22 @@ impl ColBuilder {
|
|||||||
pub(crate) enum SrcLayerData {
|
pub(crate) enum SrcLayerData {
|
||||||
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
/// Pure set-membership layer (no data matrix): every kmer is present in all genomes.
|
||||||
SetMembership,
|
SetMembership,
|
||||||
Presence(MphfLayer, PersistentBitMatrix),
|
Presence(MphfOnly, PersistentBitMatrix),
|
||||||
Count(MphfLayer, PersistentCompactIntMatrix),
|
Count(MphfOnly, PersistentCompactIntMatrix),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SrcLayerData {
|
impl SrcLayerData {
|
||||||
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> {
|
pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode) -> SKResult<Self> {
|
||||||
let presence_dir = layer_dir.join("presence");
|
let presence_dir = layer_dir.join("presence");
|
||||||
let counts_dir = layer_dir.join("counts");
|
let counts_dir = layer_dir.join("counts");
|
||||||
match merge_mode {
|
match merge_mode {
|
||||||
MergeMode::Presence => {
|
MergeMode::Presence => {
|
||||||
if presence_dir.exists() {
|
if presence_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Presence(mphf, mat))
|
Ok(SrcLayerData::Presence(mphf, mat))
|
||||||
} else if counts_dir.exists() {
|
} else if counts_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
} else {
|
} else {
|
||||||
@@ -71,7 +71,7 @@ impl SrcLayerData {
|
|||||||
}
|
}
|
||||||
MergeMode::Count => {
|
MergeMode::Count => {
|
||||||
if counts_dir.exists() {
|
if counts_dir.exists() {
|
||||||
let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
|
let mphf = MphfOnly::open(layer_dir).map_err(olm_to_sk)?;
|
||||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||||
Ok(SrcLayerData::Count(mphf, mat))
|
Ok(SrcLayerData::Count(mphf, mat))
|
||||||
} else {
|
} else {
|
||||||
@@ -82,22 +82,16 @@ impl SrcLayerData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Return one value per source genome for `kmer`.
|
/// Return one value per source genome for `kmer`.
|
||||||
|
/// The caller guarantees `kmer` is in the source MPHF domain.
|
||||||
|
#[inline]
|
||||||
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
pub(crate) fn lookup(&self, kmer: CanonicalKmer, n_genomes: usize) -> Vec<u32> {
|
||||||
match self {
|
match self {
|
||||||
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
SrcLayerData::SetMembership => vec![1u32; n_genomes],
|
||||||
SrcLayerData::Presence(mphf, mat) => {
|
SrcLayerData::Presence(mphf, mat) => {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
mat.row(mphf.index(kmer)).iter().map(|&b| b as u32).collect()
|
||||||
mat.row(slot).iter().map(|&b| b as u32).collect()
|
|
||||||
} else {
|
|
||||||
vec![0u32; n_genomes]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
SrcLayerData::Count(mphf, mat) => {
|
SrcLayerData::Count(mphf, mat) => {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
mat.row(mphf.index(kmer)).iter().copied().collect()
|
||||||
mat.row(slot).iter().copied().collect()
|
|
||||||
} else {
|
|
||||||
vec![0u32; n_genomes]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -161,6 +155,7 @@ impl KmerPartition {
|
|||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
n_dst_genomes: usize,
|
n_dst_genomes: usize,
|
||||||
block_bits: u8,
|
block_bits: u8,
|
||||||
|
evidence: &IndexMode,
|
||||||
) -> SKResult<()> {
|
) -> SKResult<()> {
|
||||||
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
let dst_index_dir = self.part_dir(i).join(INDEX_SUBDIR);
|
||||||
if !dst_index_dir.exists() {
|
if !dst_index_dir.exists() {
|
||||||
@@ -208,7 +203,7 @@ impl KmerPartition {
|
|||||||
let new_layer_idx = n_dst_layers;
|
let new_layer_idx = n_dst_layers;
|
||||||
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
let new_layer_dir = dst_index_dir.join(format!("layer_{new_layer_idx}"));
|
||||||
|
|
||||||
if any_new {
|
let n_new = if any_new {
|
||||||
g.compute_degrees();
|
g.compute_degrees();
|
||||||
fs::create_dir_all(&new_layer_dir)?;
|
fs::create_dir_all(&new_layer_dir)?;
|
||||||
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
|
let mut uw = Layer::<()>::unitig_writer(&new_layer_dir).map_err(olm_to_sk)?;
|
||||||
@@ -216,16 +211,18 @@ impl KmerPartition {
|
|||||||
uw.write(&unitig)?;
|
uw.write(&unitig)?;
|
||||||
}
|
}
|
||||||
uw.close()?;
|
uw.close()?;
|
||||||
Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
|
Layer::<()>::build(&new_layer_dir, block_bits, evidence).map_err(olm_to_sk)?;
|
||||||
}
|
g.len()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
drop(g);
|
drop(g);
|
||||||
|
|
||||||
let new_mphf = if any_new {
|
let new_mphf = if any_new {
|
||||||
Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?)
|
Some(MphfOnly::open(&new_layer_dir).map_err(olm_to_sk)?)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
};
|
};
|
||||||
let n_new = new_mphf.as_ref().map_or(0, |m| m.n());
|
|
||||||
|
|
||||||
// ── Prepare matrix directories for the new layer ──────────────────────
|
// ── Prepare matrix directories for the new layer ──────────────────────
|
||||||
// Absent columns (dst genomes) are written via append_column (all-zero/false).
|
// Absent columns (dst genomes) are written via append_column (all-zero/false).
|
||||||
@@ -303,7 +300,7 @@ impl KmerPartition {
|
|||||||
for l in 0..src_meta.n_layers {
|
for l in 0..src_meta.n_layers {
|
||||||
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
|
||||||
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
|
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let values = src_data.lookup(kmer, *src_n);
|
let values = src_data.lookup(kmer, *src_n);
|
||||||
@@ -312,13 +309,12 @@ impl KmerPartition {
|
|||||||
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
|
if let Some((dst_layer, hit)) = dst_map.query(kmer) {
|
||||||
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
|
exist_builders[dst_layer][builder_idx].set_val(hit.slot, value);
|
||||||
} else if let Some(ref mphf) = new_mphf {
|
} else if let Some(ref mphf) = new_mphf {
|
||||||
if let Some(slot) = mphf.find(kmer) {
|
let slot = mphf.index(kmer);
|
||||||
new_src_builders[builder_idx].set_val(slot, value);
|
new_src_builders[builder_idx].set_val(slot, value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
col_offset += src_n;
|
col_offset += src_n;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ impl KmerPartition {
|
|||||||
if !unitigs_path.exists() { continue; }
|
if !unitigs_path.exists() { continue; }
|
||||||
|
|
||||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let row = src_data.lookup(kmer, n_genomes);
|
let row = src_data.lookup(kmer, n_genomes);
|
||||||
@@ -182,7 +182,7 @@ impl KmerPartition {
|
|||||||
if !unitigs_path.exists() { continue; }
|
if !unitigs_path.exists() { continue; }
|
||||||
|
|
||||||
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
let reader = UnitigFileReader::open_sequential(&unitigs_path)?;
|
||||||
let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
|
let src_data = SrcLayerData::open(&src_layer_dir, mode)?;
|
||||||
|
|
||||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
let row = src_data.lookup(kmer, n_genomes);
|
let row = src_data.lookup(kmer, n_genomes);
|
||||||
|
|||||||
+54
-42
@@ -1,7 +1,8 @@
|
|||||||
//! Compact 2-bit kmer stored as a left-aligned u64.
|
//! Compact 2-bit kmer stored as a left-aligned [`RawKmer`].
|
||||||
//!
|
//!
|
||||||
//! Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i.
|
//! Nucleotide 0 occupies bits KMER_BITS−1 and KMER_BITS−2, nucleotide i occupies
|
||||||
//! The low 64−2·len bits are always zero.
|
//! bits KMER_BITS−1−2i and KMER_BITS−2−2i.
|
||||||
|
//! The low `KMER_BITS − 2·len` bits are always zero.
|
||||||
//!
|
//!
|
||||||
//! The length is not stored in the struct — it is supplied by the [`KmerLength`]
|
//! The length is not stored in the struct — it is supplied by the [`KmerLength`]
|
||||||
//! type parameter. Two public marker types cover the common cases:
|
//! type parameter. Two public marker types cover the common cases:
|
||||||
@@ -15,6 +16,17 @@
|
|||||||
//! Tests that need a fixed length independent of the global singletons can use
|
//! Tests that need a fixed length independent of the global singletons can use
|
||||||
//! [`ConstLen<N>`].
|
//! [`ConstLen<N>`].
|
||||||
|
|
||||||
|
/// Underlying storage type for a raw (2-bit-per-base, left-aligned) kmer.
|
||||||
|
///
|
||||||
|
/// All kmer bit-width constants (`KMER_BITS`) and shift calculations are derived
|
||||||
|
/// from this type. To extend the maximum supported k beyond 32, change this alias
|
||||||
|
/// to `u128` and update `KMER_BITS` — callers that use `RawKmer` directly will
|
||||||
|
/// then pick up the wider type automatically.
|
||||||
|
pub type RawKmer = u64;
|
||||||
|
|
||||||
|
/// Bit width of [`RawKmer`]: the number of bits available for nucleotide encoding.
|
||||||
|
pub const KMER_BITS: usize = 64;
|
||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
@@ -109,35 +121,35 @@ impl Annotation for KmerAnnotation {}
|
|||||||
|
|
||||||
// ── KmerOf ────────────────────────────────────────────────────────────────────
|
// ── KmerOf ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// A DNA kmer of length `L::len()` encoded as a left-aligned u64 (2 bits/nucleotide, MSB-first).
|
/// A DNA kmer of length `L::len()` encoded as a left-aligned [`RawKmer`] (2 bits/nucleotide, MSB-first).
|
||||||
/// The low `64 − 2·L::len()` bits are always zero.
|
/// The low `KMER_BITS − 2·L::len()` bits are always zero.
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct KmerOf<L: KmerLength>(u64, PhantomData<L>);
|
pub struct KmerOf<L: KmerLength>(RawKmer, PhantomData<L>);
|
||||||
|
|
||||||
impl<L: KmerLength> KmerOf<L> {
|
impl<L: KmerLength> KmerOf<L> {
|
||||||
/// Wrap a raw left-aligned u64 value as a kmer.
|
/// Wrap a raw left-aligned [`RawKmer`] value as a kmer.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub const fn from_raw(raw: u64) -> Self {
|
pub const fn from_raw(raw: RawKmer) -> Self {
|
||||||
KmerOf(raw, PhantomData)
|
KmerOf(raw, PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wrap a raw right-aligned u64 value, shifting it into left-aligned position.
|
/// Wrap a raw right-aligned [`RawKmer`] value, shifting it into left-aligned position.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn from_raw_right(raw: u64) -> Self {
|
pub fn from_raw_right(raw: RawKmer) -> Self {
|
||||||
KmerOf(raw << (64 - 2 * L::len()), PhantomData)
|
KmerOf(raw << (KMER_BITS - 2 * L::len()), PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the raw left-aligned u64 value.
|
/// Return the raw left-aligned [`RawKmer`] value.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn raw(&self) -> u64 {
|
pub fn raw(&self) -> RawKmer {
|
||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the raw right-aligned u64 value.
|
/// Return the raw right-aligned [`RawKmer`] value.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn raw_right(&self) -> u64 {
|
pub fn raw_right(&self) -> RawKmer {
|
||||||
self.0 >> (64 - 2 * L::len())
|
self.0 >> (KMER_BITS - 2 * L::len())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Encode the first `L::len()` nucleotides of an ASCII slice into a kmer.
|
/// Encode the first `L::len()` nucleotides of an ASCII slice into a kmer.
|
||||||
@@ -153,11 +165,11 @@ impl<L: KmerLength> KmerOf<L> {
|
|||||||
seql: ascii.len(),
|
seql: ascii.len(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
let mut val = 0u64;
|
let mut val: RawKmer = 0;
|
||||||
for i in 0..k {
|
for i in 0..k {
|
||||||
val = (val << 2) | encode_base(ascii[i]) as u64;
|
val = (val << 2) | encode_base(ascii[i]) as RawKmer;
|
||||||
}
|
}
|
||||||
Ok(KmerOf(val << (64 - 2 * k), PhantomData))
|
Ok(KmerOf(val << (KMER_BITS - 2 * k), PhantomData))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decode into a freshly allocated ASCII `Vec<u8>`.
|
/// Decode into a freshly allocated ASCII `Vec<u8>`.
|
||||||
@@ -193,27 +205,27 @@ impl<L: KmerLength> KmerOf<L> {
|
|||||||
let x = x.swap_bytes();
|
let x = x.swap_bytes();
|
||||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4);
|
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4);
|
||||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2);
|
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2);
|
||||||
KmerOf(x << (64 - 2 * k), PhantomData)
|
KmerOf(x << (KMER_BITS - 2 * k), PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Slide the window one base to the right: drop nucleotide 0, append `nuc` at position `L::len()-1`.
|
/// Slide the window one base to the right: drop nucleotide 0, append `nuc` at position `L::len()-1`.
|
||||||
pub fn push_right(self, nuc: u8) -> Self {
|
pub fn push_right(self, nuc: u8) -> Self {
|
||||||
let k = L::len();
|
let k = L::len();
|
||||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1)));
|
||||||
KmerOf(shifted | ((nuc as u64 & 3) << (64 - 2 * k)), PhantomData)
|
KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2 * k)), PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Slide the window one base to the left: drop nucleotide `L::len()-1`, prepend `nuc` at position 0.
|
/// Slide the window one base to the left: drop nucleotide `L::len()-1`, prepend `nuc` at position 0.
|
||||||
pub fn push_left(self, nuc: u8) -> Self {
|
pub fn push_left(self, nuc: u8) -> Self {
|
||||||
let k = L::len();
|
let k = L::len();
|
||||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k));
|
||||||
KmerOf(shifted | ((nuc as u64 & 3) << 62), PhantomData)
|
KmerOf(shifted | ((nuc as RawKmer & 3) << (KMER_BITS - 2)), PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if the last `L::len()-1` nucleotides of `self` equal the first `L::len()-1` of `other`.
|
/// Returns `true` if the last `L::len()-1` nucleotides of `self` equal the first `L::len()-1` of `other`.
|
||||||
pub fn is_overlapping(self, other: Self) -> bool {
|
pub fn is_overlapping(self, other: Self) -> bool {
|
||||||
let k = L::len();
|
let k = L::len();
|
||||||
let mask = !0u64 << (64 - 2 * (k - 1));
|
let mask = RawKmer::MAX << (KMER_BITS - 2 * (k - 1));
|
||||||
(self.0 << 2 & mask) == (other.0 & mask)
|
(self.0 << 2 & mask) == (other.0 & mask)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -231,7 +243,7 @@ impl<L: KmerLength> Sequence for KmerOf<L> {
|
|||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn nucleotide(&self, i: usize) -> u8 {
|
fn nucleotide(&self, i: usize) -> u8 {
|
||||||
((self.0 >> (62 - 2 * i)) & 0b11) as u8
|
((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -255,21 +267,21 @@ impl<L: KmerLength> Sequence for KmerOf<L> {
|
|||||||
/// [`CanonicalKmerOf::from_raw_unchecked`] (trusted paths such as deserialisation).
|
/// [`CanonicalKmerOf::from_raw_unchecked`] (trusted paths such as deserialisation).
|
||||||
#[repr(transparent)]
|
#[repr(transparent)]
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub struct CanonicalKmerOf<L: KmerLength>(u64, PhantomData<L>);
|
pub struct CanonicalKmerOf<L: KmerLength>(RawKmer, PhantomData<L>);
|
||||||
|
|
||||||
impl<L: KmerLength> CanonicalKmerOf<L> {
|
impl<L: KmerLength> CanonicalKmerOf<L> {
|
||||||
/// Wrap a raw left-aligned u64 without verifying the canonical invariant.
|
/// Wrap a raw left-aligned [`RawKmer`] without verifying the canonical invariant.
|
||||||
///
|
///
|
||||||
/// # Safety (logical)
|
/// # Safety (logical)
|
||||||
/// The caller must guarantee `raw == min(raw, revcomp(raw))`.
|
/// The caller must guarantee `raw == min(raw, revcomp(raw))`.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn from_raw_unchecked(raw: u64) -> Self {
|
pub fn from_raw_unchecked(raw: RawKmer) -> Self {
|
||||||
CanonicalKmerOf(raw, PhantomData)
|
CanonicalKmerOf(raw, PhantomData)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the raw left-aligned u64 value.
|
/// Return the raw left-aligned [`RawKmer`] value.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn raw(&self) -> u64 {
|
pub fn raw(&self) -> RawKmer {
|
||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -307,25 +319,25 @@ impl<L: KmerLength> CanonicalKmerOf<L> {
|
|||||||
/// Return the four left canonical neighbours (each already canonical).
|
/// Return the four left canonical neighbours (each already canonical).
|
||||||
pub fn left_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
pub fn left_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
||||||
let k = L::len();
|
let k = L::len();
|
||||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
let shifted = (self.0 >> 2) & (RawKmer::MAX << (KMER_BITS - 2 * k));
|
||||||
[
|
[
|
||||||
KmerOf::<L>(shifted, PhantomData).canonical(),
|
KmerOf::<L>(shifted, PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (1u64 << 62), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((1 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (2u64 << 62), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((2 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (3u64 << 62), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((3 as RawKmer) << (KMER_BITS - 2)), PhantomData).canonical(),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the four right canonical neighbours (each already canonical).
|
/// Return the four right canonical neighbours (each already canonical).
|
||||||
pub fn right_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
pub fn right_canonical_neighbors(&self) -> [CanonicalKmerOf<L>; 4] {
|
||||||
let k = L::len();
|
let k = L::len();
|
||||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
let shifted = self.0 << 2 & (RawKmer::MAX << (KMER_BITS - 2 * (k - 1)));
|
||||||
let shift = 64 - 2 * k;
|
let shift = KMER_BITS - 2 * k;
|
||||||
[
|
[
|
||||||
KmerOf::<L>(shifted, PhantomData).canonical(),
|
KmerOf::<L>(shifted, PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (1u64 << shift), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((1 as RawKmer) << shift), PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (2u64 << shift), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((2 as RawKmer) << shift), PhantomData).canonical(),
|
||||||
KmerOf::<L>(shifted | (3u64 << shift), PhantomData).canonical(),
|
KmerOf::<L>(shifted | ((3 as RawKmer) << shift), PhantomData).canonical(),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -349,7 +361,7 @@ impl<L: KmerLength> Sequence for CanonicalKmerOf<L> {
|
|||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn nucleotide(&self, i: usize) -> u8 {
|
fn nucleotide(&self, i: usize) -> u8 {
|
||||||
((self.0 >> (62 - 2 * i)) & 0b11) as u8
|
((self.0 >> (KMER_BITS - 2 - 2 * i)) & 0b11) as u8
|
||||||
}
|
}
|
||||||
|
|
||||||
fn canonical(&self) -> Self::Canonical {
|
fn canonical(&self) -> Self::Canonical {
|
||||||
@@ -386,7 +398,7 @@ pub type Minimizer = CanonicalKmerOf<MLen>;
|
|||||||
/// 64-bit representation directly, which is useful when the canonical
|
/// 64-bit representation directly, which is useful when the canonical
|
||||||
/// invariant is not required or has already been handled.
|
/// invariant is not required or has already been handled.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn hash_kmer(raw: u64) -> u64 {
|
pub fn hash_kmer(raw: RawKmer) -> u64 {
|
||||||
mix64(raw ^ 0x9e3779b97f4a7c15)
|
mix64(raw ^ 0x9e3779b97f4a7c15)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ pub mod superkmer;
|
|||||||
pub mod unitig;
|
pub mod unitig;
|
||||||
|
|
||||||
pub use annotations::Annotation;
|
pub use annotations::Annotation;
|
||||||
pub use kmer::{CanonicalKmer, Kmer, Minimizer, hash_kmer};
|
pub use kmer::{CanonicalKmer, Kmer, Minimizer, RawKmer, KMER_BITS, hash_kmer};
|
||||||
pub use packed_seq::MAX_KMERS_PER_CHUNK;
|
pub use packed_seq::MAX_KMERS_PER_CHUNK;
|
||||||
pub use params::{k, m, set_k, set_m};
|
pub use params::{k, m, set_k, set_m};
|
||||||
pub use routable::RoutableSuperKmer;
|
pub use routable::RoutableSuperKmer;
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ use bitvec::prelude::*;
|
|||||||
|
|
||||||
use crate::Sequence;
|
use crate::Sequence;
|
||||||
use crate::encoding::{DEC4, encode_base};
|
use crate::encoding::{DEC4, encode_base};
|
||||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer};
|
use crate::kmer::{CanonicalKmer, Kmer, KmerError, KLen, KmerLength, KmerOf, MLen, Minimizer, RawKmer, KMER_BITS};
|
||||||
use crate::params::k;
|
use crate::params::k;
|
||||||
use crate::revcomp_lookup::REVCOMP4;
|
use crate::revcomp_lookup::REVCOMP4;
|
||||||
|
|
||||||
@@ -198,8 +198,8 @@ impl PackedSeq {
|
|||||||
return Err(KmerError::OutOfBounds { position: i, k: len, seql });
|
return Err(KmerError::OutOfBounds { position: i, k: len, seql });
|
||||||
}
|
}
|
||||||
let bits = self.seq.view_bits::<Msb0>();
|
let bits = self.seq.view_bits::<Msb0>();
|
||||||
let raw: u64 = bits[i * 2..(i + len) * 2].load_be();
|
let raw: RawKmer = bits[i * 2..(i + len) * 2].load_be();
|
||||||
Ok(KmerOf::from_raw(raw << (64 - 2 * len)))
|
Ok(KmerOf::from_raw(raw << (KMER_BITS - 2 * len)))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract the kmer of length `params::k()` at nucleotide position `i`. Zero allocation.
|
/// Extract the kmer of length `params::k()` at nucleotide position `i`. Zero allocation.
|
||||||
@@ -322,9 +322,9 @@ impl AsRef<PackedSeq> for PackedSeq {
|
|||||||
/// sequence into the iterator, so no reference escapes the closure.
|
/// sequence into the iterator, so no reference escapes the closure.
|
||||||
pub struct PackedSeqKmerIter<S> {
|
pub struct PackedSeqKmerIter<S> {
|
||||||
seq: S,
|
seq: S,
|
||||||
mask: u64,
|
mask: RawKmer,
|
||||||
lshift: usize,
|
lshift: usize,
|
||||||
current: u64,
|
current: RawKmer,
|
||||||
pos: usize,
|
pos: usize,
|
||||||
max_pos: usize,
|
max_pos: usize,
|
||||||
}
|
}
|
||||||
@@ -341,8 +341,8 @@ impl<S: AsRef<PackedSeq>> PackedSeqKmerIter<S> {
|
|||||||
let ps = seq.as_ref();
|
let ps = seq.as_ref();
|
||||||
let seql = ps.seql();
|
let seql = ps.seql();
|
||||||
let klen = k();
|
let klen = k();
|
||||||
let lshift = 64 - klen * 2;
|
let lshift = KMER_BITS - klen * 2;
|
||||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
let mask = ((!0u128) << (lshift + 2)) as RawKmer;
|
||||||
let current = if seql >= klen {
|
let current = if seql >= klen {
|
||||||
ps.extract::<KLen>(0).map(|km| km.raw()).unwrap_or(0)
|
ps.extract::<KLen>(0).map(|km| km.raw()).unwrap_or(0)
|
||||||
} else {
|
} else {
|
||||||
@@ -362,7 +362,7 @@ impl<S: AsRef<PackedSeq>> Iterator for PackedSeqKmerIter<S> {
|
|||||||
let result = Kmer::from_raw(self.current);
|
let result = Kmer::from_raw(self.current);
|
||||||
if self.pos < self.max_pos {
|
if self.pos < self.max_pos {
|
||||||
let inner_shift = 6 - 2 * (self.pos & 3);
|
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||||
let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as u64;
|
let nuc = ((self.seq.as_ref().seq[self.pos / 4] >> inner_shift) & 3) as RawKmer;
|
||||||
self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift);
|
self.current = ((self.current << 2) & self.mask) | (nuc << self.lshift);
|
||||||
}
|
}
|
||||||
self.pos += 1;
|
self.pos += 1;
|
||||||
|
|||||||
@@ -12,4 +12,4 @@ pub use layer::{Hit, Layer, LayerData};
|
|||||||
pub use layered_store::LayeredStore;
|
pub use layered_store::LayeredStore;
|
||||||
pub use map::LayeredMap;
|
pub use map::LayeredMap;
|
||||||
pub use meta::{IndexMode, PartitionMeta};
|
pub use meta::{IndexMode, PartitionMeta};
|
||||||
pub use mphf_layer::MphfLayer;
|
pub use mphf_layer::{MphfLayer, MphfOnly};
|
||||||
|
|||||||
@@ -118,7 +118,7 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
LayerEvidence::Approx { unitigs_path, .. } => {
|
LayerEvidence::Approx { unitigs_path, .. } => {
|
||||||
let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?;
|
let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?;
|
||||||
for stored in reader.iter_canonical_kmers() {
|
for (stored, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
if self.mphf.index(&stored.raw()) == slot {
|
if self.mphf.index(&stored.raw()) == slot {
|
||||||
return if stored == kmer { Some(slot) } else { None };
|
return if stored == kmer { Some(slot) } else { None };
|
||||||
}
|
}
|
||||||
@@ -129,7 +129,31 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn n(&self) -> usize { self.n }
|
pub fn n(&self) -> usize { self.n }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── MphfOnly ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight wrapper that loads only the MPHF file, without evidence or unitigs.
|
||||||
|
///
|
||||||
|
/// Use this when the caller guarantees that all queried kmers are in the MPHF
|
||||||
|
/// domain (e.g. when iterating the source's own unitigs during merge).
|
||||||
|
pub struct MphfOnly(Mphf);
|
||||||
|
|
||||||
|
impl MphfOnly {
|
||||||
|
pub fn open(dir: &Path) -> OLMResult<Self> {
|
||||||
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
||||||
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
|
Ok(Self(mphf))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the slot for `kmer`. Only valid when `kmer` is in the MPHF domain.
|
||||||
|
#[inline]
|
||||||
|
pub fn index(&self, kmer: CanonicalKmer) -> usize {
|
||||||
|
self.0.index(&kmer.raw())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MphfLayer {
|
||||||
// ── Build helpers ─────────────────────────────────────────────────────────
|
// ── Build helpers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||||
@@ -196,7 +220,7 @@ impl MphfLayer {
|
|||||||
|
|
||||||
let mut fw = FingerprintVecWriter::new(n, b);
|
let mut fw = FingerprintVecWriter::new(n, b);
|
||||||
|
|
||||||
for kmer in unitigs.iter_canonical_kmers() {
|
for (kmer, _, _) in unitigs.iter_indexed_canonical_kmers() {
|
||||||
let slot = mphf.index(&kmer.raw());
|
let slot = mphf.index(&kmer.raw());
|
||||||
if slot >= n {
|
if slot >= n {
|
||||||
return Err(OLMError::Mphf("slot out of bounds".into()));
|
return Err(OLMError::Mphf("slot out of bounds".into()));
|
||||||
@@ -281,7 +305,7 @@ impl MphfLayer {
|
|||||||
|
|
||||||
IndexMode::Approx { b, .. } => {
|
IndexMode::Approx { b, .. } => {
|
||||||
let mut fw = FingerprintVecWriter::new(n, *b);
|
let mut fw = FingerprintVecWriter::new(n, *b);
|
||||||
for kmer in unitigs2.iter_canonical_kmers() {
|
for (kmer, _, _) in unitigs2.iter_indexed_canonical_kmers() {
|
||||||
let slot = mphf.index(&kmer.raw());
|
let slot = mphf.index(&kmer.raw());
|
||||||
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
|
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
|
||||||
let byte = slot / 8; let bit = 1u8 << (slot % 8);
|
let byte = slot / 8; let bit = 1u8 << (slot % 8);
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use super::*;
|
|||||||
use obicompactvec::PersistentCompactIntMatrix;
|
use obicompactvec::PersistentCompactIntMatrix;
|
||||||
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||||
use obiskio::DEFAULT_BLOCK_BITS;
|
use obiskio::DEFAULT_BLOCK_BITS;
|
||||||
use crate::meta::EvidenceKind;
|
use crate::meta::IndexMode;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
||||||
@@ -15,7 +15,8 @@ fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
|||||||
|
|
||||||
fn all_canonical_kmers(dir: &Path) -> Vec<CanonicalKmer> {
|
fn all_canonical_kmers(dir: &Path) -> Vec<CanonicalKmer> {
|
||||||
UnitigFileReader::open_sequential(&dir.join(UNITIGS_FILE)).unwrap()
|
UnitigFileReader::open_sequential(&dir.join(UNITIGS_FILE)).unwrap()
|
||||||
.iter_canonical_kmers()
|
.iter_indexed_canonical_kmers()
|
||||||
|
.map(|(kmer, _, _)| kmer)
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,8 +26,8 @@ fn build_and_query_all_kmers_found() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
let kmers = all_canonical_kmers(dir.path());
|
let kmers = all_canonical_kmers(dir.path());
|
||||||
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap();
|
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap();
|
||||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap();
|
||||||
for kmer in kmers {
|
for kmer in kmers {
|
||||||
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
assert!(layer.query(kmer).is_some(), "kmer should be present");
|
||||||
}
|
}
|
||||||
@@ -43,10 +44,10 @@ fn counts_are_stored_and_retrieved() {
|
|||||||
Layer::<PersistentCompactIntMatrix>::build(
|
Layer::<PersistentCompactIntMatrix>::build(
|
||||||
dir.path(),
|
dir.path(),
|
||||||
DEFAULT_BLOCK_BITS,
|
DEFAULT_BLOCK_BITS,
|
||||||
&EvidenceKind::Exact,
|
&IndexMode::Exact,
|
||||||
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
|kmer| count_map.get(&kmer).copied().unwrap_or(0),
|
||||||
).unwrap();
|
).unwrap();
|
||||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path(), &IndexMode::Exact).unwrap();
|
||||||
for kmer in &kmers {
|
for kmer in &kmers {
|
||||||
let hit = layer.query(*kmer).expect("kmer must be present");
|
let hit = layer.query(*kmer).expect("kmer must be present");
|
||||||
assert_eq!(hit.data[0], count_map[kmer]);
|
assert_eq!(hit.data[0], count_map[kmer]);
|
||||||
@@ -58,8 +59,8 @@ fn query_absent_returns_none() {
|
|||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact).unwrap();
|
Layer::<()>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact).unwrap();
|
||||||
let layer = Layer::<()>::open(dir.path()).unwrap();
|
let layer = Layer::<()>::open(dir.path(), &IndexMode::Exact).unwrap();
|
||||||
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
let absent = Kmer::from_ascii(b"CCCC").unwrap().canonical();
|
||||||
assert!(layer.query(absent).is_none());
|
assert!(layer.query(absent).is_none());
|
||||||
}
|
}
|
||||||
@@ -69,9 +70,9 @@ fn open_after_build_is_consistent() {
|
|||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
write_unitigs(dir.path(), &[b"AAAACGT"]);
|
||||||
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, |_| 7).unwrap();
|
let n = Layer::<PersistentCompactIntMatrix>::build(dir.path(), DEFAULT_BLOCK_BITS, &IndexMode::Exact, |_| 7).unwrap();
|
||||||
assert_eq!(n, 4);
|
assert_eq!(n, 4);
|
||||||
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
let layer = Layer::<PersistentCompactIntMatrix>::open(dir.path(), &IndexMode::Exact).unwrap();
|
||||||
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
let kmer = Kmer::from_ascii(b"AAAA").unwrap().canonical();
|
||||||
let hit = layer.query(kmer).expect("AAAA must be present");
|
let hit = layer.query(kmer).expect("AAAA must be present");
|
||||||
assert_eq!(hit.data[0], 7);
|
assert_eq!(hit.data[0], 7);
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
use obicompactvec::PersistentCompactIntMatrix;
|
use obicompactvec::PersistentCompactIntMatrix;
|
||||||
use obikseq::{set_k, Sequence as _, Unitig};
|
use obikseq::{set_k, Sequence as _, Unitig};
|
||||||
|
use crate::meta::IndexMode;
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
fn push_unitigs_and_layer(
|
fn push_unitigs_and_layer(
|
||||||
@@ -24,7 +25,7 @@ fn canonical(ascii: &[u8]) -> CanonicalKmer {
|
|||||||
fn create_empty_map() {
|
fn create_empty_map() {
|
||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let map = LayeredMap::<()>::create(dir.path()).unwrap();
|
let map = LayeredMap::<()>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
assert_eq!(map.n_layers(), 0);
|
assert_eq!(map.n_layers(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -33,7 +34,7 @@ fn open_reloads_layer_count() {
|
|||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
{
|
{
|
||||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||||
}
|
}
|
||||||
let map = LayeredMap::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
let map = LayeredMap::<PersistentCompactIntMatrix>::open(dir.path()).unwrap();
|
||||||
@@ -44,7 +45,7 @@ fn open_reloads_layer_count() {
|
|||||||
fn query_finds_kmer_in_layer_zero() {
|
fn query_finds_kmer_in_layer_zero() {
|
||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
|
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 3);
|
||||||
let kmer = canonical(b"AAAC");
|
let kmer = canonical(b"AAAC");
|
||||||
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
|
let (layer_idx, hit) = map.query(kmer).expect("kmer must be found");
|
||||||
@@ -56,7 +57,7 @@ fn query_finds_kmer_in_layer_zero() {
|
|||||||
fn query_finds_kmer_in_correct_layer() {
|
fn query_finds_kmer_in_correct_layer() {
|
||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||||
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
|
push_unitigs_and_layer(&mut map, &[b"GGGACGT"], 2);
|
||||||
assert_eq!(map.n_layers(), 2);
|
assert_eq!(map.n_layers(), 2);
|
||||||
@@ -74,7 +75,7 @@ fn query_finds_kmer_in_correct_layer() {
|
|||||||
fn query_absent_returns_none() {
|
fn query_absent_returns_none() {
|
||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
push_unitigs_and_layer(&mut map, &[b"AAAACGT"], 1);
|
||||||
let absent = canonical(b"CCCC");
|
let absent = canonical(b"CCCC");
|
||||||
assert!(map.query(absent).is_none());
|
assert!(map.query(absent).is_none());
|
||||||
@@ -84,7 +85,7 @@ fn query_absent_returns_none() {
|
|||||||
fn push_layer_from_map_convenience() {
|
fn push_layer_from_map_convenience() {
|
||||||
set_k(4);
|
set_k(4);
|
||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path()).unwrap();
|
let mut map = LayeredMap::<PersistentCompactIntMatrix>::create(dir.path(), IndexMode::Exact).unwrap();
|
||||||
let mut w = map.next_layer_writer().unwrap();
|
let mut w = map.next_layer_writer().unwrap();
|
||||||
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
|
w.write(&Unitig::from_ascii(b"AAAACGT")).unwrap();
|
||||||
w.close().unwrap();
|
w.close().unwrap();
|
||||||
|
|||||||
@@ -181,30 +181,6 @@ fn iter_kmers_two_chunks_order() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── iter_canonical_kmers ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn iter_canonical_kmers_all_canonical() {
|
|
||||||
set_k(4);
|
|
||||||
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
|
||||||
for kmer in r.iter_canonical_kmers() {
|
|
||||||
// canonical of a canonical kmer is itself
|
|
||||||
assert_eq!(kmer.raw(), kmer.canonical().raw());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn iter_canonical_kmers_matches_iter_kmers() {
|
|
||||||
set_k(4);
|
|
||||||
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
|
||||||
let canonical: Vec<CanonicalKmer> = r.iter_canonical_kmers().collect();
|
|
||||||
let raw: Vec<Kmer> = r.iter_kmers().collect();
|
|
||||||
assert_eq!(canonical.len(), raw.len());
|
|
||||||
for (ck, rk) in canonical.iter().zip(raw.iter()) {
|
|
||||||
assert_eq!(ck.raw(), rk.canonical().raw());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── iter_indexed_canonical_kmers ──────────────────────────────────────────────
|
// ── iter_indexed_canonical_kmers ──────────────────────────────────────────────
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -370,11 +370,6 @@ impl UnitigFileReader {
|
|||||||
.flat_map(|(_, u)| u.into_kmers())
|
.flat_map(|(_, u)| u.into_kmers())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
|
||||||
self.iter_chunks_sequential()
|
|
||||||
.flat_map(|(_, u)| u.into_canonical_kmers())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn iter_indexed_canonical_kmers(
|
pub fn iter_indexed_canonical_kmers(
|
||||||
&self,
|
&self,
|
||||||
) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
|
) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
|
||||||
|
|||||||
Reference in New Issue
Block a user