feat: add kmer iterators and optimize layered map performance
Replace `ph` with `ptr_hash` and introduce `epserde` and `rayon` dependencies. Refactor MPHF construction to leverage parallel iteration, eliminating intermediate `Vec<u64>` allocations and reducing memory footprint. Add an `n_kmers` field to track and serialize total kmer counts, alongside three zero-allocation iterators for efficient chunk traversal. Include comprehensive unit tests for the new iterators and update CLAUDE.md to enforce explicit dependency validation policies.
This commit is contained in:
@@ -5,6 +5,9 @@ Tu es ma base de connaissance et mon bloc-notes intelligent sur le projet **obik
|
|||||||
**Règle absolue : une question appelle une réponse, pas une action.**
|
**Règle absolue : une question appelle une réponse, pas une action.**
|
||||||
Ne modifier aucun fichier à moins d'une demande explicite de modification. En particulier : observer un bug ou une incohérence dans le code montré ne constitue pas un mandat pour le corriger. Le code montré peut refléter une intention en cours — modifier sans mandat risque d'introduire un vrai bug là où tu croyais corriger.
|
Ne modifier aucun fichier à moins d'une demande explicite de modification. En particulier : observer un bug ou une incohérence dans le code montré ne constitue pas un mandat pour le corriger. Le code montré peut refléter une intention en cours — modifier sans mandat risque d'introduire un vrai bug là où tu croyais corriger.
|
||||||
|
|
||||||
|
**Règle absolue : ne jamais substituer une dépendance ou une bibliothèque sans validation explicite.**
|
||||||
|
Si une dépendance demandée pose problème (erreur de compilation, bug, API manquante), exposer le problème et proposer des alternatives — ne jamais switcher silencieusement vers une autre bibliothèque. Le choix des dépendances est une décision d'architecture qui appartient au développeur.
|
||||||
|
|
||||||
Tu maintiens en **anglais**, dense et sans remplissage, les documents suivants :
|
Tu maintiens en **anglais**, dense et sans remplissage, les documents suivants :
|
||||||
- `docmd/index.md` — document de discussion de base, enrichi progressivement au fil de nos échanges ; il reflète l'état courant de la réflexion sur le projet
|
- `docmd/index.md` — document de discussion de base, enrichi progressivement au fil de nos échanges ; il reflète l'état courant de la réflexion sur le projet
|
||||||
- les autres fichiers Markdown dans `docmd/` selon leur thème respectif
|
- les autres fichiers Markdown dans `docmd/` selon leur thème respectif
|
||||||
|
|||||||
Generated
+145
-8
@@ -314,6 +314,17 @@ dependencies = [
|
|||||||
"pkg-config",
|
"pkg-config",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cacheline-ef"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "af737c6c59cb018ecbe6472cbdf86d39c59d78252febfe311953a991b6e4ed85"
|
||||||
|
dependencies = [
|
||||||
|
"common_traits 0.11.4",
|
||||||
|
"epserde 0.8.0",
|
||||||
|
"mem_dbg",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cast"
|
name = "cast"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
@@ -437,6 +448,15 @@ version = "1.0.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colored"
|
||||||
|
version = "3.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys 0.61.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "combine"
|
name = "combine"
|
||||||
version = "4.6.7"
|
version = "4.6.7"
|
||||||
@@ -447,6 +467,17 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "common_traits"
|
||||||
|
version = "0.11.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fda9ae1f26adcae83adb2e92f69cf59421f2a277a942f49f8e59f2fcbd7cf062"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"half",
|
||||||
|
"impl-tools 0.10.3",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "common_traits"
|
name = "common_traits"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
@@ -455,7 +486,7 @@ checksum = "65d0a1296e8d359cb197a8f8289f3d3f77cdb67f1a83d0aeb0820a5b7aea4058"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"half",
|
"half",
|
||||||
"impl-tools",
|
"impl-tools 0.11.4",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -751,6 +782,24 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "epserde"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c40d342ff20a2ce62d9a85ce406e672dfa137f902ac9670034533184f1533976"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"bitflags 2.11.1",
|
||||||
|
"common_traits 0.11.4",
|
||||||
|
"epserde-derive 0.8.0",
|
||||||
|
"maligned",
|
||||||
|
"mem_dbg",
|
||||||
|
"mmap-rs",
|
||||||
|
"sealed",
|
||||||
|
"thiserror 2.0.18",
|
||||||
|
"xxhash-rust",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "epserde"
|
name = "epserde"
|
||||||
version = "0.11.5"
|
version = "0.11.5"
|
||||||
@@ -759,8 +808,8 @@ checksum = "d8dffc01a379703ad5178f47a22aa532f5811b3ef45979ccd66b79da9856770b"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bitflags 2.11.1",
|
"bitflags 2.11.1",
|
||||||
"common_traits",
|
"common_traits 0.12.1",
|
||||||
"epserde-derive",
|
"epserde-derive 0.11.0",
|
||||||
"mem_dbg",
|
"mem_dbg",
|
||||||
"mmap-rs",
|
"mmap-rs",
|
||||||
"sealed",
|
"sealed",
|
||||||
@@ -768,6 +817,17 @@ dependencies = [
|
|||||||
"xxhash-rust",
|
"xxhash-rust",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "epserde-derive"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac80cc78b69765703f48ad93f33b8919cf5d907cda7459ad6ba2919cbbe605dd"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.117",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "epserde-derive"
|
name = "epserde-derive"
|
||||||
version = "0.11.0"
|
version = "0.11.0"
|
||||||
@@ -903,6 +963,15 @@ version = "2.0.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
|
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fxhash"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "generic-array"
|
name = "generic-array"
|
||||||
version = "0.14.7"
|
version = "0.14.7"
|
||||||
@@ -1101,6 +1170,18 @@ dependencies = [
|
|||||||
"icu_properties",
|
"icu_properties",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "impl-tools"
|
||||||
|
version = "0.10.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0ae95c9095c2f1126d7db785955c73cdc5fc33e7c3fa911bd4a42931672029a7"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg",
|
||||||
|
"impl-tools-lib",
|
||||||
|
"proc-macro-error2",
|
||||||
|
"syn 2.0.117",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "impl-tools"
|
name = "impl-tools"
|
||||||
version = "0.11.4"
|
version = "0.11.4"
|
||||||
@@ -1364,6 +1445,12 @@ dependencies = [
|
|||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "maligned"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7e88c3cbe8288f77f293e48a28b3232e3defd203a6d839fa7f68ea4329e83464"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "matchers"
|
name = "matchers"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
@@ -1380,6 +1467,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0"
|
checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bitflags 2.11.1",
|
"bitflags 2.11.1",
|
||||||
|
"maligned",
|
||||||
"mem_dbg-derive",
|
"mem_dbg-derive",
|
||||||
"mmap-rs",
|
"mmap-rs",
|
||||||
]
|
]
|
||||||
@@ -1653,10 +1741,12 @@ dependencies = [
|
|||||||
name = "obilayeredmap"
|
name = "obilayeredmap"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"epserde 0.8.0",
|
||||||
"memmap2",
|
"memmap2",
|
||||||
"obikseq",
|
"obikseq",
|
||||||
"obiskio",
|
"obiskio",
|
||||||
"ph",
|
"ptr_hash",
|
||||||
|
"rayon",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
@@ -1807,7 +1897,7 @@ dependencies = [
|
|||||||
"binout",
|
"binout",
|
||||||
"bitm",
|
"bitm",
|
||||||
"dyn_size_of",
|
"dyn_size_of",
|
||||||
"epserde",
|
"epserde 0.11.5",
|
||||||
"mem_dbg",
|
"mem_dbg",
|
||||||
"rayon",
|
"rayon",
|
||||||
"seedable_hash",
|
"seedable_hash",
|
||||||
@@ -2045,6 +2135,37 @@ dependencies = [
|
|||||||
"prost",
|
"prost",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ptr_hash"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4b4e4fb9c4c2ba3e5b060f53ef46afd3de37345b08e3ec0f2c65e0ca1d57ccbd"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"bitvec",
|
||||||
|
"cacheline-ef",
|
||||||
|
"clap",
|
||||||
|
"colored",
|
||||||
|
"common_traits 0.11.4",
|
||||||
|
"epserde 0.8.0",
|
||||||
|
"epserde-derive 0.8.0",
|
||||||
|
"fastrand",
|
||||||
|
"fxhash",
|
||||||
|
"itertools 0.14.0",
|
||||||
|
"lazy_static",
|
||||||
|
"log",
|
||||||
|
"mem_dbg",
|
||||||
|
"rand",
|
||||||
|
"rand_chacha",
|
||||||
|
"rayon",
|
||||||
|
"rdst",
|
||||||
|
"rustc-hash",
|
||||||
|
"serde",
|
||||||
|
"sucds",
|
||||||
|
"tempfile",
|
||||||
|
"xxhash-rust",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quote"
|
name = "quote"
|
||||||
version = "1.0.45"
|
version = "1.0.45"
|
||||||
@@ -2202,6 +2323,12 @@ version = "0.1.27"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
|
checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustc-hash"
|
||||||
|
version = "2.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustix"
|
name = "rustix"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
@@ -2393,6 +2520,16 @@ version = "2.6.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sucds"
|
||||||
|
version = "0.8.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cd324eaa05be64f105ea5269bb8aabd70e5dd57fa5c673b167f451b07d6c0dcd"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"num-traits",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sux"
|
name = "sux"
|
||||||
version = "0.10.3"
|
version = "0.10.3"
|
||||||
@@ -2404,16 +2541,16 @@ dependencies = [
|
|||||||
"arbitrary-chunks",
|
"arbitrary-chunks",
|
||||||
"bitflags 2.11.1",
|
"bitflags 2.11.1",
|
||||||
"clap",
|
"clap",
|
||||||
"common_traits",
|
"common_traits 0.12.1",
|
||||||
"crossbeam-channel",
|
"crossbeam-channel",
|
||||||
"derivative",
|
"derivative",
|
||||||
"derive_setters",
|
"derive_setters",
|
||||||
"dsi-progress-logger",
|
"dsi-progress-logger",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"epserde",
|
"epserde 0.11.5",
|
||||||
"fallible-iterator",
|
"fallible-iterator",
|
||||||
"flate2",
|
"flate2",
|
||||||
"impl-tools",
|
"impl-tools 0.11.4",
|
||||||
"itertools 0.14.0",
|
"itertools 0.14.0",
|
||||||
"jiff",
|
"jiff",
|
||||||
"lambert_w",
|
"lambert_w",
|
||||||
|
|||||||
@@ -6,7 +6,9 @@ edition = "2024"
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
obikseq = { path = "../obikseq" }
|
obikseq = { path = "../obikseq" }
|
||||||
obiskio = { path = "../obiskio" }
|
obiskio = { path = "../obiskio" }
|
||||||
ph = "0.11"
|
ptr_hash = "1.1"
|
||||||
|
epserde = "0.8"
|
||||||
|
rayon = "1"
|
||||||
memmap2 = "0.9"
|
memmap2 = "0.9"
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
|
|||||||
@@ -1,51 +1,61 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io::BufWriter;
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use obikseq::{CanonicalKmer, Kmer, Sequence};
|
use epserde::prelude::*;
|
||||||
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||||
use ph::fmph;
|
use ptr_hash::{DefaultPtrHash, PtrHashParams};
|
||||||
|
|
||||||
use crate::counts::{Counts, CountsWriter};
|
use crate::counts::{Counts, CountsWriter};
|
||||||
use crate::error::{OLMError, OLMResult};
|
use crate::error::{OLMError, OLMResult};
|
||||||
use crate::evidence::{Evidence, EvidenceWriter};
|
use crate::evidence::{Evidence, EvidenceWriter};
|
||||||
|
|
||||||
const MPHF_FILE: &str = "mphf.bin";
|
const MPHF_FILE: &str = "mphf.bin";
|
||||||
const UNITIGS_FILE: &str = "unitigs.bin";
|
const UNITIGS_FILE: &str = "unitigs.bin";
|
||||||
const EVIDENCE_FILE: &str = "evidence.bin";
|
const EVIDENCE_FILE: &str = "evidence.bin";
|
||||||
const COUNTS_FILE: &str = "counts.bin";
|
const COUNTS_FILE: &str = "counts.bin";
|
||||||
|
|
||||||
pub struct Layer {
|
pub struct Layer {
|
||||||
mphf: fmph::Function,
|
mphf: DefaultPtrHash,
|
||||||
evidence: Evidence,
|
evidence: Evidence,
|
||||||
unitigs: UnitigFileReader,
|
unitigs: UnitigFileReader,
|
||||||
counts: Counts,
|
counts: Counts,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Hit {
|
pub struct Hit {
|
||||||
pub slot: usize,
|
pub slot: usize,
|
||||||
pub count: u32,
|
pub count: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Layer {
|
impl Layer {
|
||||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||||
let mphf = fmph::Function::read(
|
let mphf: DefaultPtrHash = DefaultPtrHash::load_full(&path.join(MPHF_FILE))
|
||||||
&mut fs::File::open(path.join(MPHF_FILE))?
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
).map_err(OLMError::Io)?;
|
|
||||||
|
|
||||||
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
||||||
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
||||||
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
||||||
|
|
||||||
Ok(Self { mphf, evidence, unitigs, counts })
|
Ok(Self {
|
||||||
|
mphf,
|
||||||
|
evidence,
|
||||||
|
unitigs,
|
||||||
|
counts,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
||||||
let slot = self.mphf.get(&kmer.raw())? as usize;
|
let slot = self.mphf.index(&kmer.raw());
|
||||||
let (chunk_id, rank) = self.evidence.decode(slot);
|
let (chunk_id, rank) = self.evidence.decode(slot);
|
||||||
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
if self
|
||||||
Some(Hit { slot, count: self.counts.get(slot) })
|
.unitigs
|
||||||
|
.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer)
|
||||||
|
{
|
||||||
|
Some(Hit {
|
||||||
|
slot,
|
||||||
|
count: self.counts.get(slot),
|
||||||
|
})
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
@@ -55,43 +65,38 @@ impl Layer {
|
|||||||
///
|
///
|
||||||
/// `count_of` maps each canonical kmer to its occurrence count.
|
/// `count_of` maps each canonical kmer to its occurrence count.
|
||||||
/// Returns the number of kmers indexed.
|
/// Returns the number of kmers indexed.
|
||||||
pub fn build(
|
pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||||
out_dir: &Path,
|
use rayon::prelude::*;
|
||||||
count_of: impl Fn(CanonicalKmer) -> u32,
|
|
||||||
) -> OLMResult<usize> {
|
|
||||||
let k = obikseq::params::k();
|
|
||||||
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?;
|
||||||
|
let n = unitigs.n_kmers();
|
||||||
|
|
||||||
let mut entries: Vec<(u64, u32, u8)> = Vec::new();
|
|
||||||
for chunk_id in 0..unitigs.len() {
|
|
||||||
let n_kmers = unitigs.seql(chunk_id) - k + 1;
|
|
||||||
for rank in 0..n_kmers {
|
|
||||||
let raw = unitigs.raw_kmer(chunk_id, rank);
|
|
||||||
let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical();
|
|
||||||
entries.push((canonical.raw(), chunk_id as u32, rank as u8));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let n = entries.len();
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
||||||
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
||||||
let mphf = fmph::Function::new(Vec::<u64>::new());
|
let mphf: DefaultPtrHash = DefaultPtrHash::new(&[] as &[u64], PtrHashParams::default());
|
||||||
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
mphf.store(&out_dir.join(MPHF_FILE))
|
||||||
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
let keys: Vec<u64> = entries.iter().map(|(k, _, _)| *k).collect();
|
// Build MPHF from a cloneable parallel iterator — no Vec<u64> allocation.
|
||||||
let mphf = fmph::Function::new(keys);
|
// flat_map_iter: outer chunks in parallel, inner kmer sliding-window sequential.
|
||||||
mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?;
|
let keys = (0..unitigs.len())
|
||||||
|
.into_par_iter()
|
||||||
|
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
||||||
|
let mphf: DefaultPtrHash =
|
||||||
|
DefaultPtrHash::new_from_par_iter(n, keys, PtrHashParams::default());
|
||||||
|
mphf.store(&out_dir.join(MPHF_FILE))
|
||||||
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
|
|
||||||
|
// Second pass: fill evidence and counts
|
||||||
let mut ev = EvidenceWriter::new(n);
|
let mut ev = EvidenceWriter::new(n);
|
||||||
let mut cnt = CountsWriter::new(n);
|
let mut cnt = CountsWriter::new(n);
|
||||||
|
|
||||||
for (key, chunk_id, rank) in &entries {
|
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
|
||||||
let slot = mphf.get(key).unwrap() as usize;
|
let slot = mphf.index(&kmer.raw());
|
||||||
ev.set(slot, *chunk_id, *rank);
|
ev.set(slot, chunk_id as u32, rank as u8);
|
||||||
let kmer = CanonicalKmer::from_raw_unchecked(*key);
|
|
||||||
cnt.set(slot, count_of(kmer));
|
cnt.set(slot, count_of(kmer));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use super::*;
|
use super::*;
|
||||||
use obikseq::{set_k, Unitig};
|
use obikseq::{set_k, Kmer, Sequence as _, Unitig};
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
fn write_unitigs(dir: &Path, seqs: &[&[u8]]) {
|
||||||
|
|||||||
@@ -137,6 +137,108 @@ fn verify_second_unitig_second_position() {
|
|||||||
assert!(r.verify_canonical_kmer(1, 1, query));
|
assert!(r.verify_canonical_kmer(1, 1, query));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── iter_kmers ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_kmers_empty_file() {
|
||||||
|
set_k(4);
|
||||||
|
let dir = tempdir().unwrap();
|
||||||
|
let path = dir.path().join("unitigs.bin");
|
||||||
|
UnitigFileWriter::create(&path).unwrap().close().unwrap();
|
||||||
|
let r = UnitigFileReader::open(&path).unwrap();
|
||||||
|
assert_eq!(r.iter_kmers().count(), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_kmers_single_chunk_count_and_order() {
|
||||||
|
set_k(4);
|
||||||
|
// "AAAACG": 6 nucl → 3 kmers (k=4)
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG"]);
|
||||||
|
let kmers: Vec<Kmer> = r.iter_kmers().collect();
|
||||||
|
assert_eq!(kmers.len(), 3);
|
||||||
|
for (rank, kmer) in kmers.iter().enumerate() {
|
||||||
|
assert_eq!(kmer.raw(), r.raw_kmer(0, rank), "kmer mismatch at rank {rank}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_kmers_two_chunks_order() {
|
||||||
|
set_k(4);
|
||||||
|
// "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
||||||
|
let kmers: Vec<Kmer> = r.iter_kmers().collect();
|
||||||
|
assert_eq!(kmers.len(), 6);
|
||||||
|
// Chunk 0 first
|
||||||
|
for rank in 0..3 {
|
||||||
|
assert_eq!(kmers[rank].raw(), r.raw_kmer(0, rank));
|
||||||
|
}
|
||||||
|
// Chunk 1 after
|
||||||
|
for rank in 0..3 {
|
||||||
|
assert_eq!(kmers[3 + rank].raw(), r.raw_kmer(1, rank));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── iter_canonical_kmers ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_canonical_kmers_all_canonical() {
|
||||||
|
set_k(4);
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
||||||
|
for kmer in r.iter_canonical_kmers() {
|
||||||
|
// canonical of a canonical kmer is itself
|
||||||
|
assert_eq!(kmer.raw(), kmer.canonical().raw());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_canonical_kmers_matches_iter_kmers() {
|
||||||
|
set_k(4);
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
||||||
|
let canonical: Vec<CanonicalKmer> = r.iter_canonical_kmers().collect();
|
||||||
|
let raw: Vec<Kmer> = r.iter_kmers().collect();
|
||||||
|
assert_eq!(canonical.len(), raw.len());
|
||||||
|
for (ck, rk) in canonical.iter().zip(raw.iter()) {
|
||||||
|
assert_eq!(ck.raw(), rk.canonical().raw());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── iter_indexed_canonical_kmers ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_indexed_chunk_id_and_rank_single_chunk() {
|
||||||
|
set_k(4);
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG"]);
|
||||||
|
let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
|
||||||
|
assert_eq!(items.len(), 3);
|
||||||
|
for (rank, (kmer, chunk_id, item_rank)) in items.iter().enumerate() {
|
||||||
|
assert_eq!(*chunk_id, 0, "chunk_id must be 0");
|
||||||
|
assert_eq!(*item_rank, rank, "rank mismatch");
|
||||||
|
assert!(r.verify_canonical_kmer(0, rank, *kmer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn iter_indexed_chunk_id_and_rank_two_chunks() {
|
||||||
|
set_k(4);
|
||||||
|
let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]);
|
||||||
|
let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect();
|
||||||
|
assert_eq!(items.len(), 6);
|
||||||
|
// First 3 items: chunk_id=0, rank 0..=2
|
||||||
|
for rank in 0..3 {
|
||||||
|
let (kmer, chunk_id, item_rank) = items[rank];
|
||||||
|
assert_eq!(chunk_id, 0);
|
||||||
|
assert_eq!(item_rank, rank);
|
||||||
|
assert!(r.verify_canonical_kmer(0, rank, kmer));
|
||||||
|
}
|
||||||
|
// Next 3 items: chunk_id=1, rank resets to 0
|
||||||
|
for rank in 0..3 {
|
||||||
|
let (kmer, chunk_id, item_rank) = items[3 + rank];
|
||||||
|
assert_eq!(chunk_id, 1);
|
||||||
|
assert_eq!(item_rank, rank);
|
||||||
|
assert!(r.verify_canonical_kmer(1, rank, kmer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Splitting ─────────────────────────────────────────────────────────────────
|
// ── Splitting ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ use std::io::{BufWriter, Write as _};
|
|||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use obikseq::{CanonicalKmer, Unitig};
|
use obikseq::{CanonicalKmer, Kmer, Unitig};
|
||||||
|
|
||||||
pub use obikseq::MAX_KMERS_PER_CHUNK;
|
pub use obikseq::MAX_KMERS_PER_CHUNK;
|
||||||
|
|
||||||
@@ -13,6 +13,7 @@ use crate::error::{SKError, SKResult};
|
|||||||
//
|
//
|
||||||
// magic: [u8; 4] = b"UIDX"
|
// magic: [u8; 4] = b"UIDX"
|
||||||
// n_unitigs: u32 LE
|
// n_unitigs: u32 LE
|
||||||
|
// n_kmers: u64 LE total kmer count across all chunks
|
||||||
// seqls: [u8; n_unitigs] max kmer index per chunk (= n_kmers − 1)
|
// seqls: [u8; n_unitigs] max kmer index per chunk (= n_kmers − 1)
|
||||||
// packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the
|
// packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the
|
||||||
// sequence file; last entry is sentinel
|
// sequence file; last entry is sentinel
|
||||||
@@ -44,6 +45,7 @@ pub struct UnitigFileWriter {
|
|||||||
seqls: Vec<u8>,
|
seqls: Vec<u8>,
|
||||||
packed_offsets: Vec<u32>,
|
packed_offsets: Vec<u32>,
|
||||||
next_offset: u32,
|
next_offset: u32,
|
||||||
|
n_kmers: usize,
|
||||||
k: usize,
|
k: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -56,6 +58,7 @@ impl UnitigFileWriter {
|
|||||||
seqls: Vec::new(),
|
seqls: Vec::new(),
|
||||||
packed_offsets: Vec::new(),
|
packed_offsets: Vec::new(),
|
||||||
next_offset: 0,
|
next_offset: 0,
|
||||||
|
n_kmers: 0,
|
||||||
k: obikseq::params::k(),
|
k: obikseq::params::k(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -98,6 +101,7 @@ impl UnitigFileWriter {
|
|||||||
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
|
debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK");
|
||||||
self.packed_offsets.push(self.next_offset + 1);
|
self.packed_offsets.push(self.next_offset + 1);
|
||||||
self.seqls.push((seql - self.k) as u8);
|
self.seqls.push((seql - self.k) as u8);
|
||||||
|
self.n_kmers += seql - self.k + 1;
|
||||||
|
|
||||||
unitig
|
unitig
|
||||||
.write_to_binary(&mut self.file)
|
.write_to_binary(&mut self.file)
|
||||||
@@ -122,7 +126,7 @@ impl UnitigFileWriter {
|
|||||||
};
|
};
|
||||||
self.packed_offsets.push(sentinel);
|
self.packed_offsets.push(sentinel);
|
||||||
|
|
||||||
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets)
|
write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets, self.n_kmers)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize {
|
||||||
@@ -134,10 +138,11 @@ impl UnitigFileWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> {
|
fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize) -> SKResult<()> {
|
||||||
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
|
let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?);
|
||||||
w.write_all(&MAGIC).map_err(SKError::Io)?;
|
w.write_all(&MAGIC).map_err(SKError::Io)?;
|
||||||
w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?;
|
w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?;
|
||||||
|
w.write_all(&(n_kmers as u64).to_le_bytes()).map_err(SKError::Io)?;
|
||||||
w.write_all(seqls).map_err(SKError::Io)?;
|
w.write_all(seqls).map_err(SKError::Io)?;
|
||||||
for &off in packed_offsets {
|
for &off in packed_offsets {
|
||||||
w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
|
w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?;
|
||||||
@@ -155,6 +160,7 @@ pub struct UnitigFileReader {
|
|||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
seqls: Vec<u8>,
|
seqls: Vec<u8>,
|
||||||
packed_offsets: Vec<u32>,
|
packed_offsets: Vec<u32>,
|
||||||
|
n_kmers: usize,
|
||||||
k: usize,
|
k: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -162,9 +168,9 @@ impl UnitigFileReader {
|
|||||||
pub fn open(path: &Path) -> SKResult<Self> {
|
pub fn open(path: &Path) -> SKResult<Self> {
|
||||||
let file = File::open(path).map_err(SKError::Io)?;
|
let file = File::open(path).map_err(SKError::Io)?;
|
||||||
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
|
||||||
let (seqls, packed_offsets) = read_idx(&idx_path(path))?;
|
let (seqls, packed_offsets, n_kmers) = read_idx(&idx_path(path))?;
|
||||||
let k = obikseq::params::k();
|
let k = obikseq::params::k();
|
||||||
Ok(Self { mmap, seqls, packed_offsets, k })
|
Ok(Self { mmap, seqls, packed_offsets, n_kmers, k })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize {
|
||||||
@@ -175,6 +181,11 @@ impl UnitigFileReader {
|
|||||||
self.seqls.is_empty()
|
self.seqls.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Total number of kmers across all chunks.
|
||||||
|
pub fn n_kmers(&self) -> usize {
|
||||||
|
self.n_kmers
|
||||||
|
}
|
||||||
|
|
||||||
/// Return the nucleotide length of chunk `i`.
|
/// Return the nucleotide length of chunk `i`.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn seql(&self, i: usize) -> usize {
|
pub fn seql(&self, i: usize) -> usize {
|
||||||
@@ -206,9 +217,42 @@ impl UnitigFileReader {
|
|||||||
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
|
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
|
||||||
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
|
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterate over all kmers in file order (all positions of chunk 0, then chunk 1, …).
|
||||||
|
///
|
||||||
|
/// Each chunk is copied from the mmap once; iteration within the chunk is
|
||||||
|
/// zero-allocation (sliding-window via [`OwnedPackedSeqKmerIter`]).
|
||||||
|
pub fn iter_kmers(&self) -> impl Iterator<Item = Kmer> + '_ {
|
||||||
|
(0..self.len()).flat_map(move |i| self.unitig(i).into_kmers())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over all canonical kmers in file order.
|
||||||
|
///
|
||||||
|
/// Equivalent to `iter_kmers().map(|km| km.canonical())` but uses the
|
||||||
|
/// built-in canonical iterator on each chunk, which avoids a separate
|
||||||
|
/// canonicalization pass.
|
||||||
|
pub fn iter_canonical_kmers(&self) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||||
|
(0..self.len()).flat_map(move |i| self.unitig(i).into_canonical_kmers())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Iterate over `(kmer, chunk_id, rank)` for every canonical kmer in the file.
|
||||||
|
///
|
||||||
|
/// `chunk_id` is the index of the chunk within this file; `rank` is the
|
||||||
|
/// 0-based position of the kmer within that chunk. Used to build the
|
||||||
|
/// evidence table in `obilayeredmap`.
|
||||||
|
pub fn iter_indexed_canonical_kmers(
|
||||||
|
&self,
|
||||||
|
) -> impl Iterator<Item = (CanonicalKmer, usize, usize)> + '_ {
|
||||||
|
(0..self.len()).flat_map(move |chunk_id| {
|
||||||
|
self.unitig(chunk_id)
|
||||||
|
.into_canonical_kmers()
|
||||||
|
.enumerate()
|
||||||
|
.map(move |(rank, kmer)| (kmer, chunk_id, rank))
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>, usize)> {
|
||||||
let data = std::fs::read(path).map_err(SKError::Io)?;
|
let data = std::fs::read(path).map_err(SKError::Io)?;
|
||||||
let mut pos = 0;
|
let mut pos = 0;
|
||||||
|
|
||||||
@@ -227,6 +271,11 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
|||||||
let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
|
let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize;
|
||||||
pos += 4;
|
pos += 4;
|
||||||
|
|
||||||
|
let nk_bytes = data.get(pos..pos + 8)
|
||||||
|
.ok_or(SKError::Truncated { context: "unitig index: n_kmers" })?;
|
||||||
|
let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize;
|
||||||
|
pos += 8;
|
||||||
|
|
||||||
let seqls = data.get(pos..pos + n)
|
let seqls = data.get(pos..pos + n)
|
||||||
.ok_or(SKError::Truncated { context: "unitig index: seqls" })?
|
.ok_or(SKError::Truncated { context: "unitig index: seqls" })?
|
||||||
.to_vec();
|
.to_vec();
|
||||||
@@ -240,7 +289,7 @@ fn read_idx(path: &Path) -> SKResult<(Vec<u8>, Vec<u32>)> {
|
|||||||
pos += 4;
|
pos += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((seqls, packed_offsets))
|
Ok((seqls, packed_offsets, n_kmers))
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Kmer utilities ────────────────────────────────────────────────────────────
|
// ── Kmer utilities ────────────────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user