feat: add kmer iterators and optimize layered map performance

Replace `ph` with `ptr_hash` and introduce `epserde` and `rayon` dependencies. Refactor MPHF construction to leverage parallel iteration, eliminating intermediate `Vec<u64>` allocations and reducing memory footprint. Add an `n_kmers` field to track and serialize total kmer counts, alongside three zero-allocation iterators for efficient chunk traversal. Include comprehensive unit tests for the new iterators and update CLAUDE.md to enforce explicit dependency validation policies.
This commit is contained in:
Eric Coissac
2026-05-12 22:28:01 +08:00
parent 9c41891cc8
commit ff75c9198d
7 changed files with 359 additions and 61 deletions
+145 -8
View File
@@ -314,6 +314,17 @@ dependencies = [
"pkg-config",
]
[[package]]
name = "cacheline-ef"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af737c6c59cb018ecbe6472cbdf86d39c59d78252febfe311953a991b6e4ed85"
dependencies = [
"common_traits 0.11.4",
"epserde 0.8.0",
"mem_dbg",
]
[[package]]
name = "cast"
version = "0.3.0"
@@ -437,6 +448,15 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "colored"
version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "combine"
version = "4.6.7"
@@ -447,6 +467,17 @@ dependencies = [
"memchr",
]
[[package]]
name = "common_traits"
version = "0.11.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fda9ae1f26adcae83adb2e92f69cf59421f2a277a942f49f8e59f2fcbd7cf062"
dependencies = [
"anyhow",
"half",
"impl-tools 0.10.3",
]
[[package]]
name = "common_traits"
version = "0.12.1"
@@ -455,7 +486,7 @@ checksum = "65d0a1296e8d359cb197a8f8289f3d3f77cdb67f1a83d0aeb0820a5b7aea4058"
dependencies = [
"anyhow",
"half",
"impl-tools",
"impl-tools 0.11.4",
]
[[package]]
@@ -751,6 +782,24 @@ dependencies = [
"log",
]
[[package]]
name = "epserde"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c40d342ff20a2ce62d9a85ce406e672dfa137f902ac9670034533184f1533976"
dependencies = [
"anyhow",
"bitflags 2.11.1",
"common_traits 0.11.4",
"epserde-derive 0.8.0",
"maligned",
"mem_dbg",
"mmap-rs",
"sealed",
"thiserror 2.0.18",
"xxhash-rust",
]
[[package]]
name = "epserde"
version = "0.11.5"
@@ -759,8 +808,8 @@ checksum = "d8dffc01a379703ad5178f47a22aa532f5811b3ef45979ccd66b79da9856770b"
dependencies = [
"anyhow",
"bitflags 2.11.1",
"common_traits",
"epserde-derive",
"common_traits 0.12.1",
"epserde-derive 0.11.0",
"mem_dbg",
"mmap-rs",
"sealed",
@@ -768,6 +817,17 @@ dependencies = [
"xxhash-rust",
]
[[package]]
name = "epserde-derive"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac80cc78b69765703f48ad93f33b8919cf5d907cda7459ad6ba2919cbbe605dd"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]]
name = "epserde-derive"
version = "0.11.0"
@@ -903,6 +963,15 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "generic-array"
version = "0.14.7"
@@ -1101,6 +1170,18 @@ dependencies = [
"icu_properties",
]
[[package]]
name = "impl-tools"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ae95c9095c2f1126d7db785955c73cdc5fc33e7c3fa911bd4a42931672029a7"
dependencies = [
"autocfg",
"impl-tools-lib",
"proc-macro-error2",
"syn 2.0.117",
]
[[package]]
name = "impl-tools"
version = "0.11.4"
@@ -1364,6 +1445,12 @@ dependencies = [
"libc",
]
[[package]]
name = "maligned"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e88c3cbe8288f77f293e48a28b3232e3defd203a6d839fa7f68ea4329e83464"
[[package]]
name = "matchers"
version = "0.2.0"
@@ -1380,6 +1467,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0"
dependencies = [
"bitflags 2.11.1",
"maligned",
"mem_dbg-derive",
"mmap-rs",
]
@@ -1653,10 +1741,12 @@ dependencies = [
name = "obilayeredmap"
version = "0.1.0"
dependencies = [
"epserde 0.8.0",
"memmap2",
"obikseq",
"obiskio",
"ph",
"ptr_hash",
"rayon",
"serde",
"serde_json",
"tempfile",
@@ -1807,7 +1897,7 @@ dependencies = [
"binout",
"bitm",
"dyn_size_of",
"epserde",
"epserde 0.11.5",
"mem_dbg",
"rayon",
"seedable_hash",
@@ -2045,6 +2135,37 @@ dependencies = [
"prost",
]
[[package]]
name = "ptr_hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b4e4fb9c4c2ba3e5b060f53ef46afd3de37345b08e3ec0f2c65e0ca1d57ccbd"
dependencies = [
"anyhow",
"bitvec",
"cacheline-ef",
"clap",
"colored",
"common_traits 0.11.4",
"epserde 0.8.0",
"epserde-derive 0.8.0",
"fastrand",
"fxhash",
"itertools 0.14.0",
"lazy_static",
"log",
"mem_dbg",
"rand",
"rand_chacha",
"rayon",
"rdst",
"rustc-hash",
"serde",
"sucds",
"tempfile",
"xxhash-rust",
]
[[package]]
name = "quote"
version = "1.0.45"
@@ -2202,6 +2323,12 @@ version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
[[package]]
name = "rustc-hash"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
[[package]]
name = "rustix"
version = "1.1.4"
@@ -2393,6 +2520,16 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "sucds"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd324eaa05be64f105ea5269bb8aabd70e5dd57fa5c673b167f451b07d6c0dcd"
dependencies = [
"anyhow",
"num-traits",
]
[[package]]
name = "sux"
version = "0.10.3"
@@ -2404,16 +2541,16 @@ dependencies = [
"arbitrary-chunks",
"bitflags 2.11.1",
"clap",
"common_traits",
"common_traits 0.12.1",
"crossbeam-channel",
"derivative",
"derive_setters",
"dsi-progress-logger",
"env_logger",
"epserde",
"epserde 0.11.5",
"fallible-iterator",
"flate2",
"impl-tools",
"impl-tools 0.11.4",
"itertools 0.14.0",
"jiff",
"lambert_w",