refactor: restructure k-mer partitioning pipeline for memory efficiency

Replace in-memory hashing with a disk-backed external merge sort and `PersistentCompactIntVec` to drastically reduce peak RAM usage. Unify both phases using a custom `PtrHash` minimal perfect hash function (MPHF), eliminating `GOFunction` and `boomphf`. Introduce a concrete three-step `count_partition()` pipeline with adaptive chunk sizing based on available system memory. Update dependencies to include `memmap2`, `ptr_hash`, and `obicompactvec`. Additionally, document strict genomics-only memory constraints and enforce an architectural feedback workflow requiring explicit user authorization before structural changes.
This commit is contained in:
Eric Coissac
2026-05-17 15:34:44 +08:00
parent f36b095ce2
commit 4736a7b6de
10 changed files with 230 additions and 114 deletions
+11 -1
View File
@@ -1745,14 +1745,17 @@ dependencies = [
name = "obikpartitionner"
version = "0.1.0"
dependencies = [
"cacheline-ef",
"epserde 0.8.0",
"memmap2",
"niffler 3.0.0",
"obicompactvec",
"obikrope",
"obikseq",
"obiread",
"obiskbuilder",
"obiskio",
"ph",
"ptr_hash",
"rayon",
"remove_dir_all",
"serde",
@@ -1842,6 +1845,13 @@ dependencies = [
"tempfile",
]
[[package]]
name = "obisys"
version = "0.1.0"
dependencies = [
"libc",
]
[[package]]
name = "objc2-core-foundation"
version = "0.3.2"