diff --git a/.gitignore b/.gitignore index 18703dc..84e7f06 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ data-stress *.fasta *.zst *.zst.meta +*.pb diff --git a/Makefile b/Makefile index 15ab920..e203e6a 100644 --- a/Makefile +++ b/Makefile @@ -25,11 +25,25 @@ $(MKDOCS): $(VENV)/bin/activate # ── obikmer binary ─────────────────────────────────────────────────────────── +PROFILE_FILE := profile.pb +PPROF_PORT := 8081 + .PHONY: obikmer obikmer: cargo build --release --manifest-path $(CARGO_DIR)/Cargo.toml --bin obikmer + +.PHONY: debug debug: cargo build --manifest-path $(CARGO_DIR)/Cargo.toml --bin obikmer + +.PHONY: profile-build +profile-build: + cargo build --release --manifest-path $(CARGO_DIR)/Cargo.toml \ + --bin obikmer --features obikmer/profiling + +.PHONY: profile-view +profile-view: + go tool pprof -http=127.0.0.1:$(PPROF_PORT) $(PROFILE_FILE) # ── documentation ───────────────────────────────────────────────────────────── .PHONY: doc diff --git a/profile.json.gz b/profile.json.gz index 321f0f8..f9691b2 100644 Binary files a/profile.json.gz and b/profile.json.gz differ diff --git a/src/Cargo.lock b/src/Cargo.lock index 59b0d35..db5c270 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + [[package]] name = "adler2" version = "2.0.1" @@ -17,6 +26,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -79,12 +97,42 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "as-slice" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + [[package]] name = "base64" version = "0.22.1" @@ -103,6 +151,12 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.11.1" @@ -121,6 +175,15 @@ dependencies = [ "wyz", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bpaf" version = "0.9.25" @@ -209,6 +272,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "ciborium" version = "0.2.2" @@ -282,6 +351,30 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpp_demangle" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2bb79cb74d735044c972aae58ed0aaa9a837e85b01106a54c39e42e97f62253" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -349,6 +442,44 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "cvt" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ae9bf77fbf2d39ef573205d554d87e86c12f1994e9ea335b0651b9b278bcf1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -394,6 +525,24 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flate2" version = "1.1.9" @@ -425,12 +574,36 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_at" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14af6c9694ea25db25baa2a1788703b9e7c6648dcaeeebeb98f7561b5384c036" +dependencies = [ + "aligned", + "cfg-if", + "cvt", + "libc", + "nix 0.29.0", + "windows-sys 0.52.0", +] + [[package]] name = "funty" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -454,6 +627,12 @@ dependencies = [ "wasip2", ] +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + [[package]] name = "half" version = "2.7.1" @@ -476,6 +655,12 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" + [[package]] name = "heck" version = "0.5.0" @@ -585,6 +770,16 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown 0.17.0", +] + [[package]] name = "infer" version = "0.19.0" @@ -600,6 +795,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.18" @@ -670,6 +874,15 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" @@ -682,7 +895,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown", + "hashbrown 0.15.5", ] [[package]] @@ -700,6 +913,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -710,6 +932,12 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + [[package]] name = "niffler" version = "2.7.0" @@ -739,6 +967,47 @@ dependencies = [ "zstd", ] +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nix" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "cfg_aliases", + "libc", +] + +[[package]] +name = "normpath" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf23ab2b905654b4cb177e30b629937b3868311d4e1cba859f899c041046e69b" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -777,6 +1046,9 @@ dependencies = [ "obipipeline", "obiread", "obiskbuilder", + "pprof", + "tracing", + "tracing-subscriber", ] [[package]] @@ -786,8 +1058,12 @@ dependencies = [ "niffler 3.0.0", "obikseq", "obiskio", + "rayon", + "remove_dir_all", "serde", "serde_json", + "sysinfo", + "tracing", ] [[package]] @@ -844,9 +1120,20 @@ dependencies = [ "niffler 3.0.0", "obikseq", "rustix", + "serde", + "serde_json", "tempfile", ] +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -865,12 +1152,45 @@ version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "percent-encoding" version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -892,6 +1212,40 @@ dependencies = [ "zerovec", ] +[[package]] +name = "pprof" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef5c97c51bd34c7e742402e216abdeb44d415fbe6ae41d56b114723e953711cb" +dependencies = [ + "backtrace", + "cfg-if", + "findshlibs", + "libc", + "log", + "nix 0.26.4", + "once_cell", + "parking_lot", + "prost", + "prost-build", + "prost-derive", + "sha2", + "smallvec", + "symbolic-demangle", + "tempfile", + "thiserror 1.0.69", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -901,6 +1255,59 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost", +] + [[package]] name = "quote" version = "1.0.45" @@ -942,6 +1349,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.11.1", +] + [[package]] name = "regex" version = "1.12.3" @@ -971,6 +1387,20 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "remove_dir_all" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a694f9e0eb3104451127f6cc1e5de55f59d3b1fc8c5ddfaeb6f1e716479ceb4a" +dependencies = [ + "cfg-if", + "cvt", + "fs_at", + "libc", + "normpath", + "windows-sys 0.59.0", +] + [[package]] name = "ring" version = "0.17.14" @@ -985,13 +1415,19 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + [[package]] name = "rustix" version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ - "bitflags", + "bitflags 2.11.1", "errno", "libc", "linux-raw-sys", @@ -1048,6 +1484,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "serde" version = "1.0.228" @@ -1091,6 +1533,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1136,6 +1589,29 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symbolic-common" +version = "12.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "332615d90111d8eeaf86a84dc9bbe9f65d0d8c5cf11b4caccedc37754eb0dcfd" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.18.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "912017718eb4d21930546245af9a3475c9dccf15675a5c215664e76621afc471" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "2.0.117" @@ -1158,6 +1634,20 @@ dependencies = [ "syn", ] +[[package]] +name = "sysinfo" +version = "0.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "rayon", + "windows", +] + [[package]] name = "tap" version = "1.0.1" @@ -1297,6 +1787,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + [[package]] name = "unicode-ident" version = "1.0.24" @@ -1365,6 +1861,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -1453,6 +1955,22 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.11" @@ -1462,12 +1980,71 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-targets", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -1477,6 +2054,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.61.2" diff --git a/src/Cargo.toml b/src/Cargo.toml index a5059b1..0ef9265 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -1,3 +1,5 @@ [workspace] resolver = "3" members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio"] +[profile.release] +debug = 1 diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index b71562e..409ff11 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -16,3 +16,9 @@ obipipeline = { path = "../obipipeline" } clap = { version = "4", features = ["derive"] } obikrope = { path = "../obikrope" } obikpartitionner = { path = "../obikpartitionner" } +tracing = "0.1.44" +tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } +pprof = { version = "0.13", features = ["prost-codec"], optional = true } + +[features] +profiling = ["dep:pprof"] diff --git a/src/obikmer/src/cmd/partition.rs b/src/obikmer/src/cmd/partition.rs index c8318ad..980ecc8 100644 --- a/src/obikmer/src/cmd/partition.rs +++ b/src/obikmer/src/cmd/partition.rs @@ -1,13 +1,13 @@ -use std::io; -use std::path::PathBuf; -use std::sync::{Arc, Mutex}; - use clap::Args; +use obikpartitionner::KmerPartition; use obikrope::Rope; use obikseq::superkmer::SuperKmer; use obipipeline::{WorkerPool, make_pipeline}; use obiskbuilder::SuperKmerIter; -use obikpartitionner::KmerPartition; +use std::io; +use std::path::PathBuf; +use std::sync::{Arc, Mutex}; +use tracing::info; #[derive(Args)] pub struct PartitionArgs { @@ -95,10 +95,10 @@ fn build_superkmers( SuperKmerIter::new(&rope, k, m, level_max, theta).collect() } -fn write_batch(batch: Vec, kp: &Mutex) -> io::Result<()> { +fn write_batch(mut batch: Vec, kp: &Mutex) -> io::Result<()> { kp.lock() .unwrap() - .write_batch(&batch) + .write_batch(&mut batch) .map_err(|e| io::Error::other(e)) } @@ -112,7 +112,10 @@ pub fn run(args: PartitionArgs) { let n_workers = args.threads.max(1); let kp = KmerPartition::create(&args.output, args.partition_bits, k, m, args.force) - .unwrap_or_else(|e| { eprintln!("error: {e}"); std::process::exit(1) }); + .unwrap_or_else(|e| { + eprintln!("error: {e}"); + std::process::exit(1) + }); let kp = Arc::new(Mutex::new(kp)); let kp_sink = Arc::clone(&kp); @@ -130,4 +133,7 @@ pub fn run(args: PartitionArgs) { WorkerPool::new(pipeline, n_workers, 1).run(); kp.lock().unwrap().close().expect("close error"); + + info!("dereplicating..."); + kp.lock().unwrap().dereplicate().expect("dereplicate error"); } diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 1e36e18..06780c4 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -1,6 +1,7 @@ mod cmd; use clap::{Parser, Subcommand}; +use tracing_subscriber::{EnvFilter, fmt}; #[derive(Parser)] #[command(name = "obikmer", about = "DNA k-mer tools")] @@ -18,9 +19,38 @@ enum Commands { } fn main() { + fmt() + .with_env_filter(EnvFilter::from_default_env()) + .with_writer(std::io::stderr) + .init(); + + #[cfg(feature = "profiling")] + let _guard = { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(1000) + .build() + .expect("failed to start pprof profiler"); + guard + }; + let cli = Cli::parse(); match cli.command { Commands::Superkmer(args) => cmd::superkmer::run(args), Commands::Partition(args) => cmd::partition::run(args), } + + #[cfg(feature = "profiling")] + { + use pprof::protos::Message; + if let Ok(report) = _guard.report().build() { + let mut bytes = Vec::new(); + report + .pprof() + .expect("pprof encode failed") + .encode(&mut bytes) + .expect("pprof encode failed"); + std::fs::write("profile.pb", &bytes).expect("cannot write profile.pb"); + eprintln!("profile written to profile.pb"); + } + } } diff --git a/src/obikpartitionner/Cargo.toml b/src/obikpartitionner/Cargo.toml index fc2c846..fe0379a 100644 --- a/src/obikpartitionner/Cargo.toml +++ b/src/obikpartitionner/Cargo.toml @@ -5,7 +5,11 @@ edition = "2024" [dependencies] niffler = "3.0.0" +remove_dir_all = "0.8" obikseq = { path = "../obikseq" } obiskio = { path = "../obiskio" } +rayon = "1" +sysinfo = "0.33" serde = { version = "1", features = ["derive"] } serde_json = "1" +tracing = "0.1.44" diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index 0b71110..14c50fd 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -1,36 +1,41 @@ +use std::collections::HashMap; use std::fs; use std::io; use std::path::{Path, PathBuf}; -use std::sync::{Arc, Mutex}; +use tracing::debug; + +use sysinfo::System; + +use remove_dir_all::remove_dir_all; use niffler::Level; use niffler::send::compression::Format; use obikseq::superkmer::SuperKmer; +use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult}; +use rayon::prelude::*; use serde::{Deserialize, Serialize}; -use obiskio::{SKFilePool, SKFileWriter, SKResult, SharedPool, create_token_with}; const META_FILENAME: &str = "partition.meta"; #[derive(Serialize, Deserialize)] struct PartitionMeta { - n_bits: usize, - kmer_size: usize, + n_bits: usize, + kmer_size: usize, minimizer_size: usize, - format: String, - level: u32, + format: String, + level: u32, } pub struct KmerPartition { - root_path: PathBuf, - n_partitions: usize, + root_path: PathBuf, + n_partitions: usize, partitions_mask: u64, - kmer_size: usize, - minimizer_size: usize, - pool: SharedPool, - writers: Vec>, - format: Format, - level: Level, - closed: bool, + kmer_size: usize, + minimizer_size: usize, + writers: Vec>, + format: Format, + level: Level, + closed: bool, } impl KmerPartition { @@ -41,7 +46,15 @@ impl KmerPartition { minimizer_size: usize, force: bool, ) -> SKResult { - Self::create_with(path, n_bits, kmer_size, minimizer_size, Format::Zstd, Level::Three, force) + Self::create_with( + path, + n_bits, + kmer_size, + minimizer_size, + Format::Zstd, + Level::Three, + force, + ) } pub fn create_with>( @@ -56,18 +69,20 @@ impl KmerPartition { let root_path = path.as_ref().to_owned(); if root_path.exists() { if force { - fs::remove_dir_all(&root_path)?; + remove_dir_all(&root_path)?; } else { return Err(io::Error::new( io::ErrorKind::AlreadyExists, - format!("{}: partition directory already exists", root_path.display()), + format!( + "{}: partition directory already exists", + root_path.display() + ), ) .into()); } } fs::create_dir_all(&root_path)?; let n_partitions = 1usize << n_bits; - let pool = Arc::new(Mutex::new(SKFilePool::from_system_limits())); let writers = (0..n_partitions).map(|_| None).collect(); let partition = Self { root_path, @@ -75,7 +90,6 @@ impl KmerPartition { partitions_mask: (1u64 << n_bits) - 1, kmer_size, minimizer_size, - pool, writers, format, level, @@ -85,16 +99,18 @@ impl KmerPartition { Ok(partition) } - pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> { + pub fn write(&mut self, sk: &mut SuperKmer) -> SKResult<()> { self.check_not_closed()?; let partition = self.partition_of(sk)?; + sk.init_count(); self.ensure_writer(partition)?.write(sk) } - pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> { + pub fn write_batch(&mut self, sks: &mut [SuperKmer]) -> SKResult<()> { self.check_not_closed()?; for sk in sks { let partition = self.partition_of(sk)?; + sk.init_count(); self.ensure_writer(partition)?.write(sk)?; } Ok(()) @@ -127,15 +143,58 @@ impl KmerPartition { &self.root_path } + /// Deduplicate all `raw.{ext}` files in parallel, replacing each with a + /// `dereplicated.{ext}` file where identical canonical sequences are merged + /// and their counts summed. + /// + /// Each partition file is processed in two phases to bound memory use: + /// + /// 1. **Split** — the raw file is scattered into `2^temp_bits` temporary + /// files routed by `hash(canonical_seq) & temp_mask`. Because duplicates + /// always share the same hash, they always land in the same temp file. + /// 2. **Merge** — each temp file is loaded fully into a `HashMap`, counts + /// are accumulated in `u64` (no 24-bit overflow risk), and the result is + /// appended to `dereplicated.{ext}`. + /// + /// If a merged count exceeds the 24-bit header limit, the sequence is + /// emitted as multiple records whose counts sum to the true total. + /// + /// `temp_bits` controls the split fan-out (`2^temp_bits` temp files per + /// partition). Higher values reduce per-temp-file memory at the cost of + /// more temporary file descriptors — all managed by the global fd pool. + pub fn dereplicate(&self) -> SKResult<()> { + let format = self.format; + let level = self.level; + let ext = format_ext(format); + let root = &self.root_path; + let available = System::new_all().available_memory(); + let n_threads = rayon::current_num_threads().max(1) as u64; + let available_per_thread = available / n_threads; + + let results: Vec> = (0..self.n_partitions) + .into_par_iter() + .map(|i| { + let dir = root.join(format!("part_{:05}", i)); + if !dir.exists() { + return Ok(()); + } + let raw_path = dir.join(format!("raw.{ext}")); + let n_buckets = optimal_buckets(&raw_path, available_per_thread); + dereplicate_partition(&dir, ext, format, level, n_buckets) + }) + .collect(); + + for r in results { + r?; + } + Ok(()) + } + // ── private ─────────────────────────────────────────────────────────────── fn check_not_closed(&self) -> SKResult<()> { if self.closed { - Err(io::Error::new( - io::ErrorKind::BrokenPipe, - "write to closed KmerPartition", - ) - .into()) + Err(io::Error::new(io::ErrorKind::BrokenPipe, "write to closed KmerPartition").into()) } else { Ok(()) } @@ -159,14 +218,13 @@ impl KmerPartition { Format::Bzip => "bzip2", Format::Lzma => "lzma", Format::Zstd => "zstd", - Format::No => "none", + Format::No => "none", } .to_owned(), level: u32::from(self.level), }; let f = fs::File::create(self.root_path.join(META_FILENAME))?; - serde_json::to_writer_pretty(f, &meta) - .map_err(|e| io::Error::other(e))?; + serde_json::to_writer_pretty(f, &meta).map_err(|e| io::Error::other(e))?; Ok(()) } @@ -174,21 +232,165 @@ impl KmerPartition { if self.writers[partition].is_none() { let dir = self.root_path.join(format!("part_{:05}", partition)); fs::create_dir_all(&dir)?; - let ext = match self.format { - Format::Gzip => "skmer.gz", - Format::Bzip => "skmer.bz2", - Format::Lzma => "skmer.xz", - Format::Zstd => "skmer.zst", - Format::No => "skmer", - }; + let ext = format_ext(self.format); let file_path = dir.join(format!("raw.{ext}")); - let writer = create_token_with(&self.pool, file_path, self.format, self.level)?; + let writer = SKFileWriter::create_with(file_path, self.format, self.level)?; self.writers[partition] = Some(writer); } Ok(self.writers[partition].as_mut().unwrap()) } } +// ── free helpers ───────────────────────────────────────────────────────────── + +/// Estimate the number of in-memory buckets needed to deduplicate the partition +/// file at `raw_path` given `available_bytes` of free RAM. +/// +/// Memory per HashMap entry: +/// key Box (1 + avg_seq_bytes) + SuperKmer header (4 B) + avg seq bytes + u64 count (8 B), +/// multiplied by 1.5 for hashbrown load-factor overhead. +/// +/// Returns 1 if the partition fits comfortably in memory (no split needed). +/// Always returns a power of two. +/// Remove a SuperKmer file and its sidecar (if present). +fn remove_skmer_file(path: &Path) -> SKResult<()> { + fs::remove_file(path)?; + let sidecar = SKFileMeta::sidecar_path(path); + match fs::remove_file(&sidecar) { + Ok(()) => {} + Err(e) if e.kind() == io::ErrorKind::NotFound => {} + Err(e) => return Err(e.into()), + } + Ok(()) +} + +fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize { + // Use 60 % of available RAM to leave headroom for the rest of the process. + let budget = (available_bytes as f64 * 0.60) as u64; + + let meta = match SKFileMeta::read(raw_path) { + Ok(Some(m)) if m.instances > 0 => m, + _ => return 1, + }; + + let avg_seq_bytes = ((meta.length_sum + meta.instances - 1) / meta.instances + 3) / 4; + // SuperKmer: header (4 B) + Box<[u8]> ptr+len (16 B) + heap seq bytes; value: u64 (8 B); ×1.5 for hashbrown overhead. + let bytes_per_entry = ((4 + 16 + avg_seq_bytes + 8) as f64 * 1.5) as u64; + let estimated = meta.instances * bytes_per_entry; + + if estimated <= budget { + debug!("Dereplication: estimated={estimated} budget={budget} n_temp=1"); + return 1; + } + + // Round up to the next power of two. + let n = (estimated + budget - 1) / budget; + debug!("Dereplication: estimated={estimated} budget={budget} n_temp={n}"); + n.next_power_of_two() as usize +} + +fn format_ext(format: Format) -> &'static str { + match format { + Format::Gzip => "skmer.gz", + Format::Bzip => "skmer.bz2", + Format::Lzma => "skmer.xz", + Format::Zstd => "skmer.zst", + Format::No => "skmer", + } +} + +/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header. +const MAX_SK_COUNT: u64 = (1 << 24) - 1; + +/// Deduplicate one partition directory in place (two-phase split + merge). +fn dereplicate_partition( + dir: &Path, + ext: &str, + format: Format, + level: Level, + n_temp: usize, +) -> SKResult<()> { + let raw_path = dir.join(format!("raw.{ext}")); + if !raw_path.exists() { + return Ok(()); + } + + let out_path = dir.join(format!("dereplicated.{ext}")); + let mut writer = SKFileWriter::create_with(&out_path, format, level)?; + + if n_temp == 1 { + // ── Direct path: partition fits in memory, no split needed ──────────── + let map = load_bucket(&raw_path)?; + remove_skmer_file(&raw_path)?; + flush_map(map, &mut writer)?; + } else { + // ── Phase 1: split raw file into temp buckets ───────────────────────── + let temp_mask = (n_temp as u64) - 1; + let temp_paths: Vec = (0..n_temp) + .map(|j| dir.join(format!("temp_{j:04}.{ext}"))) + .collect(); + + { + let mut writers: Vec = temp_paths + .iter() + .map(|p| SKFileWriter::create_with(p, format, level)) + .collect::>()?; + + let mut reader = SKFileReader::open(&raw_path)?; + while let Some(mut sk) = reader.read()? { + sk.canonical(); + let bucket = (sk.hash() & temp_mask) as usize; + writers[bucket].write(&sk)?; + } + for w in &mut writers { + w.close()?; + } + } + remove_skmer_file(&raw_path)?; + + // ── Phase 2: merge each temp bucket into the output ─────────────────── + for temp_path in &temp_paths { + let map = load_bucket(temp_path)?; + remove_skmer_file(temp_path)?; + flush_map(map, &mut writer)?; + } + } + + writer.close()?; + Ok(()) +} + +/// Read a SuperKmer file into a deduplication map (already canonical). +fn load_bucket(path: &Path) -> SKResult> { + let capacity = SKFileMeta::read(path) + .ok() + .flatten() + .map(|m| m.instances as usize) + .unwrap_or(0); + let mut map: HashMap = HashMap::with_capacity(capacity); + let mut reader = SKFileReader::open(path)?; + while let Some(mut sk) = reader.read()? { + sk.canonical(); + let count = sk.count() as u64; + *map.entry(sk).or_insert(0) += count; + } + Ok(map) +} + +/// Write all entries of a deduplication map to `writer`, splitting oversized counts. +fn flush_map(map: HashMap, writer: &mut SKFileWriter) -> SKResult<()> { + for (mut sk, mut total) in map { + while total > MAX_SK_COUNT { + sk.set_count(MAX_SK_COUNT as u32); + writer.write(&sk)?; + total -= MAX_SK_COUNT; + } + sk.set_count(total as u32); + writer.write(&sk)?; + } + Ok(()) +} + impl Drop for KmerPartition { fn drop(&mut self) { let _ = self.close(); diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs index ff27941..2a39f4e 100644 --- a/src/obikseq/src/superkmer.rs +++ b/src/obikseq/src/superkmer.rs @@ -99,12 +99,31 @@ impl SuperKmerHeader { /// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence. /// Nucleotide 0 is at the MSB of `seq[0]`. Always stored in canonical form. +/// +/// `PartialEq`, `Eq`, and `Hash` compare only sequence content (seql + seq bytes), +/// ignoring the count / minimizer-pos payload — two records with identical sequences +/// but different counts are considered equal. #[derive(Debug, Clone)] pub struct SuperKmer { header: SuperKmerHeader, seq: Box<[u8]>, } +impl PartialEq for SuperKmer { + fn eq(&self, other: &Self) -> bool { + self.header.seql() == other.header.seql() && self.seq == other.seq + } +} + +impl Eq for SuperKmer {} + +impl std::hash::Hash for SuperKmer { + fn hash(&self, state: &mut H) { + self.header.seql().hash(state); + self.seq.hash(state); + } +} + impl SuperKmer { /// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256. pub fn new(seql: u8, seq: Box<[u8]>) -> Self { diff --git a/src/obiskio/Cargo.toml b/src/obiskio/Cargo.toml index c1235a0..75d142c 100644 --- a/src/obiskio/Cargo.toml +++ b/src/obiskio/Cargo.toml @@ -7,6 +7,8 @@ edition = "2024" niffler = "3.0.0" rustix = { version = "1.1.4", features = ["process"] } lru = "0.12" +serde = { version = "1", features = ["derive"] } +serde_json = "1" obikseq = { path = "../obikseq" } diff --git a/src/obiskio/src/lib.rs b/src/obiskio/src/lib.rs index 041cddd..6af659b 100644 --- a/src/obiskio/src/lib.rs +++ b/src/obiskio/src/lib.rs @@ -1,9 +1,11 @@ pub mod codec; pub mod error; pub mod limits; +pub mod meta; pub mod pool; pub mod reader; pub use error::{SKError, SKResult}; +pub use meta::SKFileMeta; pub use pool::{create_token, create_token_with, SKFilePool, SharedPool, SKFileWriter}; pub use reader::{SKFileIter, SKFileReader}; diff --git a/src/obiskio/src/meta.rs b/src/obiskio/src/meta.rs new file mode 100644 index 0000000..a4887d7 --- /dev/null +++ b/src/obiskio/src/meta.rs @@ -0,0 +1,46 @@ +use std::path::{Path, PathBuf}; +use serde::{Deserialize, Serialize}; +use crate::error::SKResult; + +/// Statistics sidecar written alongside each SuperKmer file on close. +/// +/// The sidecar path is `.meta` (e.g. `raw.skmer.zst.meta`). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SKFileMeta { + /// Number of SuperKmer records in the file. + pub instances: u64, + /// Sum of all occurrence counts across records. + pub count_sum: u64, + /// Sum of all sequence lengths (in nucleotides) across records. + pub length_sum: u64, +} + +impl SKFileMeta { + pub(crate) fn zero() -> Self { + Self { instances: 0, count_sum: 0, length_sum: 0 } + } + + /// Path of the sidecar file for a given SuperKmer file path. + pub fn sidecar_path(sk_path: &Path) -> PathBuf { + let mut s = sk_path.as_os_str().to_owned(); + s.push(".meta"); + PathBuf::from(s) + } + + /// Read the sidecar for `sk_path`. Returns `None` if the sidecar is absent. + pub fn read(sk_path: &Path) -> SKResult> { + let path = Self::sidecar_path(sk_path); + match std::fs::File::open(&path) { + Ok(f) => Ok(Some(serde_json::from_reader(f).map_err(std::io::Error::other)?)), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(e) => Err(e.into()), + } + } + + pub(crate) fn write(&self, sk_path: &Path) -> SKResult<()> { + let path = Self::sidecar_path(sk_path); + let f = std::fs::File::create(&path)?; + serde_json::to_writer_pretty(f, self).map_err(std::io::Error::other)?; + Ok(()) + } +} diff --git a/src/obiskio/src/pool.rs b/src/obiskio/src/pool.rs index 3b28441..cb21d72 100644 --- a/src/obiskio/src/pool.rs +++ b/src/obiskio/src/pool.rs @@ -1,6 +1,7 @@ use crate::codec::write_superkmer; use crate::error::SKResult; use crate::limits::max_concurrent_files; +use crate::meta::SKFileMeta; use lru::LruCache; use niffler::send::compression::Format; use niffler::Level; @@ -220,6 +221,7 @@ pub struct SKFileWriter { pending: Vec, flush_threshold: usize, logically_closed: bool, + meta: SKFileMeta, } /// Create a `SKFileWriter` for a new file (Zstd, level 3). @@ -242,6 +244,7 @@ pub fn create_token_with( pending: Vec::with_capacity(DEFAULT_FLUSH_THRESHOLD + 128), flush_threshold: DEFAULT_FLUSH_THRESHOLD, logically_closed: false, + meta: SKFileMeta::zero(), }) } @@ -266,6 +269,9 @@ impl SKFileWriter { pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> { self.check_not_closed()?; write_superkmer(&mut self.pending, sk)?; + self.meta.instances += 1; + self.meta.count_sum += sk.count() as u64; + self.meta.length_sum += sk.seql() as u64; if self.pending.len() >= self.flush_threshold { self.drain()?; } @@ -277,6 +283,9 @@ impl SKFileWriter { self.check_not_closed()?; for sk in sks { write_superkmer(&mut self.pending, sk)?; + self.meta.instances += 1; + self.meta.count_sum += sk.count() as u64; + self.meta.length_sum += sk.seql() as u64; if self.pending.len() >= self.flush_threshold { self.drain()?; } @@ -337,6 +346,7 @@ impl SKFileWriter { w.flush()?; // drop(w) → Zstd frame finalized } + self.meta.write(&self.path)?; Ok(()) }