diff --git a/.gitignore b/.gitignore index 84e7f06..5381ab2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ data-stress *.zst *.zst.meta *.pb +*.json +*.bin diff --git a/debug.log b/debug.log new file mode 100644 index 0000000..4244ba5 --- /dev/null +++ b/debug.log @@ -0,0 +1,275 @@ +2026-04-27T20:36:04.208955Z  INFO obikmer::cmd::partition: dereplicating... +2026-04-27T20:36:11.471489Z  INFO obikpartitionner::partition: counting kmers in partition 0/256 +2026-04-27T20:36:11.471521Z  INFO obikpartitionner::partition: counting kmers in partition 128/256 +2026-04-27T20:36:11.471530Z  INFO obikpartitionner::partition: counting kmers in partition 32/256 +2026-04-27T20:36:11.471535Z  INFO obikpartitionner::partition: counting kmers in partition 12/256 +2026-04-27T20:36:11.471551Z  INFO obikpartitionner::partition: counting kmers in partition 76/256 +2026-04-27T20:36:11.471522Z  INFO obikpartitionner::partition: counting kmers in partition 2/256 +2026-04-27T20:36:11.471525Z  INFO obikpartitionner::partition: counting kmers in partition 4/256 +2026-04-27T20:36:11.471535Z  INFO obikpartitionner::partition: counting kmers in partition 96/256 +2026-04-27T20:36:11.471535Z  INFO obikpartitionner::partition: counting kmers in partition 80/256 +2026-04-27T20:36:11.471535Z  INFO obikpartitionner::partition: counting kmers in partition 10/256 +2026-04-27T20:36:11.471538Z  INFO obikpartitionner::partition: counting kmers in partition 16/256 +2026-04-27T20:36:11.471543Z  INFO obikpartitionner::partition: counting kmers in partition 224/256 +2026-04-27T20:36:11.471537Z  INFO obikpartitionner::partition: counting kmers in partition 192/256 +2026-04-27T20:36:11.471536Z  INFO obikpartitionner::partition: counting kmers in partition 64/256 +2026-04-27T20:36:11.471537Z  INFO obikpartitionner::partition: counting kmers in partition 72/256 +2026-04-27T20:36:11.471530Z  INFO obikpartitionner::partition: counting kmers in partition 8/256 +2026-04-27T20:36:12.831739Z  INFO 
obikpartitionner::partition: counting kmers in partition 129/256 +2026-04-27T20:36:13.555104Z  INFO obikpartitionner::partition: counting kmers in partition 9/256 +2026-04-27T20:36:14.077082Z  INFO obikpartitionner::partition: counting kmers in partition 17/256 +2026-04-27T20:36:14.203310Z  INFO obikpartitionner::partition: counting kmers in partition 77/256 +2026-04-27T20:36:14.316403Z  INFO obikpartitionner::partition: counting kmers in partition 3/256 +2026-04-27T20:36:14.415177Z  INFO obikpartitionner::partition: counting kmers in partition 33/256 +2026-04-27T20:36:14.467689Z  INFO obikpartitionner::partition: counting kmers in partition 97/256 +2026-04-27T20:36:14.559650Z  INFO obikpartitionner::partition: counting kmers in partition 5/256 +2026-04-27T20:36:14.591431Z  INFO obikpartitionner::partition: counting kmers in partition 81/256 +2026-04-27T20:36:14.606508Z  INFO obikpartitionner::partition: counting kmers in partition 193/256 +2026-04-27T20:36:14.611406Z  INFO obikpartitionner::partition: counting kmers in partition 73/256 +2026-04-27T20:36:14.623813Z  INFO obikpartitionner::partition: counting kmers in partition 11/256 +2026-04-27T20:36:14.665058Z  INFO obikpartitionner::partition: counting kmers in partition 65/256 +2026-04-27T20:36:14.666877Z  INFO obikpartitionner::partition: counting kmers in partition 1/256 +2026-04-27T20:36:14.667569Z  INFO obikpartitionner::partition: counting kmers in partition 13/256 +2026-04-27T20:36:14.697486Z  INFO obikpartitionner::partition: counting kmers in partition 225/256 +2026-04-27T20:36:14.698441Z  INFO obikpartitionner::partition: counting kmers in partition 130/256 +2026-04-27T20:36:14.998749Z  INFO obikpartitionner::partition: counting kmers in partition 14/256 +2026-04-27T20:36:15.485303Z  INFO obikpartitionner::partition: counting kmers in partition 18/256 +2026-04-27T20:36:15.637613Z  INFO obikpartitionner::partition: counting kmers in partition 78/256 +2026-04-27T20:36:15.722825Z  INFO 
obikpartitionner::partition: counting kmers in partition 98/256 +2026-04-27T20:36:15.831703Z  INFO obikpartitionner::partition: counting kmers in partition 112/256 +2026-04-27T20:36:16.161408Z  INFO obikpartitionner::partition: counting kmers in partition 66/256 +2026-04-27T20:36:16.340113Z  INFO obikpartitionner::partition: counting kmers in partition 226/256 +2026-04-27T20:36:16.506407Z  INFO obikpartitionner::partition: counting kmers in partition 82/256 +2026-04-27T20:36:16.542430Z  INFO obikpartitionner::partition: counting kmers in partition 74/256 +2026-04-27T20:36:16.550654Z  INFO obikpartitionner::partition: counting kmers in partition 34/256 +2026-04-27T20:36:16.634719Z  INFO obikpartitionner::partition: counting kmers in partition 6/256 +2026-04-27T20:36:16.763833Z  INFO obikpartitionner::partition: counting kmers in partition 194/256 +2026-04-27T20:36:16.813285Z  INFO obikpartitionner::partition: counting kmers in partition 131/256 +2026-04-27T20:36:16.913975Z  INFO obikpartitionner::partition: counting kmers in partition 208/256 +2026-04-27T20:36:16.993526Z  INFO obikpartitionner::partition: counting kmers in partition 15/256 +2026-04-27T20:36:17.092998Z  INFO obikpartitionner::partition: counting kmers in partition 160/256 +2026-04-27T20:36:17.114964Z  INFO obikpartitionner::partition: counting kmers in partition 88/256 +2026-04-27T20:36:17.148685Z  INFO obikpartitionner::partition: counting kmers in partition 79/256 +2026-04-27T20:36:17.161989Z  INFO obikpartitionner::partition: counting kmers in partition 19/256 +2026-04-27T20:36:17.180022Z  INFO obikpartitionner::partition: counting kmers in partition 113/256 +2026-04-27T20:36:17.397037Z  INFO obikpartitionner::partition: counting kmers in partition 67/256 +2026-04-27T20:36:17.484519Z  INFO obikpartitionner::partition: counting kmers in partition 99/256 +2026-04-27T20:36:17.906842Z  INFO obikpartitionner::partition: counting kmers in partition 35/256 +2026-04-27T20:36:18.146262Z  INFO 
obikpartitionner::partition: counting kmers in partition 75/256 +2026-04-27T20:36:18.247130Z  INFO obikpartitionner::partition: counting kmers in partition 7/256 +2026-04-27T20:36:18.303872Z  INFO obikpartitionner::partition: counting kmers in partition 227/256 +2026-04-27T20:36:18.374448Z  INFO obikpartitionner::partition: counting kmers in partition 195/256 +2026-04-27T20:36:18.382279Z  INFO obikpartitionner::partition: counting kmers in partition 83/256 +2026-04-27T20:36:18.428092Z  INFO obikpartitionner::partition: counting kmers in partition 104/256 +2026-04-27T20:36:18.442213Z  INFO obikpartitionner::partition: counting kmers in partition 132/256 +2026-04-27T20:36:18.621883Z  INFO obikpartitionner::partition: counting kmers in partition 209/256 +2026-04-27T20:36:18.784307Z  INFO obikpartitionner::partition: counting kmers in partition 161/256 +2026-04-27T20:36:18.796465Z  INFO obikpartitionner::partition: counting kmers in partition 114/256 +2026-04-27T20:36:18.877587Z  INFO obikpartitionner::partition: counting kmers in partition 120/256 +2026-04-27T20:36:18.888423Z  INFO obikpartitionner::partition: counting kmers in partition 89/256 +2026-04-27T20:36:18.918340Z  INFO obikpartitionner::partition: counting kmers in partition 20/256 +2026-04-27T20:36:18.985341Z  INFO obikpartitionner::partition: counting kmers in partition 100/256 +2026-04-27T20:36:19.080092Z  INFO obikpartitionner::partition: counting kmers in partition 68/256 +2026-04-27T20:36:19.282760Z  INFO obikpartitionner::partition: counting kmers in partition 36/256 +2026-04-27T20:36:19.485079Z  INFO obikpartitionner::partition: counting kmers in partition 92/256 +2026-04-27T20:36:19.692354Z  INFO obikpartitionner::partition: counting kmers in partition 24/256 +2026-04-27T20:36:19.746458Z  INFO obikpartitionner::partition: counting kmers in partition 228/256 +2026-04-27T20:36:20.004054Z  INFO obikpartitionner::partition: counting kmers in partition 84/256 +2026-04-27T20:36:20.164510Z  INFO 
obikpartitionner::partition: counting kmers in partition 196/256 +2026-04-27T20:36:20.277288Z  INFO obikpartitionner::partition: counting kmers in partition 105/256 +2026-04-27T20:36:20.349391Z  INFO obikpartitionner::partition: counting kmers in partition 133/256 +2026-04-27T20:36:20.511981Z  INFO obikpartitionner::partition: counting kmers in partition 210/256 +2026-04-27T20:36:20.709583Z  INFO obikpartitionner::partition: counting kmers in partition 101/256 +2026-04-27T20:36:20.715870Z  INFO obikpartitionner::partition: counting kmers in partition 69/256 +2026-04-27T20:36:20.728029Z  INFO obikpartitionner::partition: counting kmers in partition 115/256 +2026-04-27T20:36:20.822257Z  INFO obikpartitionner::partition: counting kmers in partition 162/256 +2026-04-27T20:36:20.982131Z  INFO obikpartitionner::partition: counting kmers in partition 90/256 +2026-04-27T20:36:20.990122Z  INFO obikpartitionner::partition: counting kmers in partition 21/256 +2026-04-27T20:36:21.030814Z  INFO obikpartitionner::partition: counting kmers in partition 121/256 +2026-04-27T20:36:21.088309Z  INFO obikpartitionner::partition: counting kmers in partition 93/256 +2026-04-27T20:36:21.134904Z  INFO obikpartitionner::partition: counting kmers in partition 37/256 +2026-04-27T20:36:21.196814Z  INFO obikpartitionner::partition: counting kmers in partition 25/256 +2026-04-27T20:36:21.444384Z  INFO obikpartitionner::partition: counting kmers in partition 85/256 +2026-04-27T20:36:21.605343Z  INFO obikpartitionner::partition: counting kmers in partition 197/256 +2026-04-27T20:36:21.954010Z  INFO obikpartitionner::partition: counting kmers in partition 106/256 +2026-04-27T20:36:22.026167Z  INFO obikpartitionner::partition: counting kmers in partition 229/256 +2026-04-27T20:36:22.194676Z  INFO obikpartitionner::partition: counting kmers in partition 134/256 +2026-04-27T20:36:22.209911Z  INFO obikpartitionner::partition: counting kmers in partition 211/256 +2026-04-27T20:36:22.281984Z  INFO 
obikpartitionner::partition: counting kmers in partition 70/256 +2026-04-27T20:36:22.440637Z  INFO obikpartitionner::partition: counting kmers in partition 163/256 +2026-04-27T20:36:22.481750Z  INFO obikpartitionner::partition: counting kmers in partition 91/256 +2026-04-27T20:36:22.522458Z  INFO obikpartitionner::partition: counting kmers in partition 116/256 +2026-04-27T20:36:22.775669Z  INFO obikpartitionner::partition: counting kmers in partition 22/256 +2026-04-27T20:36:23.095766Z  INFO obikpartitionner::partition: counting kmers in partition 102/256 +2026-04-27T20:36:23.119802Z  INFO obikpartitionner::partition: counting kmers in partition 94/256 +2026-04-27T20:36:23.183138Z  INFO obikpartitionner::partition: counting kmers in partition 122/256 +2026-04-27T20:36:23.186904Z  INFO obikpartitionner::partition: counting kmers in partition 38/256 +2026-04-27T20:36:23.196250Z  INFO obikpartitionner::partition: counting kmers in partition 86/256 +2026-04-27T20:36:23.206118Z  INFO obikpartitionner::partition: counting kmers in partition 26/256 +2026-04-27T20:36:23.314520Z  INFO obikpartitionner::partition: counting kmers in partition 198/256 +2026-04-27T20:36:23.355959Z  INFO obikpartitionner::partition: counting kmers in partition 107/256 +2026-04-27T20:36:23.526146Z  INFO obikpartitionner::partition: counting kmers in partition 230/256 +2026-04-27T20:36:23.723572Z  INFO obikpartitionner::partition: counting kmers in partition 212/256 +2026-04-27T20:36:23.856569Z  INFO obikpartitionner::partition: counting kmers in partition 71/256 +2026-04-27T20:36:23.910613Z  INFO obikpartitionner::partition: counting kmers in partition 164/256 +2026-04-27T20:36:23.945560Z  INFO obikpartitionner::partition: counting kmers in partition 135/256 +2026-04-27T20:36:24.085073Z  INFO obikpartitionner::partition: counting kmers in partition 117/256 +2026-04-27T20:36:24.111144Z  INFO obikpartitionner::partition: counting kmers in partition 144/256 +2026-04-27T20:36:24.188547Z  INFO 
obikpartitionner::partition: counting kmers in partition 23/256 +2026-04-27T20:36:24.573867Z  INFO obikpartitionner::partition: counting kmers in partition 103/256 +2026-04-27T20:36:24.604461Z  INFO obikpartitionner::partition: counting kmers in partition 27/256 +2026-04-27T20:36:24.670189Z  INFO obikpartitionner::partition: counting kmers in partition 95/256 +2026-04-27T20:36:24.935767Z  INFO obikpartitionner::partition: counting kmers in partition 123/256 +2026-04-27T20:36:25.031507Z  INFO obikpartitionner::partition: counting kmers in partition 39/256 +2026-04-27T20:36:25.039809Z  INFO obikpartitionner::partition: counting kmers in partition 108/256 +2026-04-27T20:36:25.067950Z  INFO obikpartitionner::partition: counting kmers in partition 199/256 +2026-04-27T20:36:25.149100Z  INFO obikpartitionner::partition: counting kmers in partition 231/256 +2026-04-27T20:36:25.272346Z  INFO obikpartitionner::partition: counting kmers in partition 213/256 +2026-04-27T20:36:25.400111Z  INFO obikpartitionner::partition: counting kmers in partition 165/256 +2026-04-27T20:36:25.607381Z  INFO obikpartitionner::partition: counting kmers in partition 136/256 +2026-04-27T20:36:25.902415Z  INFO obikpartitionner::partition: counting kmers in partition 152/256 +2026-04-27T20:36:26.055400Z  INFO obikpartitionner::partition: counting kmers in partition 118/256 +2026-04-27T20:36:26.138877Z  INFO obikpartitionner::partition: counting kmers in partition 119/256 +2026-04-27T20:36:26.255235Z  INFO obikpartitionner::partition: counting kmers in partition 87/256 +2026-04-27T20:36:26.350364Z  INFO obikpartitionner::partition: counting kmers in partition 148/256 +2026-04-27T20:36:26.465089Z  INFO obikpartitionner::partition: counting kmers in partition 145/256 +2026-04-27T20:36:26.682375Z  INFO obikpartitionner::partition: counting kmers in partition 40/256 +2026-04-27T20:36:26.694729Z  INFO obikpartitionner::partition: counting kmers in partition 216/256 +2026-04-27T20:36:26.777197Z  INFO 
obikpartitionner::partition: counting kmers in partition 28/256 +2026-04-27T20:36:26.802996Z  INFO obikpartitionner::partition: counting kmers in partition 124/256 +2026-04-27T20:36:26.972099Z  INFO obikpartitionner::partition: counting kmers in partition 200/256 +2026-04-27T20:36:27.066713Z  INFO obikpartitionner::partition: counting kmers in partition 232/256 +2026-04-27T20:36:27.112569Z  INFO obikpartitionner::partition: counting kmers in partition 166/256 +2026-04-27T20:36:27.156124Z  INFO obikpartitionner::partition: counting kmers in partition 214/256 +2026-04-27T20:36:27.231214Z  INFO obikpartitionner::partition: counting kmers in partition 137/256 +2026-04-27T20:36:27.359784Z  INFO obikpartitionner::partition: counting kmers in partition 109/256 +2026-04-27T20:36:27.369743Z  INFO obikpartitionner::partition: counting kmers in partition 153/256 +2026-04-27T20:36:27.668409Z  INFO obikpartitionner::partition: counting kmers in partition 204/256 +2026-04-27T20:36:27.701887Z  INFO obikpartitionner::partition: counting kmers in partition 202/256 +2026-04-27T20:36:27.794402Z  INFO obikpartitionner::partition: counting kmers in partition 215/256 +2026-04-27T20:36:28.058391Z  INFO obikpartitionner::partition: counting kmers in partition 149/256 +2026-04-27T20:36:28.192195Z  INFO obikpartitionner::partition: counting kmers in partition 217/256 +2026-04-27T20:36:28.369717Z  INFO obikpartitionner::partition: counting kmers in partition 125/256 +2026-04-27T20:36:28.412339Z  INFO obikpartitionner::partition: counting kmers in partition 41/256 +2026-04-27T20:36:28.496068Z  INFO obikpartitionner::partition: counting kmers in partition 29/256 +2026-04-27T20:36:28.693510Z  INFO obikpartitionner::partition: counting kmers in partition 167/256 +2026-04-27T20:36:28.808495Z  INFO obikpartitionner::partition: counting kmers in partition 201/256 +2026-04-27T20:36:28.843564Z  INFO obikpartitionner::partition: counting kmers in partition 233/256 +2026-04-27T20:36:28.917920Z  INFO 
obikpartitionner::partition: counting kmers in partition 140/256 +2026-04-27T20:36:28.940416Z  INFO obikpartitionner::partition: counting kmers in partition 138/256 +2026-04-27T20:36:29.125956Z  INFO obikpartitionner::partition: counting kmers in partition 110/256 +2026-04-27T20:36:29.197246Z  INFO obikpartitionner::partition: counting kmers in partition 154/256 +2026-04-27T20:36:29.376894Z  INFO obikpartitionner::partition: counting kmers in partition 203/256 +2026-04-27T20:36:29.491887Z  INFO obikpartitionner::partition: counting kmers in partition 176/256 +2026-04-27T20:36:29.501763Z  INFO obikpartitionner::partition: counting kmers in partition 205/256 +2026-04-27T20:36:29.538499Z  INFO obikpartitionner::partition: counting kmers in partition 150/256 +2026-04-27T20:36:29.685646Z  INFO obikpartitionner::partition: counting kmers in partition 218/256 +2026-04-27T20:36:29.874786Z  INFO obikpartitionner::partition: counting kmers in partition 126/256 +2026-04-27T20:36:30.021059Z  INFO obikpartitionner::partition: counting kmers in partition 42/256 +2026-04-27T20:36:30.255780Z  INFO obikpartitionner::partition: counting kmers in partition 240/256 +2026-04-27T20:36:30.602468Z  INFO obikpartitionner::partition: counting kmers in partition 141/256 +2026-04-27T20:36:30.696718Z  INFO obikpartitionner::partition: counting kmers in partition 234/256 +2026-04-27T20:36:30.784522Z  INFO obikpartitionner::partition: counting kmers in partition 139/256 +2026-04-27T20:36:31.027016Z  INFO obikpartitionner::partition: counting kmers in partition 155/256 +2026-04-27T20:36:31.073326Z  INFO obikpartitionner::partition: counting kmers in partition 30/256 +2026-04-27T20:36:31.137264Z  INFO obikpartitionner::partition: counting kmers in partition 111/256 +2026-04-27T20:36:31.143413Z  INFO obikpartitionner::partition: counting kmers in partition 168/256 +2026-04-27T20:36:31.237329Z  INFO obikpartitionner::partition: counting kmers in partition 206/256 +2026-04-27T20:36:31.252908Z  INFO 
obikpartitionner::partition: counting kmers in partition 236/256 +2026-04-27T20:36:31.306110Z  INFO obikpartitionner::partition: counting kmers in partition 177/256 +2026-04-27T20:36:31.431064Z  INFO obikpartitionner::partition: counting kmers in partition 151/256 +2026-04-27T20:36:31.490837Z  INFO obikpartitionner::partition: counting kmers in partition 127/256 +2026-04-27T20:36:31.528815Z  INFO obikpartitionner::partition: counting kmers in partition 43/256 +2026-04-27T20:36:31.790858Z  INFO obikpartitionner::partition: counting kmers in partition 219/256 +2026-04-27T20:36:31.803934Z  INFO obikpartitionner::partition: counting kmers in partition 241/256 +2026-04-27T20:36:32.131051Z  INFO obikpartitionner::partition: counting kmers in partition 156/256 +2026-04-27T20:36:32.433342Z  INFO obikpartitionner::partition: counting kmers in partition 235/256 +2026-04-27T20:36:32.459699Z  INFO obikpartitionner::partition: counting kmers in partition 142/256 +2026-04-27T20:36:32.532717Z  INFO obikpartitionner::partition: counting kmers in partition 31/256 +2026-04-27T20:36:32.814909Z  INFO obikpartitionner::partition: counting kmers in partition 184/256 +2026-04-27T20:36:32.967090Z  INFO obikpartitionner::partition: counting kmers in partition 169/256 +2026-04-27T20:36:33.062660Z  INFO obikpartitionner::partition: counting kmers in partition 178/256 +2026-04-27T20:36:33.076014Z  INFO obikpartitionner::partition: counting kmers in partition 237/256 +2026-04-27T20:36:33.128961Z  INFO obikpartitionner::partition: counting kmers in partition 207/256 +2026-04-27T20:36:33.228734Z  INFO obikpartitionner::partition: counting kmers in partition 180/256 +2026-04-27T20:36:33.228905Z  INFO obikpartitionner::partition: counting kmers in partition 143/256 +2026-04-27T20:36:33.266931Z  INFO obikpartitionner::partition: counting kmers in partition 44/256 +2026-04-27T20:36:33.280333Z  INFO obikpartitionner::partition: counting kmers in partition 158/256 +2026-04-27T20:36:33.368740Z  INFO 
obikpartitionner::partition: counting kmers in partition 220/256 +2026-04-27T20:36:33.373469Z  INFO obikpartitionner::partition: counting kmers in partition 179/256 +2026-04-27T20:36:33.409455Z  INFO obikpartitionner::partition: counting kmers in partition 157/256 +2026-04-27T20:36:33.685036Z  INFO obikpartitionner::partition: counting kmers in partition 242/256 +2026-04-27T20:36:34.174714Z  INFO obikpartitionner::partition: counting kmers in partition 182/256 +2026-04-27T20:36:34.255362Z  INFO obikpartitionner::partition: counting kmers in partition 146/256 +2026-04-27T20:36:34.284053Z  INFO obikpartitionner::partition: counting kmers in partition 170/256 +2026-04-27T20:36:34.340602Z  INFO obikpartitionner::partition: counting kmers in partition 185/256 +2026-04-27T20:36:34.748830Z  INFO obikpartitionner::partition: counting kmers in partition 45/256 +2026-04-27T20:36:34.894515Z  INFO obikpartitionner::partition: counting kmers in partition 172/256 +2026-04-27T20:36:35.342815Z  INFO obikpartitionner::partition: counting kmers in partition 238/256 +2026-04-27T20:36:35.549647Z  INFO obikpartitionner::partition: counting kmers in partition 159/256 +2026-04-27T20:36:35.556431Z  INFO obikpartitionner::partition: counting kmers in partition 181/256 +2026-04-27T20:36:35.579877Z  INFO obikpartitionner::partition: counting kmers in partition 188/256 +2026-04-27T20:36:35.619438Z  INFO obikpartitionner::partition: counting kmers in partition 221/256 +2026-04-27T20:36:35.704344Z  INFO obikpartitionner::partition: counting kmers in partition 174/256 +2026-04-27T20:36:35.891634Z  INFO obikpartitionner::partition: counting kmers in partition 190/256 +2026-04-27T20:36:35.893065Z  INFO obikpartitionner::partition: counting kmers in partition 239/256 +2026-04-27T20:36:35.978053Z  INFO obikpartitionner::partition: counting kmers in partition 183/256 +2026-04-27T20:36:35.980835Z  INFO obikpartitionner::partition: counting kmers in partition 186/256 +2026-04-27T20:36:36.010364Z  INFO 
obikpartitionner::partition: counting kmers in partition 243/256 +2026-04-27T20:36:36.084972Z  INFO obikpartitionner::partition: counting kmers in partition 171/256 +2026-04-27T20:36:36.085806Z  INFO obikpartitionner::partition: counting kmers in partition 147/256 +2026-04-27T20:36:36.129893Z  INFO obikpartitionner::partition: counting kmers in partition 48/256 +2026-04-27T20:36:36.188339Z  INFO obikpartitionner::partition: counting kmers in partition 46/256 +2026-04-27T20:36:36.281975Z  INFO obikpartitionner::partition: counting kmers in partition 173/256 +2026-04-27T20:36:37.004395Z  INFO obikpartitionner::partition: counting kmers in partition 187/256 +2026-04-27T20:36:37.080479Z  INFO obikpartitionner::partition: counting kmers in partition 222/256 +2026-04-27T20:36:37.100573Z  INFO obikpartitionner::partition: counting kmers in partition 189/256 +2026-04-27T20:36:37.422152Z  INFO obikpartitionner::partition: counting kmers in partition 248/256 +2026-04-27T20:36:37.446786Z  INFO obikpartitionner::partition: counting kmers in partition 175/256 +2026-04-27T20:36:37.951518Z  INFO obikpartitionner::partition: counting kmers in partition 244/256 +2026-04-27T20:36:38.235613Z  INFO obikpartitionner::partition: counting kmers in partition 191/256 +2026-04-27T20:36:38.259288Z  INFO obikpartitionner::partition: counting kmers in partition 47/256 +2026-04-27T20:36:38.379966Z  INFO obikpartitionner::partition: counting kmers in partition 252/256 +2026-04-27T20:36:38.380690Z  INFO obikpartitionner::partition: counting kmers in partition 223/256 +2026-04-27T20:36:38.520398Z  INFO obikpartitionner::partition: counting kmers in partition 56/256 +2026-04-27T20:36:38.598256Z  INFO obikpartitionner::partition: counting kmers in partition 49/256 +2026-04-27T20:36:38.680704Z  INFO obikpartitionner::partition: counting kmers in partition 254/256 +2026-04-27T20:36:38.681038Z  INFO obikpartitionner::partition: counting kmers in partition 246/256 +2026-04-27T20:36:38.813533Z  INFO 
obikpartitionner::partition: counting kmers in partition 247/256 +2026-04-27T20:36:38.820199Z  INFO obikpartitionner::partition: counting kmers in partition 60/256 +2026-04-27T20:36:38.961240Z  INFO obikpartitionner::partition: counting kmers in partition 52/256 +2026-04-27T20:36:39.001585Z  INFO obikpartitionner::partition: counting kmers in partition 58/256 +2026-04-27T20:36:39.050047Z  INFO obikpartitionner::partition: counting kmers in partition 249/256 +2026-04-27T20:36:39.076449Z  INFO obikpartitionner::partition: counting kmers in partition 57/256 +2026-04-27T20:36:39.078254Z  INFO obikpartitionner::partition: counting kmers in partition 250/256 +2026-04-27T20:36:39.082390Z  INFO obikpartitionner::partition: counting kmers in partition 251/256 +2026-04-27T20:36:39.082697Z  INFO obikpartitionner::partition: counting kmers in partition 245/256 +2026-04-27T20:36:39.250465Z  INFO obikpartitionner::partition: counting kmers in partition 50/256 +2026-04-27T20:36:39.448799Z  INFO obikpartitionner::partition: counting kmers in partition 54/256 +2026-04-27T20:36:39.449166Z  INFO obikpartitionner::partition: counting kmers in partition 62/256 +2026-04-27T20:36:39.450954Z  INFO obikpartitionner::partition: counting kmers in partition 63/256 +2026-04-27T20:36:39.766480Z  INFO obikpartitionner::partition: counting kmers in partition 253/256 +2026-04-27T20:36:39.838346Z  INFO obikpartitionner::partition: counting kmers in partition 59/256 +2026-04-27T20:36:39.875987Z  INFO obikpartitionner::partition: counting kmers in partition 255/256 +2026-04-27T20:36:39.876798Z  INFO obikpartitionner::partition: counting kmers in partition 55/256 +2026-04-27T20:36:39.877292Z  INFO obikpartitionner::partition: counting kmers in partition 51/256 +2026-04-27T20:36:40.244598Z  INFO obikpartitionner::partition: counting kmers in partition 61/256 +2026-04-27T20:36:40.245589Z  INFO obikpartitionner::partition: counting kmers in partition 53/256 + 266.84 real 3776.35 user 115.80 sys + 
2423504896 maximum resident set size + 0 average shared memory size + 0 average unshared data size + 0 average unshared stack size + 292303 page reclaims + 132311 page faults + 0 swaps + 0 block input operations + 0 block output operations + 0 messages sent + 0 messages received + 0 signals received + 4462 voluntary context switches + 1991864 involuntary context switches + 54933227607056 instructions retired + 11271230983971 cycles elapsed + 2210777320 peak memory footprint diff --git a/docmd/implementation/mphf.md b/docmd/implementation/mphf.md index 316308f..b7a6b05 100644 --- a/docmd/implementation/mphf.md +++ b/docmd/implementation/mphf.md @@ -1,6 +1,29 @@ -# MPHF selection — analysis in progress +# MPHF selection — two-phase indexing architecture -The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated. +## Indexing architecture + +Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of unique kmers in a partition is not known until after counting and filtering. + +### Superkmer vs kmer counts + +The `SKFileMeta` sidecar written by `SKFileWriter` records `instances` (unique superkmers) and `length_sum` (total nucleotides). A superkmer of length L contains L − k + 1 kmers, so the kmer count per partition can be estimated as `length_sum − instances × (k − 1)`. This is an **overestimate** of unique kmers: two distinct superkmers (different flanking contexts, same minimizer) can share kmers. The exact count of unique kmers is only known after enumerating and deduplicating them. + +Note: two superkmers sharing a kmer necessarily share the same minimizer and therefore always land in the same partition — no kmer can appear in two different partitions. + +### Phase 1 — provisional index and spectrum + +1. Enumerate all kmers from the dereplicated superkmers of the partition. +2. 
Build a provisional MPHF over this key set; capacity is pre-allocated from the sidecar estimate (slight overestimate, harmless). +3. Accumulate counts: for each kmer in each superkmer, `count[MPHF(kmer)] += sk.count()`. +4. Compute the kmer frequency spectrum (histogram: occurrences → number of kmers). +5. Apply count filter (e.g. discard singletons). After filtering, the exact number of surviving kmers is known. +6. Discard the provisional MPHF. + +### Phase 2 — definitive index + +Build a new MPHF over the filtered kmer set only, with the exact key count available. This is the persistent per-partition index used for all downstream operations (queries, set operations). + +--- ## Candidates @@ -8,31 +31,41 @@ The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Thre - ~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem) - Parallel construction; well-tested with DNA kmer data at scale -- Drawback: largest space footprint of the three +- Drawback: largest space footprint; streaming construction (no exact count needed) was its main differentiator — irrelevant here since exact count is available at phase 2 **ptr_hash** (PtrHash algorithm, Groot Koerkamp, SEA 2025): - ~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×) -- Theoretical foundation solid; paper and Rust crate from the same author +- Requires exact key count at construction — available at phase 2 - Drawback: published February 2025 — very young, no production track record **FMPHGO** (`ph` crate, Beling, ACM JEA 2023): - ~2.1 bits/key — most compact of the three; good query speed; parallelisable construction - More established than ptr_hash; actively maintained -- Currently preferred candidate +- Works well with overestimated capacity → natural fit for phase 1 + +## MPHF choice per phase + +**Phase 1** (provisional, discarded after spectrum computation): FMPHGO. 
Tolerates overestimated capacity, compact, no need to optimise for query speed on a temporary structure. + +**Phase 2** (persistent, queried repeatedly): open between FMPHGO and ptr_hash. Exact key count is available, so both operate optimally. ptr_hash's query speed advantage (2.1–3.3×) is meaningful for the persistent index but carries the risk of a very young crate. FMPHGO is the conservative default; ptr_hash is worth revisiting once it has broader production use. + +boomphf is effectively eliminated: its space overhead is the largest and its streaming-construction advantage does not apply here. + +--- ## Space at scale -For 1 024 partitions × 100 M kmers/partition: +For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering): -| MPHF | bits/key | Total MPHF size | -|---------|----------|-----------------| -| boomphf | 3.7 | ~47 GB | -| ptr_hash | 2.4 | ~31 GB | -| FMPHGO | 2.1 | ~27 GB | +| MPHF | bits/key | Total MPHF size | +|----------|----------|-----------------| +| boomphf | 3.7 | ~47 GB | +| ptr_hash | 2.4 | ~31 GB | +| FMPHGO | 2.1 | ~27 GB | -In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 3–30 M kmers → 1–8 MB per MPHF, well within RAM. +For a human genome at 30× coverage with 1 024 partitions, realistic partition sizes are 3–30 M unique kmers → 1–8 MB per phase-2 MPHF, well within RAM. ## On-disk and mmap considerations @@ -42,7 +75,7 @@ No established Rust crate provides a natively on-disk MPHF. **SSHash** (Sparse a ## Open questions -- Confirm actual partition sizes on representative metagenomic datasets before fixing the choice. -- Evaluate whether ptr_hash's query speed advantage (2.1–3.3×) justifies adopting a crate that is less than a year old. -- Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary. +- Confirm actual partition sizes and overestimation factor on representative metagenomic datasets. 
+- Revisit ptr_hash for phase 2 once the crate has a broader production track record. +- Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary for the persistent index. - Keep SSHash in mind if the indexing architecture is reconsidered at a higher level. diff --git a/scripts/plot_spectrum.py b/scripts/plot_spectrum.py new file mode 100644 index 0000000..2c7a37e --- /dev/null +++ b/scripts/plot_spectrum.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Plot kmer frequency spectrum from kmer_spectrum_raw.json (log-log scale).""" + +import json +import sys +import argparse +import matplotlib.pyplot as plt + + +def load_spectrum(path: str): + with open(path) as f: + data = json.load(f) + spectrum = data["spectrum"] + pairs = sorted((int(k), v) for k, v in spectrum.items() if v > 0) + counts = [x for x, _ in pairs] + freqs = [y for _, y in pairs] + return counts, freqs, data.get("f0"), data.get("f1") + + +def main(): + parser = argparse.ArgumentParser(description="Log-log kmer spectrum plot") + parser.add_argument("spectrum", nargs="?", default="kmer_spectrum_raw.json", + help="Path to kmer_spectrum_raw.json") + parser.add_argument("-o", "--output", default=None, + help="Save figure to file (PNG/SVG/PDF). 
Omit to display interactively.") + parser.add_argument("--max-count", type=int, default=None, + help="Truncate x-axis at this count value") + args = parser.parse_args() + + counts, freqs, f0, f1 = load_spectrum(args.spectrum) + + if args.max_count: + pairs = [(c, f) for c, f in zip(counts, freqs) if c <= args.max_count] + counts, freqs = zip(*pairs) if pairs else ([], []) + + fig, ax = plt.subplots(figsize=(9, 5)) + ax.plot(counts, freqs, ".", markersize=2, linewidth=0.6, color="steelblue") + ax.set_xscale("log") + ax.set_yscale("log") + ax.set_xlabel("Repetition degree (count)", fontsize=12) + ax.set_ylabel("Number of distinct k-mers", fontsize=12) + ax.set_title("K-mer frequency spectrum", fontsize=13) + + info = [] + if f0 is not None: + info.append(f"F₀ = {f0:,}") + if f1 is not None: + info.append(f"F₁ = {f1:,}") + if info: + ax.text(0.98, 0.97, " ".join(info), transform=ax.transAxes, + ha="right", va="top", fontsize=9, color="gray") + + ax.grid(True, which="both", linestyle="--", linewidth=0.4, alpha=0.6) + fig.tight_layout() + + if args.output: + fig.savefig(args.output, dpi=150) + print(f"Saved to {args.output}", file=sys.stderr) + else: + plt.show() + + +if __name__ == "__main__": + main() diff --git a/src/Cargo.lock b/src/Cargo.lock index db5c270..9e4a003 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -35,12 +35,39 @@ dependencies = [ "as-slice", ] +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "ambassador" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e68de4cdc6006162265d0957edb4a860fe4e711b1dc17a5746fd95f952f08285" +dependencies = [ + "itertools 0.10.5", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anes" version = "0.2.1" @@ -103,6 +130,18 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "arbitrary-chunks" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ad8689a486416c401ea15715a4694de30054248ec627edbf31f49cb64ee4086" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "as-slice" version = "0.2.1" @@ -130,7 +169,7 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-link", + "windows-link 0.2.1", ] [[package]] @@ -151,6 +190,12 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "binout" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "222fb4925a15bea6a68075021910e03d6aa2d04951d71ff1d956190a551d738f" + [[package]] name = "bitflags" version = "1.3.2" @@ -163,6 +208,15 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "bitm" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31468ea4a856000d83cb61960dfdc2980ecd96b15b61321c8c76cc96aea6e688" +dependencies = [ + "dyn_size_of", +] + [[package]] name = "bitvec" version = "1.0.1" @@ -184,6 +238,16 @@ dependencies = [ "generic-array", ] 
+[[package]] +name = "block-pseudorand" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2097358495d244a0643746f4d13eedba4608137008cf9dec54e53a3b700115a6" +dependencies = [ + "chiapos-chacha8", + "nanorand", +] + [[package]] name = "bpaf" version = "0.9.25" @@ -278,6 +342,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chiapos-chacha8" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33f8be573a85f6c2bc1b8e43834c07e32f95e489b914bf856c0549c3c269cd0a" +dependencies = [ + "rayon", +] + [[package]] name = "ciborium" version = "0.2.2" @@ -336,7 +409,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -351,6 +424,27 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "common_traits" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65d0a1296e8d359cb197a8f8289f3d3f77cdb67f1a83d0aeb0820a5b7aea4058" +dependencies = [ + "anyhow", + "half", + "impl-tools", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -384,13 +478,49 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes 0.1.6", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools 
0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + [[package]] name = "criterion2" version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "861a56bb48e3ba7a2a38580a91577e17d90db946649f5c342fd74ba864180def" dependencies = [ - "anes", + "anes 0.2.1", "bpaf", "cast", "ciborium", @@ -461,6 +591,41 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.117", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.117", +] + [[package]] name = "debugid" version = "0.8.0" @@ -470,6 +635,29 @@ dependencies = [ "uuid", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_setters" +version = "0.1.9" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7e6f6fa1f03c14ae082120b84b3c7fbd7b8588d924cf2d7c3daf9afd49df8b9" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "digest" version = "0.10.7" @@ -488,15 +676,117 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] +[[package]] +name = "dsi-progress-logger" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3969f942da74913b951e19784a2c0b61a2b0d11c2887f6eb3b9b34775b31bf" +dependencies = [ + "log", + "num-format", + "pluralizer", + "sysinfo 0.36.1", +] + +[[package]] +name = "dyn_size_of" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a742b95783b1f45b900129082cbc47717b6a77ee8d17eea70a8ea62462f5de3" + [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "env_filter" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e90c2accc4b07a8456ea0debdc2e7587bdd890680d71173a15d4ae604f6eef" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "jiff", + "log", +] + +[[package]] +name = "epserde" +version = "0.11.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8dffc01a379703ad5178f47a22aa532f5811b3ef45979ccd66b79da9856770b" +dependencies = [ + "anyhow", + "bitflags 2.11.1", + "common_traits", + "epserde-derive", + "mem_dbg", + "mmap-rs", + "sealed", + "thiserror 2.0.18", + "xxhash-rust", +] + +[[package]] +name = "epserde-derive" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fc2ceb99084df049085a5bdd15e3b2f7275111e2b9029f95fb01a3a06cf1b13" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -513,6 +803,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + [[package]] name = "fastrand" version = "2.4.1" @@ -667,6 +963,12 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "icu_collections" version = "2.2.0" @@ -749,6 +1051,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "ident_case" 
+version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.1.0" @@ -770,6 +1078,30 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "impl-tools" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae314a99afb5821e2fda288387546d4a04aace674551e854e6216b892ec3208" +dependencies = [ + "autocfg", + "impl-tools-lib", + "proc-macro-error2", + "syn 2.0.117", +] + +[[package]] +name = "impl-tools-lib" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab699036df31c1f7d3561bfa6e9cb9bc3bb0fd2e2cd9bf121c31cb961d049ddf" +dependencies = [ + "proc-macro-error2", + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "indexmap" version = "2.14.0" @@ -789,12 +1121,32 @@ dependencies = [ "cfb", ] +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.61.2", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.12.1" @@ -804,12 +1156,62 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] 
+ [[package]] name = "itoa" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "jiff" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f00b5dbd620d61dfdcb6007c9c1f6054ebd75319f163d886a9055cec1155073d" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e000de030ff8022ea1da3f466fbb0f3a809f5e51ed31f6dd931c35181ad8e6d7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + [[package]] name = "jobserver" version = "0.1.34" @@ -830,12 +1232,44 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lambert_w" +version = "1.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5f0846ee4f0299ca4c5b9ca06ff55cf88b3430a763bf591474cc734479c9b24" +dependencies = [ + "num-complex", + "num-traits", +] + [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lender" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c150e24afda8f769930a339cb5ad6e182101fdf1165c30c157b33ce5050fd7ad" +dependencies = [ + "fallible-iterator", + "lender-derive", + "stable_try_trait_v2", +] + +[[package]] +name = "lender-derive" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d074a297c82222d442171bad4f392fef93d35fb31e24a115f605a0c907ce0af9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "libc" version = "0.2.185" @@ -898,6 +1332,15 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "mach2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d640282b302c0bb0a2a8e0233ead9035e3bed871f0b7e81fe4a1ec829765db44" +dependencies = [ + "libc", +] + [[package]] name = "matchers" version = "0.2.0" @@ -907,6 +1350,28 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "mem_dbg" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0" +dependencies = [ + "bitflags 2.11.1", + "mem_dbg-derive", + "mmap-rs", +] + +[[package]] +name = "mem_dbg-derive" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84f40c93b0508d5565db79a814d02d5b2545967205ce44be211592aafa34d6c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "memchr" version = "2.8.0" @@ -922,6 +1387,15 @@ dependencies = [ "libc", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "miniz_oxide" version = "0.8.9" @@ -932,12 +1406,35 @@ dependencies = [ "simd-adler32", ] +[[package]] +name = "mmap-rs" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"86968d85441db75203c34deefd0c88032f275aaa85cee19a1dcfff6ae9df56da" +dependencies = [ + "bitflags 1.3.2", + "combine", + "libc", + "mach2", + "nix 0.26.4", + "sysctl", + "thiserror 1.0.69", + "widestring", + "windows 0.48.0", +] + [[package]] name = "multimap" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "nanorand" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "729eb334247daa1803e0a094d0a5c55711b85571179f5ec6e53eccfdf7008958" + [[package]] name = "niffler" version = "2.7.0" @@ -976,6 +1473,8 @@ dependencies = [ "bitflags 1.3.2", "cfg-if", "libc", + "memoffset", + "pin-utils", ] [[package]] @@ -1017,6 +1516,25 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec", + "itoa", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1055,14 +1573,16 @@ dependencies = [ name = "obikpartitionner" version = "0.1.0" dependencies = [ + "memmap2", "niffler 3.0.0", "obikseq", "obiskio", + "ph", "rayon", "remove_dir_all", "serde", "serde_json", - "sysinfo", + "sysinfo 0.33.1", "tracing", ] @@ -1125,6 +1645,25 @@ dependencies = [ "tempfile", ] +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags 2.11.1", +] + +[[package]] +name = "objc2-io-kit" +version = 
"0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" +dependencies = [ + "libc", + "objc2-core-foundation", +] + [[package]] name = "object" version = "0.37.3" @@ -1172,9 +1711,15 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-link", + "windows-link 0.2.1", ] +[[package]] +name = "partition" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "947f833aaa585cf12b8ec7c0476c98784c49f33b861376ffc84ed92adebf2aba" + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1191,18 +1736,96 @@ dependencies = [ "indexmap", ] +[[package]] +name = "ph" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a447f203c7254ebb06aa4f111480e821debb361e80fb4d760554a8460f236550" +dependencies = [ + "aligned-vec", + "arrayvec", + "binout", + "bitm", + "dyn_size_of", + "epserde", + "mem_dbg", + "rayon", + "seedable_hash", + "sux", + "voracious_radix_sort", +] + [[package]] name = "pin-project-lite" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = 
"plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "pluralizer" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b3eba432a00a1f6c16f39147847a870e94e2e9b992759b503e330efec778cbe" +dependencies = [ + "once_cell", + "regex", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a106d1259c23fac8e543272398ae0e3c0b8d33c88ed73d0cc71b0f1d902618" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -1236,6 +1859,15 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + [[package]] name = "prettyplease" version = "0.2.37" @@ -1243,7 +1875,52 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + 
"proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", ] [[package]] @@ -1273,7 +1950,7 @@ checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", "heck", - "itertools", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -1282,7 +1959,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn", + "syn 2.0.117", "tempfile", ] @@ -1293,10 +1970,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" dependencies = [ "anyhow", - "itertools", + "itertools 0.12.1", "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1329,6 +2006,35 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = 
"rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "rayon" version = "1.12.0" @@ -1349,6 +2055,21 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rdst" +version = "0.20.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e7970b4e577b76a96d5e56b5f6662b66d1a4e1f5bb026ee118fc31b373c2752" +dependencies = [ + "arbitrary-chunks", + "block-pseudorand", + "criterion", + "partition", + "rayon", + "tikv-jemallocator", + "voracious_radix_sort", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1490,6 +2211,23 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sealed" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22f968c5ea23d555e670b449c1c5e7b2fc399fdaec1d304a17cd48e288abc107" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "seedable_hash" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba2a159211665e29bbf5a2fbb42da50dd6eadff23eef7a6a7ae4a9b0a7cd0152" + [[package]] name = "serde" version = "1.0.228" @@ -1517,7 +2255,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1577,6 +2315,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stable_try_trait_v2" +version = "1.75.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c4e48411f4db8ccca0470bfb67e3bb821af4227d455aa147917d8d109be0d13" + [[package]] name = "strsim" version = "0.11.1" @@ -1589,6 +2333,47 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "sux" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29949eff4a64e05149a5147f1695fd8576c990504a217f28391c09e007d831dc" +dependencies = [ + "ambassador", + "anyhow", + "arbitrary-chunks", + "bitflags 2.11.1", + "clap", + "common_traits", + "crossbeam-channel", + "derivative", + "derive_setters", + "dsi-progress-logger", + "env_logger", + "epserde", + "fallible-iterator", + "flate2", + "impl-tools", + "itertools 0.14.0", + "jiff", + "lambert_w", + "lender", + "libc", + "log", + "mem_dbg", + "rand", + "rayon", + "rdst", + "sync-cell-slice", + "tempfile", + "thiserror 2.0.18", + "thread-priority", + "value-traits", + "xxhash-rust", + "zerocopy", + "zstd", +] + [[package]] name = "symbolic-common" version = "12.18.3" @@ -1612,6 +2397,17 @@ dependencies = [ "symbolic-common", ] +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.117" @@ -1623,6 +2419,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync-cell-slice" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cf9ace719a3856838781680d5d677c612e01a0bc0b7b1ded355057ca5015997" + [[package]] name = "synstructure" version = "0.13.2" @@ 
-1631,7 +2433,21 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "sysctl" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea" +dependencies = [ + "bitflags 2.11.1", + "byteorder", + "enum-as-inner", + "libc", + "thiserror 1.0.69", + "walkdir", ] [[package]] @@ -1645,7 +2461,21 @@ dependencies = [ "memchr", "ntapi", "rayon", - "windows", + "windows 0.57.0", +] + +[[package]] +name = "sysinfo" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "252800745060e7b9ffb7b2badbd8b31cfa4aa2e61af879d0a3bf2a317c20217d" +dependencies = [ + "libc", + "memchr", + "ntapi", + "objc2-core-foundation", + "objc2-io-kit", + "windows 0.61.3", ] [[package]] @@ -1693,7 +2523,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1704,7 +2534,21 @@ checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "thread-priority" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2210811179577da3d54eb69ab0b50490ee40491a25d95b8c6011ba40771cb721" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "libc", + "log", + "rustversion", + "windows 0.61.3", ] [[package]] @@ -1716,6 +2560,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tikv-jemalloc-sys" +version = "0.5.4+5.3.0-patched" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.5.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "tinystr" version = "0.8.3" @@ -1726,6 +2590,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tracing" version = "0.1.44" @@ -1745,7 +2619,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -1861,12 +2735,41 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "value-traits" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17e73c7053d8fa8e9c3c6b16c32d079ed5642a7156514820486a9c4e109cf48d" +dependencies = [ + "value-traits-derive", +] + +[[package]] +name = "value-traits-derive" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d301d1ee4b3eced3e73aa5740a303c7e068f1d4450c5dae4c8cf6bfa266954f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "voracious_radix_sort" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446e7ffcb6c27a71d05af7e51ef2ee5b71c48424b122a832f2439651e1914899" +dependencies = [ + "rayon", +] + [[package]] name = "walkdir" version = "2.5.0" @@ -1924,7 +2827,7 @@ dependencies = 
[ "bumpalo", "proc-macro2", "quote", - "syn", + "syn 2.0.117", "wasm-bindgen-shared", ] @@ -1937,6 +2840,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web-sys" +version = "0.3.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2dfbb17949fa2088e5d39408c48368947b86f7834484e87b73de55bc14d97d" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "webpki-roots" version = "0.26.11" @@ -1955,6 +2868,12 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "widestring" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471" + [[package]] name = "winapi" version = "0.3.9" @@ -1986,14 +2905,45 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows" version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" dependencies = [ - "windows-core", - "windows-targets", + "windows-core 0.57.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows" +version = "0.61.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" +dependencies = [ + "windows-collections", + "windows-core 0.61.2", + "windows-future", + "windows-link 0.1.3", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" +dependencies = [ + "windows-core 0.61.2", ] [[package]] @@ -2002,10 +2952,34 @@ version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" dependencies = [ - "windows-implement", - "windows-interface", - "windows-result", - "windows-targets", + "windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result 0.1.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" +dependencies = [ + "windows-implement 0.60.2", + "windows-interface 0.59.3", + "windows-link 0.1.3", + "windows-result 0.3.4", + "windows-strings", +] + +[[package]] +name = "windows-future" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" +dependencies = [ + "windows-core 0.61.2", + "windows-link 0.1.3", + "windows-threading", ] [[package]] @@ -2016,7 +2990,18 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", ] [[package]] @@ -2027,22 +3012,67 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-numerics" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" +dependencies = [ + "windows-core 0.61.2", + "windows-link 0.1.3", +] + [[package]] name = "windows-result" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +dependencies = [ + "windows-link 0.1.3", +] + +[[package]] +name = "windows-strings" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +dependencies = [ + "windows-link 0.1.3", ] [[package]] @@ -2051,7 +3081,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2060,7 +3090,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2069,7 +3099,22 @@ version = "0.61.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ - "windows-link", + "windows-link 0.2.1", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -2078,28 +3123,55 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-threading" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6" +dependencies = [ + "windows-link 0.1.3", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = 
"windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2112,24 +3184,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -2182,7 +3278,7 @@ checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -2203,7 +3299,7 @@ checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] @@ -2223,7 +3319,7 @@ checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", "synstructure", ] @@ -2263,7 +3359,7 @@ checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.117", ] [[package]] diff --git a/src/obikmer/src/cmd/count.rs b/src/obikmer/src/cmd/count.rs new file mode 100644 index 0000000..ed83eda --- /dev/null +++ b/src/obikmer/src/cmd/count.rs @@ -0,0 +1,24 @@ +use clap::Args; +use obikpartitionner::KmerPartition; +use std::path::PathBuf; +use tracing::info; + +#[derive(Args)] +pub struct CountArgs { + /// Partition directory produced by the `partition` command + #[arg(short, long)] + pub partition: PathBuf, +} + +pub fn run(args: CountArgs) { + let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| { + eprintln!("error: {e}"); + std::process::exit(1) + }); + + info!("counting kmers in {}", 
args.partition.display()); + kp.count_kmer().unwrap_or_else(|e| { + eprintln!("error: {e}"); + std::process::exit(1) + }); +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index e259487..a0098df 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,2 +1,3 @@ +pub mod count; pub mod partition; pub mod superkmer; diff --git a/src/obikmer/src/cmd/partition.rs b/src/obikmer/src/cmd/partition.rs index 980ecc8..84d28f4 100644 --- a/src/obikmer/src/cmd/partition.rs +++ b/src/obikmer/src/cmd/partition.rs @@ -136,4 +136,5 @@ pub fn run(args: PartitionArgs) { info!("dereplicating..."); kp.lock().unwrap().dereplicate().expect("dereplicate error"); + kp.lock().unwrap().count_kmer().expect("count kmer error"); } diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 06780c4..a780237 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -16,6 +16,8 @@ enum Commands { Superkmer(cmd::superkmer::SuperkmerArgs), /// Partition super-kmers on disk by minimizer Partition(cmd::partition::PartitionArgs), + /// Count kmers from an existing dereplicated partition directory + Count(cmd::count::CountArgs), } fn main() { @@ -37,6 +39,7 @@ fn main() { match cli.command { Commands::Superkmer(args) => cmd::superkmer::run(args), Commands::Partition(args) => cmd::partition::run(args), + Commands::Count(args) => cmd::count::run(args), } #[cfg(feature = "profiling")] diff --git a/src/obikpartitionner/Cargo.toml b/src/obikpartitionner/Cargo.toml index fe0379a..d8c1f68 100644 --- a/src/obikpartitionner/Cargo.toml +++ b/src/obikpartitionner/Cargo.toml @@ -13,3 +13,5 @@ sysinfo = "0.33" serde = { version = "1", features = ["derive"] } serde_json = "1" tracing = "0.1.44" +ph = "0.11" +memmap2 = "0.9.10" diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index 14c50fd..40ef113 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -1,8 +1,12 @@ 
-use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs; use std::io; use std::path::{Path, PathBuf}; -use tracing::debug; +use tracing::{debug, info}; + +use memmap2::MmapMut; +use obikseq::kmer::Kmer; +use ph::fmph::GOFunction; use sysinfo::System; @@ -99,6 +103,42 @@ impl KmerPartition { Ok(partition) } + pub fn open>(path: P) -> SKResult { + let root_path = path.as_ref().to_owned(); + if !root_path.exists() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("{}: partition directory not found", root_path.display()), + ) + .into()); + } + let meta_path = root_path.join(META_FILENAME); + let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?) + .map_err(io::Error::other)?; + + let format = match meta.format.as_str() { + "gzip" => Format::Gzip, + "bzip2" => Format::Bzip, + "lzma" => Format::Lzma, + "zstd" => Format::Zstd, + _ => Format::No, + }; + let level = level_from_u32(meta.level); + let n_partitions = 1usize << meta.n_bits; + let writers = (0..n_partitions).map(|_| None).collect(); + Ok(Self { + root_path, + n_partitions, + partitions_mask: (1u64 << meta.n_bits) - 1, + kmer_size: meta.kmer_size, + minimizer_size: meta.minimizer_size, + writers, + format, + level, + closed: true, // read-only: writing is not allowed on an opened partition + }) + } + pub fn write(&mut self, sk: &mut SuperKmer) -> SKResult<()> { self.check_not_closed()?; let partition = self.partition_of(sk)?; @@ -190,6 +230,73 @@ impl KmerPartition { Ok(()) } + /// For each partition that has a `dereplicated.{ext}` file: + /// 1. Enumerates all unique canonical kmers (two passes over the file). + /// 2. Builds a provisional MPHF (FMPHGO) over those kmers. + /// 3. Writes a flat binary count file (`counts1.bin`, one `u32` per slot, + /// memory-mapped) accumulating kmer abundances from the superkmer counts. + /// 4. Persists the MPHF to `mphf1.bin` for downstream use. + /// 5. 
Writes a global `kmer_spectrum_raw.json` at the partition root. + /// + /// Partitions are processed in parallel via Rayon (one task per thread). + /// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously. + pub fn count_kmer(&self) -> SKResult<()> { + let ext = format_ext(self.format); + let root = &self.root_path; + let k = self.kmer_size; + + let results: Vec> = (0..self.n_partitions) + .into_par_iter() + .map(|i| { + let dir = root.join(format!("part_{:05}", i)); + let dedup_path = dir.join(format!("dereplicated.{ext}")); + if !dedup_path.exists() { + return Ok(()); + } + info!("counting kmers in partition {}/{}", i, self.n_partitions); + count_partition(&dir, &dedup_path, k) + }) + .collect(); + for r in results { + r?; + } + + // Aggregate per-partition spectra into a global one at the root. + let mut global_spectrum: BTreeMap = BTreeMap::new(); + let mut global_f0: u64 = 0; + let mut global_f1: u64 = 0; + + for i in 0..self.n_partitions { + let path = root + .join(format!("part_{:05}", i)) + .join("kmer_spectrum_raw.json"); + if !path.exists() { + continue; + } + let v: serde_json::Value = + serde_json::from_str(&fs::read_to_string(&path)?).map_err(io::Error::other)?; + global_f0 += v["f0"].as_u64().unwrap_or(0); + global_f1 += v["f1"].as_u64().unwrap_or(0); + if let Some(obj) = v["spectrum"].as_object() { + for (c_str, freq) in obj { + if let (Ok(c), Some(f)) = (c_str.parse::(), freq.as_u64()) { + *global_spectrum.entry(c).or_insert(0) += f; + } + } + } + } + + let global_spectrum_map: BTreeMap = + global_spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect(); + serde_json::to_writer_pretty( + fs::File::create(root.join("kmer_spectrum_raw.json"))?, + &serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }), + ) + .map_err(io::Error::other)?; + + Ok(()) + } + // ── private ─────────────────────────────────────────────────────────────── fn check_not_closed(&self) -> SKResult<()> { @@ 
-289,6 +396,18 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize { n.next_power_of_two() as usize } +fn level_from_u32(n: u32) -> Level { + match n { + 0 => Level::Zero, 1 => Level::One, 2 => Level::Two, 3 => Level::Three, + 4 => Level::Four, 5 => Level::Five, 6 => Level::Six, 7 => Level::Seven, + 8 => Level::Eight, 9 => Level::Nine, 10 => Level::Ten, 11 => Level::Eleven, + 12 => Level::Twelve, 13 => Level::Thirteen, 14 => Level::Fourteen, + 15 => Level::Fifteen, 16 => Level::Sixteen, 17 => Level::Seventeen, + 18 => Level::Eighteen, 19 => Level::Nineteen, 20 => Level::Twenty, + _ => Level::TwentyOne, + } +} + fn format_ext(format: Format) -> &'static str { match format { Format::Gzip => "skmer.gz", @@ -391,6 +510,122 @@ fn flush_map(map: HashMap, writer: &mut SKFileWriter) -> SKResul Ok(()) } +/// Build the provisional MPHF and count file for one partition directory. +fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> { + // Estimate number of kmers from sidecar to pre-allocate the HashSet. + let capacity = SKFileMeta::read(dedup_path) + .ok() + .flatten() + .map(|m| { + let km1 = (k as u64).saturating_sub(1); + m.length_sum.saturating_sub(m.instances.saturating_mul(km1)) as usize + }) + .unwrap_or(0); + debug!("{}: sidecar capacity estimate={capacity}", dir.display()); + + // Pass 1: collect all unique canonical kmers. + let mut seen: HashSet = HashSet::with_capacity(capacity); + let mut pass1_superkmers: u64 = 0; + { + let mut reader = SKFileReader::open(dedup_path)?; + while let Some(sk) = reader.read()? 
{ + pass1_superkmers += 1; + let seql = sk.seql(); + if seql < k { + continue; + } + for pos in 0..=(seql - k) { + seen.insert(sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k)); + } + } + } + let kmers: Vec = seen.into_iter().collect(); + let n_kmers = kmers.len(); + debug!("{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}", dir.display()); + + if n_kmers == 0 { + return Ok(()); + } + + // Build provisional MPHF. + let mphf = GOFunction::from(kmers); + debug!("{}: MPHF built len={}", dir.display(), mphf.len()); + + // Create memory-mapped count file (u32 per slot, zero-initialised). + let counts_path = dir.join("counts1.bin"); + let counts_file = fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&counts_path)?; + counts_file.set_len((n_kmers * std::mem::size_of::()) as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&counts_file)? }; + mmap.fill(0); + + // Pass 2: accumulate superkmer counts into the mmap'd array. + let mut pass2_superkmers: u64 = 0; + let mut pass2_kmer_hits: u64 = 0; + let mut pass2_kmer_misses: u64 = 0; + let mut pass2_count_sum: u64 = 0; + { + let counts = + unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut u32, n_kmers) }; + let mut reader = SKFileReader::open(dedup_path)?; + while let Some(sk) = reader.read()? 
{ + pass2_superkmers += 1; + let seql = sk.seql(); + let sk_count = sk.count(); + if pass2_superkmers <= 3 { + debug!("{}: sk#{pass2_superkmers} seql={seql} count={sk_count}", dir.display()); + } + if seql < k { + continue; + } + pass2_count_sum += sk_count as u64; + for pos in 0..=(seql - k) { + let kmer = sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k); + if let Some(idx) = mphf.get(&kmer) { + counts[idx as usize] = counts[idx as usize].saturating_add(sk_count); + pass2_kmer_hits += 1; + } else { + pass2_kmer_misses += 1; + } + } + } + } + debug!( + "{}: pass2 superkmers={pass2_superkmers} hits={pass2_kmer_hits} misses={pass2_kmer_misses} count_sum={pass2_count_sum}", + dir.display() + ); + mmap.flush()?; + + // Build kmer frequency spectrum from the count array. + let counts = unsafe { std::slice::from_raw_parts(mmap.as_ptr() as *const u32, n_kmers) }; + let mut spectrum: BTreeMap = BTreeMap::new(); + for &c in counts { + if c > 0 { + *spectrum.entry(c).or_insert(0) += 1; + } + } + let f0 = n_kmers as u64; + let f1: u64 = spectrum.iter().map(|(&c, &f)| c as u64 * f).sum(); + + let spectrum_map: BTreeMap = + spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect(); + serde_json::to_writer_pretty( + fs::File::create(dir.join("kmer_spectrum_raw.json"))?, + &serde_json::json!({ "f0": f0, "f1": f1, "spectrum": &spectrum_map }), + ) + .map_err(io::Error::other)?; + + // Persist MPHF to disk. + let mphf_path = dir.join("mphf1.bin"); + mphf.write(&mut fs::File::create(&mphf_path)?)?; + + Ok(()) +} + impl Drop for KmerPartition { fn drop(&mut self) { let _ = self.close(); diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs index 2a39f4e..1175b9f 100644 --- a/src/obikseq/src/superkmer.rs +++ b/src/obikseq/src/superkmer.rs @@ -135,6 +135,18 @@ impl SuperKmer { } } + /// Deserialise from a raw 32-bit header word and packed sequence bytes. + /// Preserves the full header payload (count or minimizer_pos in bits [31:8]). 
+ pub fn from_header_bits(bits: u32, seq: Box<[u8]>) -> Self { + let seql = (bits & 0xFF) as u8; + let len = stored_to_len(seql); + debug_assert_eq!(seq.len(), byte_len(len)); + Self { + header: SuperKmerHeader(bits), + seq, + } + } + /// Returns the sequence length in nucleotides (1–256). pub fn seql(&self) -> usize { stored_to_len(self.header.seql()) diff --git a/src/obiskio/src/codec.rs b/src/obiskio/src/codec.rs index 62f90ae..818e8a0 100644 --- a/src/obiskio/src/codec.rs +++ b/src/obiskio/src/codec.rs @@ -26,7 +26,7 @@ pub(crate) fn read_superkmer( let byte_len = (nt_len + 3) / 4; seq_buf.resize(byte_len, 0); r.read_exact(seq_buf)?; - Ok(Some(SuperKmer::new(seql_byte, seq_buf.as_slice().into()))) + Ok(Some(SuperKmer::from_header_bits(bits, seq_buf.as_slice().into()))) } #[cfg(test)] diff --git a/target/.rustc_info.json b/target/.rustc_info.json deleted file mode 100644 index 3e2b77b..0000000 --- a/target/.rustc_info.json +++ /dev/null @@ -1 +0,0 @@ -{"rustc_fingerprint":2162534064148227772,"outputs":{"7971740275564407648":{"success":true,"status":"","code":0,"stdout":"___\nlib___.rlib\nlib___.dylib\nlib___.dylib\nlib___.a\nlib___.dylib\n/Users/coissac/.rustup/toolchains/stable-aarch64-apple-darwin\noff\npacked\nunpacked\n___\ndebug_assertions\npanic=\"unwind\"\nproc_macro\ntarget_abi=\"\"\ntarget_arch=\"aarch64\"\ntarget_endian=\"little\"\ntarget_env=\"\"\ntarget_family=\"unix\"\ntarget_feature=\"aes\"\ntarget_feature=\"crc\"\ntarget_feature=\"dit\"\ntarget_feature=\"dotprod\"\ntarget_feature=\"dpb\"\ntarget_feature=\"dpb2\"\ntarget_feature=\"fcma\"\ntarget_feature=\"fhm\"\ntarget_feature=\"flagm\"\ntarget_feature=\"fp16\"\ntarget_feature=\"frintts\"\ntarget_feature=\"jsconv\"\ntarget_feature=\"lor\"\ntarget_feature=\"lse\"\ntarget_feature=\"neon\"\ntarget_feature=\"paca\"\ntarget_feature=\"pacg\"\ntarget_feature=\"pan\"\ntarget_feature=\"pmuv3\"\ntarget_feature=\"ras\"\ntarget_feature=\"rcpc\"\ntarget_feature=\"rcpc2\"\ntarget_feature=\"rdm\"\ntarget_feature=\
"sb\"\ntarget_feature=\"sha2\"\ntarget_feature=\"sha3\"\ntarget_feature=\"ssbs\"\ntarget_feature=\"vh\"\ntarget_has_atomic=\"128\"\ntarget_has_atomic=\"16\"\ntarget_has_atomic=\"32\"\ntarget_has_atomic=\"64\"\ntarget_has_atomic=\"8\"\ntarget_has_atomic=\"ptr\"\ntarget_os=\"macos\"\ntarget_pointer_width=\"64\"\ntarget_vendor=\"apple\"\nunix\n","stderr":""},"6432102384495711296":{"success":true,"status":"","code":0,"stdout":"___\nlib___.rlib\nlib___.dylib\nlib___.dylib\nlib___.a\nlib___.dylib\n/Users/coissac/.rustup/toolchains/stable-aarch64-apple-darwin\noff\npacked\nunpacked\n___\ndebug_assertions\npanic=\"unwind\"\nproc_macro\ntarget_abi=\"\"\ntarget_arch=\"aarch64\"\ntarget_endian=\"little\"\ntarget_env=\"\"\ntarget_family=\"unix\"\ntarget_feature=\"aes\"\ntarget_feature=\"crc\"\ntarget_feature=\"dit\"\ntarget_feature=\"dotprod\"\ntarget_feature=\"dpb\"\ntarget_feature=\"dpb2\"\ntarget_feature=\"fcma\"\ntarget_feature=\"fhm\"\ntarget_feature=\"flagm\"\ntarget_feature=\"fp16\"\ntarget_feature=\"frintts\"\ntarget_feature=\"jsconv\"\ntarget_feature=\"lor\"\ntarget_feature=\"lse\"\ntarget_feature=\"neon\"\ntarget_feature=\"paca\"\ntarget_feature=\"pacg\"\ntarget_feature=\"pan\"\ntarget_feature=\"pmuv3\"\ntarget_feature=\"ras\"\ntarget_feature=\"rcpc\"\ntarget_feature=\"rcpc2\"\ntarget_feature=\"rdm\"\ntarget_feature=\"sb\"\ntarget_feature=\"sha2\"\ntarget_feature=\"sha3\"\ntarget_feature=\"ssbs\"\ntarget_feature=\"vh\"\ntarget_has_atomic=\"128\"\ntarget_has_atomic=\"16\"\ntarget_has_atomic=\"32\"\ntarget_has_atomic=\"64\"\ntarget_has_atomic=\"8\"\ntarget_has_atomic=\"ptr\"\ntarget_os=\"macos\"\ntarget_pointer_width=\"64\"\ntarget_vendor=\"apple\"\nunix\n","stderr":""},"17747080675513052775":{"success":true,"status":"","code":0,"stdout":"rustc 1.94.1 (e408947bf 2026-03-25)\nbinary: rustc\ncommit-hash: e408947bfd200af42db322daf0fadfe7e26d3bd1\ncommit-date: 2026-03-25\nhost: aarch64-apple-darwin\nrelease: 1.94.1\nLLVM version: 21.1.8\n","stderr":""}},"successes":{}} \ 
No newline at end of file diff --git a/target/debug/.fingerprint/obikseq-4791c70657a715c0/lib-obikseq.json b/target/debug/.fingerprint/obikseq-4791c70657a715c0/lib-obikseq.json deleted file mode 100644 index 3a60560..0000000 --- a/target/debug/.fingerprint/obikseq-4791c70657a715c0/lib-obikseq.json +++ /dev/null @@ -1 +0,0 @@ -{"rustc":17940977064402226622,"features":"[]","declared_features":"[]","target":1865439097712885860,"profile":2330448797067240312,"path":3117914143523182133,"deps":[],"local":[{"CheckDepInfo":{"dep_info":"debug/.fingerprint/obikseq-4791c70657a715c0/dep-lib-obikseq","checksum":false}}],"rustflags":[],"config":8247474407144887393,"compile_kind":0} \ No newline at end of file diff --git a/target/debug/.fingerprint/obikseq-5cc47015be91e3b1/test-lib-obikseq.json b/target/debug/.fingerprint/obikseq-5cc47015be91e3b1/test-lib-obikseq.json deleted file mode 100644 index 4b01c74..0000000 --- a/target/debug/.fingerprint/obikseq-5cc47015be91e3b1/test-lib-obikseq.json +++ /dev/null @@ -1 +0,0 @@ -{"rustc":17940977064402226622,"features":"[]","declared_features":"[]","target":1865439097712885860,"profile":619605765252926426,"path":3117914143523182133,"deps":[],"local":[{"CheckDepInfo":{"dep_info":"debug/.fingerprint/obikseq-5cc47015be91e3b1/dep-test-lib-obikseq","checksum":false}}],"rustflags":[],"config":8247474407144887393,"compile_kind":0} \ No newline at end of file diff --git a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/dep-graph.bin b/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/dep-graph.bin deleted file mode 100644 index ea685ce..0000000 Binary files a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/dep-graph.bin and /dev/null differ diff --git a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/query-cache.bin 
b/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/query-cache.bin deleted file mode 100644 index b9f03ef..0000000 Binary files a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/query-cache.bin and /dev/null differ diff --git a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/work-products.bin b/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/work-products.bin deleted file mode 100644 index ad58b7b..0000000 Binary files a/target/debug/incremental/obikseq-2j6dqqw76e9t8/s-hho6vbiepl-0ie9k92-17oaxmsyy8cxuem2djd4dy9hq/work-products.bin and /dev/null differ diff --git a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/dep-graph.bin b/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/dep-graph.bin deleted file mode 100644 index 9197fd3..0000000 Binary files a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/dep-graph.bin and /dev/null differ diff --git a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/query-cache.bin b/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/query-cache.bin deleted file mode 100644 index 75bb527..0000000 Binary files a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/query-cache.bin and /dev/null differ diff --git a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/work-products.bin b/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/work-products.bin deleted file mode 100644 index aaedb06..0000000 Binary files 
a/target/debug/incremental/obikseq-3q3fzz1res9p1/s-hho6vbiepy-0dc3k1e-2mx56wrd3i6p76vxgtzl7pdhs/work-products.bin and /dev/null differ