From cd2f2f9417b3b073899c0cc2a02bb4873e2aae25 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 3 Jul 2026 17:16:36 +0200 Subject: [PATCH] fix: validate packed matrix columns before repacking Add header parsing helpers to extract column counts without memory mapping. Update packing functions to verify existing files match current metadata, preventing stale or widened-column artifacts. Extract inline tests in obilayeredmap to an external module and add comprehensive aggregation tests. Bump obikmer to 1.1.35 and clean up repository configuration. --- .gitignore | 1 + src/Cargo.lock | 2 +- src/Synthese.docx | Bin 12991 -> 0 bytes src/obicompactvec/src/bitmatrix.rs | 38 +- src/obicompactvec/src/intmatrix.rs | 39 +- src/obikmer/Cargo.toml | 2 +- src/obilayeredmap/src/layered_store.rs | 161 +------- src/obilayeredmap/src/tests/layered_store.rs | 381 +++++++++++++++++++ 8 files changed, 450 insertions(+), 174 deletions(-) delete mode 100644 src/Synthese.docx create mode 100644 src/obilayeredmap/src/tests/layered_store.rs diff --git a/.gitignore b/.gitignore index b44f5fc..793b0a9 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ benchmark/simulated_data benchmark/specimen_index_presence benchmark/specimen_index_count benchmark/global_index_presence +benchmark/all_specific benchmark/global_index_count benchmark/stats benchmark/reference_index diff --git a/src/Cargo.lock b/src/Cargo.lock index 057ddea..0664684 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1704,7 +1704,7 @@ dependencies = [ [[package]] name = "obikmer" -version = "1.1.34" +version = "1.1.35" dependencies = [ "clap", "csv", diff --git a/src/Synthese.docx b/src/Synthese.docx deleted file mode 100644 index 79b120994c5943936fe04ee0057dafcd2dea47bb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12991 zcmZ{KV|Zj+*KKUuw$-sZ=-9Sx+qUhbW7~GewmNpwVTU*8eDC|6)4BK7^X#gpevDbG z##(dlIp!R)62KrR06-8B0QF&snh>UKlL3GL08!uo0LTCU02+ce){aKjjyj5Nwnh$G 
zbgovGjY(tD{R{}gKBPa;NUhZf0-sa?v;;_hfVbR#j|$~U<~iQIOe0|tfQV0S$`G9# z&-Cyzpe{I`M418e9GQvU)#D5$u7Ps0R@r^@VwAYpJvMAu=7K@{TipfZ>3UEG9OS&viWyd_Q7ceX84Gv(PbO+$ zmw;@>?~88x171~x=@K&ZvkVHBGZwW$``8F}PlV^vg*weZoH0-Wy4jSoHdoHWKAg3M zYW|0tW6!2DuXKG2eGnq#Up*)D+~B6}fE90f4ff-@GRU^4=U86t=l$f!V$h$x!0jq|YkmdiD`zg@M14x)#0;A=Kd3bY^ zu>afsviT&sck&HQzi0PhOpi$-qn^-l)2qBqPp{+ylgfA88X)7p+Hj zHv2wSVv=J&SyNt_uJt=r$a-JdOUxPl=u!_1etF2!<}|TV*q$yJf5A*lH8zh-ciRXv zE9#JVrL%fx$OalR)l8}4Zq{N({bAPmqDnh_cJ_lpR${AcbY4K+>rK|;Kx;n!Fl$<9 z=2}P_0@TB|)xGtd*~nc7tJ%IIA=QZMUo@L%&iKO|DP>Bq3GbPDx27b>*A+y$cmq-= zU&o8#^o53S(X7eqYVJ(ki5@2;!a68FGJ+Ys@sNdtMKNy%X7FcoqUFt z2O8SI-#MryVEMX(I)fy6^+0az3)AQ?J01fWm7CwxT2_9=v&GBUiK z&TAR_XFU{iubJ_r47CBWd0M*qbw<174Bd#lhNKL3(Yg{js|O(}r**L_&xZ=tjdc_ezlAXnCoG!@MVpCH)BGlEcETUCB}FjYOgOanI(8AB`K#6BL~}75N;Sf3;)#&xW_rjY)NBGD#%7f`9&9oi>#TPBnY{92CU2wFg9 zLc+_yn=Rsal*+^rn&;~fzPeq!H9l^`b8<`P{U3_<`)5Pv+}&n7GH9n&_pJuTDv~ zQJ%DJTTLMmhN)_qHv*GB%PbNZA{V9mCb=E z?Nu;rQd>E6h_$Jb-}7si!0O5Ztn|fAtlc|Vvuc~_BXyGpeKA@MM2I^Xno&+eZ##_XF zgBUKJmH}OyaeL)vT1Sz>zzm!fFf}rE7(jX#K^L8wT9js$sjpeMb^!SNvy&R^RYQ?? zVE~XqV2tSb70f)z{@3{+%(6wn|r*+fr}JQB!`vz!V~5P_)s zvF>c9&pdRXM&sc^<%trqg@}@hWFjS{EP?7`o>(fH`_nUZ05!zUNQ}%|NqHdD)RlvC zi269D1G2_Q&hdJJXb+-J!Td4P4#Xyx84jJ#hVx%;)l$WMXt`Fymt)!S@>>wPx1f`_ zsq}D{8HGcl%yzWp661*$Bk9k1D3z*e%Nh1M;@3!VtxP;iG90+n#ljoXY9Oa{`>>P? 
zjC5EJ~Y0eC)eo39P)sW86=(uM=)`$%gkttUYa{k%)d_fb|5oOdBjWiR4b*O=E?T;CVO>qvWHIgiv zL`)|@7GFy0;wxZ60?vysjmatIQ0kC!8em7q{mxa2luYgX9sx_&GdsQ7iiP1j%|oOl zN7_d#)Y&f5?>Fd!+MTX96F%>E&qLpTZns4ajf<#>$G6Gp517mVqujb5=S5BRX_Sgi zLuZMlvO^<4BXgTGow=ihnqNY*uTN&%IR{YSfIv-cA9d^-og>mGg^HxH!lpwPgsAmE zh4vg>4S~d#Or(}WF+MqhEV2}~)Go?Xr_qX)P`(qR&LaoJPqGfVgl&j)z?lS3%SG=T z3%Maq6l~-M+&rGX>5ll3B=&s>>Nui*?+StsXUKOe4F6ZZ_8YrrfMjTfy^4kg+#sX|GKu%X5-jO#isvzJ;T8`A~ zE_qG-rM4>kbUJa90Gl(_95g(sicKnh8ItdJHwdD~!NqAfT&=Si^3ofAqdrTnjp3CV$ir^UndAWz=|s_pkbKP;?2g$L9lQH4ZFD4U+#i?7LKCGEc!`>z^o~dq{ap)@ zSKvv0UBN3}WpLNAGxA3yzLcP^Od?X!;4P`++rh6jWN*<#7ZSm#(R>N=^hpPOGO~AJkVwxYzWc z%R#zx3`z3~+*Wss=H-RRqYE`v-=lP-*NbaUuG3v-wQ*j*jpuL(M>2UMLd(exKy>WX z-_u}QtYx-c5fzQN#PRD?^_zzN&N$WgRyq-&L09PFUBnxM2555q!jy@hM{nZZMWD7+ zR<=`x#2F>#Wup~YonVz?2>0OR?nXvu?UR~0tBI9|*UPzcQgx>7l z?o`|vfp7%q>tq}eQ5_X@U9B>eN(|!91@&0Bm0@{^1(Vad5?trpFiKv2En2}a!468E zzDG#FC@}}6ATdBuNGS#DtawjX1v9Kira;nEEh(oVEiIeoMcP$($gQMVej!eB0;9qq z03pj-XQK?XZ^z91adWw=>n1bD)4)(J4UNXMWz@jbl%#%@5g?Z_J#b~WL{kKmiZC2? 
zj=G|h33p)^)5Y<`1F3wJ7hzPsNE?ouD7A(hX3pI@wQAw?9{7>;Y$@8ph*X6gh9+l8 z7$lXKRvGo?bWN!iJTrS%q;~n<=TLD#W52nqzVlt*k-;XuycpN^TcoO948U+1X4?M^f!)aTYC${(ovBXC`BBc1PJFugjnI5B|rpV z#+0lw^rW`R41-9}Wx_3Ve$#t|=?UOMr&3GSYFp@yP2tD`?h(mG0Q8hxAfBH?;!M@j zTR=;&PZozi%@TP5$zi8hCa=u*$8#UF5)xd>Uh5t-E`^B_030jAiGB%JwbOd_4}N>> z*I~Pmt;aU>a!^yHU52K-O6k)3-dJEBC`@c{gTI6|{Wa+~q#eF$3;dsz{KaNoAjpRrtp@=B!1-?_|3C8P-}U_8Qm1i# zL%N>;rRQ3O@NjchIx7NmZKBSwSsmYZb}mLW*N@rBhQ3XWb2o@hll9&HDIm708Q)#k z=E&0~%9m)&FHf%NLW+93aDA&nw_S~Ihk&wUr``LOhK5m_UxbC%{z554{b@YfRKToa zF21k8;y1-Yxn&&vFVE2pPcY3X!+o=XcgY5+IqG#ar+L$(nBH|9ry#m3KU{Ob3)yPl zq}fJMprX{E)*e@;pyD}SZz67?zE+ZVUZfzaZ@u~^-6oyg|{VbLO^S*mn@dWyl zy}*@7Vyh3mI3Pk8;c9_4mvFjB#{O|@INe(S zSY-mZAX$CxE*h=3j)%7n?;C60brw`R+e{h^G@WcQ|A1jHE=^sn1MJKBkUU1EFT{vW z2p-icLU(b@$BI;A1l6-lcZ?c6;%`!Z*M{eqYwzF~n%;k|Gh>KDN9tpj3&8*Ykp8<) zYbPsxBYQJzlfPEjnbaHY&wvoDQ+Vt}O8|s4b2d0IVO$S>4@jkGIg2)nX1nFB0TPwX zDisTyF!?^UM?DIYu@S^PjaX_*%XCD~4G~Il5+6<#!1w!&*024%NJe-pMkt+z0aT#S z3v5cAjZ&bD9XWa--#V0GwKN&W!XXj$#6pd=jxJQbD4rR`RXdQq2xv9?v7})j=*^tN zl^yp7Ox4n+E>d{%w{wQET$5}UY5+shO(rQmF%i|bwb)S>!?cNL)DI}$(}mqB}sgI@UsYAE(o0Busnm|t^5`6eMqq8KvtfLy~(8X2M6m&Q5W*U-C; z;^hz|Hw2Z}-fze1q1DFu;bxI^8+yf<3a%d+?oLZ!2g$g z2S+!{|2Z4yS{g2EC1~D!-F;)1xLZc8<;OI`r7HE(r+ zku=k!N~^5!u$KH=o6iRIUsF1sZ=Po^cm&SWMp};CX zK%5@oM4a@LG%-{9eqnJ8obq&nW}B&LKmlss#v}(^4HxdJc~LR)^eO46x96e(#nA9T zUup3QIZ1l!Zh-E>GNnAcs7ud01zPxE zkBkJjk?$9-0C={AWESqakH7)&x)p?#^9=sCq<0PQKrw38D^srqXRf2~PC8YWH9Zz6w&?TlLEy{72(|NDB;iyEyF!Lx5(I> z+Yko$0~6q3x9@}pNA3ZpN9%(LV_NweH)!7Ev*?Dn?s_q|>0r6)o1BBWmSb+)BD%m8 zZ~*8)Q#S$w;cUCAWx=Fc63?0~&*P>{FJ%F0y=W##^eM<}Y2C zBZH&ONs#z`7b#Ee_#G_K#c@_!4BCiXnq@Pjg5e}&ie)Tk4w)?TY zZal_$ViJx(at^5VH7Vrc(cv>h$pa|HGxY-sgj$$`-u5?jAp*l$?1H3$ z1G^GJadHWA1J^OFlH+yw=X>g{D3KPlnBZks#Ji^?KZ1OXg-I|^2T67e?}|JZCF1LO zFXs0K&}V8RsZbBSwcyi7zNM~&H)_`Q3Fpvq`xyoNs2u$Dt1$VD$r$>6&WMP{%bs7A z(|uqfBLde*Yza8<0v6jKw){m%0Z7K0k$zbb&4yBm=n8eiNo zgZ*NmJcVGmpT)+&t}J5(M7oX~G|54wfFc)Ws>Nox<~fmP>8PkkDhJg1XVZn?&f@Ef 
zkoh?Bv0kXrG8)v`3ylv!{ITM~8)yK8SFBbMDZD$cKu@%;$gDeVA6%zg!S|mx$r$o( z>Fw+`$c;43a5nYi(#W*KU-yuk#@-UiCh9ug^4h88l))s?=;t~t9RRpE2lhF|#)D6f zHL>@}^=PuP)3WEx=a_$vsiyVqHc@gbf3HcwA-?fqG8xJuc`3=fD$q%~O0Z+x;ZOtZ zlSzc+Gt0P2*b0=fhgIrpt_`V@*gUu&b^9fqG|FX_(J_~k+%X5^b|-%!t)1Sk+|=+Z z-NYF*dNPrN(;}mwehVimVNeBAhq^{q!8mV`6}mAeIS3Q!;DM@`B-4pcwhXsGW&2Pw zE$61`H^=+L6hh=xZCDCid?mB2igE5HEAU-Tn%^M@n@<+wwYUrhM|OPQ5vun_^10KRL_)gEh9hn5X;vksK{IPZ$ zBexOtMXd@4_KT8KwZ^kR7G!*(U$WT6M;Nz-@yvc?CsHx#t^xrURVM;&uwT(U+*ZUF zo6x$t-;48X;Odwsd6qVAOPnV?)d3gV>|1B6*xz)3vQ6ofW^nx!?TjbBL&GWUmKSl} zHY{16h!D5dZrDB;q;|R)!@Jg69r2X)U`xKE@Zf0r(Vj!fE6-j@?NQkD*t%|O&M|)J zai4!$yuHzPH6hSateg`ZGpT#}hL9pHvnyWN^(~Y=l%KSJIBQ-<@_;s{)3JXsMOib; zIqY8kg?jeaAkB;!9$H5dJzQOU9`{LR7v|>Ju<0?gtMIah?$xDFY%awBMk9@(6ailr zuagZPLjNR!Uvf3@Qn0VfRzQDKaYDV%sx_$9GkvU=Rvba|cbgUx`D}>Qs9^W5LO%$t z1*0LDBV|sC8y!3k#o>+0+(Lr5mi|#+zy-d7VqO~%=7%OSMta1}uV;mgYP3KqbyHck zUD5=Z&P7`IDNRl}H3T>m)iOZDvm|}mj2ij%gZ4D{QYo|pBEOsa;;lPtbOwb9aEs3~ zRO9Ih7RGqooDyj%Ee2*Atw1$NA;dgD$$xn2f45w~G*QM?Q&F2}p@OI_ONQdS`*BWc zYL`8C*OhDU1@iEt4J0eksr1$>#RGq=8y%CanhkT(E1gsk;sV8w^wqr(K?b|w`0eZ^ zK{6qp9{Nk`?!>oK?`;Fi`0JA)Q{Cju8R;SFqZ!QUOOf4T~PBg*|ZxTOUPRK3#S}K z6x1Q=6TPP~`bvHXau9o*EIyzmP9JNZ{8`VDp(`)U)G-S+eglqKPrkUaRUBSc9jHkg-K-g+nlx&P!>?KKDl#66*Dju9F+;vU5O10yHj z)2A9Y>3R#OuOfGMG(5#k))-eqK}DcvMC%-O58KF+%#eh?gl0LWuNxzJbDEAakr0_D z*$K9CqoIQH%-S|5@_QR8%M*DU#_tR5vne~=i;rUR{96WOp^wmusbCmSIoOODYP zKUD2COp}jDUk<8?ed@4$YYtX3IJq8lia}lu?Ad@ZacF!aM%Fu0q;{q7HG4Foz3sK# zi4Y7VY-MuvBvl9c?Mr9EFWL-E=--oJClhmMXq)g`kd7VxVd;oe{4oTJfzBa%w+~zc zjw53hq=pALEY@qJd87F|$a*xR5U2b(muUUh`Fck=J3LO$Iskm3=-C}FgQIrmfj*~& zzO5W~!%Gh~09>GJaMJU3q{*x(m%K<{fh{JzxB4iD!6OF|$q*e;-CW;j{IkWWWF{Wj zE?>LCO%6JLjf0gln2#u`xUh4n`m|z{yeQ80LR5FlT`8&`i9)1*oFrl9# ztcD)Xg$gJwS zuzE}vFlVDfJV1ls2jya-6!(@k`=MdY>kH{<%eK~I`Q`3v22f8Wj@45Xy`v;xCwH~b zD%Z$JXrP!+7}AzK4!Z{|z$3(mi@e~`2ZLjD%XIUh58lcA&>zj|ofil&IUW9uR82;q z2ty&i($q*{EX5AW9n?ONW=X}tHqxr{*0S#OT_eTQ!$7cnS_Sfp&YvveTLUAneAMA~ 
zAH%%=9bOsOSp7A-`dh7K&Gd%%6Ci+Xt3Yll%<&`&r{BxpLpH$^46z6d#bF{pTrbV4 z9_5klkucr(u%2?XY|>LF}NNF5F-qmTGFLvmMxjq>tCVN#$rvziEJ zr<#qeaX_qsUH+jfr6q%Q_uDUuaPjOnz(0AoiSwGn{Wwh`|KZ_7@5|cT*gDV~*w`EW zr3nk;$E^GQ9z*})wZlUdRz^arRBe^uMciD zT|3#1l-st!dp$ecHG_-&c5w_Lsr-ax<_I)xfW&g#OPRO4_;4YgzdCX4@D5(zQcOK!ws+0+=F0g+~0`d z8$AbY4WDMv<$e8@y3W+gnM}PP>da8)4V;^Xv4rmVqE=k2htt9{Tcq_UAOoOeK@okSrE3FxF2`2dgh5Fj;YV3 zLiHC3AtEEW?@6T7eGU;-vWr3H(m4uM@H}Nakmdr&WiY~JG3h0da2-!Ll~|PZfk>GK z^-0+Ht{PCUJfaKiUEm6KJg2}9t0W12)S}}F(2c_tE?5F&F z7`@pNu>LkZ8oFOkt-pYkNZ5DHT+xUxJ6BpJ-SdxEfnaBnjM{G-J>tGb2YPz-L4ggz z*xSu}li!z$VApkx_wz6#eWCUyPU0abW$P-RE=7&+?a{7y>w5#-OL_?{&s@mIEZ>Y7 z)%l%ZDoy*6h5+>*K?mL2S%=yWd!ZUA3$A`Rgo`D?eZKv@WIHWaUY3vgI0l4gGRNe= znpcW!$&Q`j#0@wXce4QFIl%rYJ{a3neC>-Lxt`UfU+g)Nre*#PDW~3z?}UgYV$EPE zEpQV~?U=7t^fHxhIocY&szSNojKl@}&CYV^e#w``SRR&+(hl9~j3eCnU*U|}FVW4g zM*iCEt5)RadU!)%@|c?eLuVonCW5lmhYB^^lrhCwuj`<@*VQgJ(0?WcprQ0P!;c%x z{Kxr1_;*t1+1mb{5%PcATsj`8^Vh>9+{)~=2oZh)K$KEu(t0GQvNsuFgZNBY&jk~( z$IV@gPo5bgb-f)~UBfYAUMla~eW8c8{b8}Ys?=I(6GJ!Fn0j4&1Uz6#fJ$L&rsT{K zLTVu9G(DzBcphu!d1mx6>l|l`lC#}i(153_AN2H|+$Ii~%Z=$XRfUTldw9`NI7i(` z$v&O(_10HLc^MSR8rX+45fol7Nn(PA1zm&zTw5vXDOQ-_%!g!XNe(bc1 z9k}+d&m%gc3Opo~ASvVE&6KL?Jo{oLcy41D+#j5?y*Y|S$~=k}8P5Bg4E>f4ltU%p znq2^in@&8*ZYg4PgG@9?B})#(hl!*#4?bC$SAL*rmiHkuiXuvv6I<41W>3aXqWFBW z*Tl?XE`VKRXZh@>@@(H1?^MPM#7DO(_D=Q;yw-x88jAuLTyNn?vJL*GM4lbR@fwVG z@@vpPcO<={6B7NycVtcj0D$_jBY*og98Ha^jOhP|d2qwso*_xY@ntMDTQKYyQ~H|Qd)PUw+%?FbJC%x@#CJlRQDO6c6!7)wPQ z23Hz0VEK=xFfABD2pEJOf!ZZp$g*HAx&qb`9U#NuCBkoV6mC3;Z#;c)9fpT2v#As0 z)L;e{Z;9MDls1T9(FCQ~_V0i^7kMIm03h~+P>aC%i%0ouEkjI#x64bd52;V>gAL)!(Ymq#Nz;IO&#Ossgx;G zjMdWPzon*&51-rp+1_!rE_#e-9`Ul@4u#a!MX}AB+coLqb2I)H&$}~h zY_Xq>&+n6Sd>`wEJXw;ri~Thf`Yq4*I~_j0r5!Ao`?(f8I0SH)X~q^2U^9@7%)3D2dwyH5j+y( z$&B2A*P{nT*Q}AE7e@xk%zAhrVjA=tnWE>@gqY}RofDE{P?YY&ZmHM>2$2Y~3k(7s zA44#x59>pCzH#6}T_FUJ4Y&*5+v@BUxY!cFyO$+twI%A$bb{xKg*o@La>^4jnG()e z`-j>?v26?W@K>Y4h+@@22URQW>Hgr}-)@PWWDyATp9HY00^KrF5H4JkEwb!1Igc~T 
z6-ezdpfyC7PY-T5ZrSfAm^}>=&WucyT@w!NF<|G1Snn6*lQmHe)N7mdzQW4e@v@}%%Fsx%7P<#yb|m=deKz_+}g_L5tZ$7xF-N> zU;Rm3o8W1Qa|P+0PmWO*M&^-eH`YGiIfP3K$98VooS%>+wUdFb?KyC9XK-6-fo$BQ zp#?p1Uqbr>?D2gjziEeLU8D|2$FT$*_e40PO&IrNwZd^GYK?fTjrH)^9 zWl6HTQ@j4FpE$DH+UW-~^@t-u!ZgACqonHuu-S9Ig#o#|%;@I8W8<%T;gtMpx{Q*i zli6r`4f5Bv^RVi4GINx<f4O=0$gVHOffwoj(AHHc#YIfmQ^tEw(0`PT{f8d#bXP378{;lK&P09yL4ow*5>AEUyzFLmCVNa)767N)i zbFPxg$Xwwr=rjN7ijT1wVRx*kkR_;%o=Pj1w2#!7e_{EXJTq}K9ZYg&W2J(7>DSoJ z5bOrI)q=e7Oi5m9VMCr8>&0`^^^LrE94e&AuCq1vQ$q(x(yh(c6KfJYRrgvJI*?bv zwEI<42y&sTpT`vLBp8+xI$jwm`6~k}c%#Ir44Q0s8PWyoby>>26`WZDt%|JBO2u+i zEI7r6PB>$hZ0F=A%?y3;G$v%S=wPXmGF!z`Ln)i3um?+GDLhyxX7&XNl0Bnbjme!) zS?a!pnD}eB<(6z`%6SV->jEp*?Lp>+^Aru1%~ouNr4H>;=o@vIt&SyVKao&%K+{#C7g97=76t{L% z)YG^8TRoj8j#(Elpaeg-L5)3A2FD)c$Cj}%& zWk%r!+M-qYHg4X?>Sp9P1fW?hkF%JNNZo-&IL{h-5|lrC=cSEn`Zc9ZyW+AcJG0B}Y#1CQM(z^ne3r%F;K zuO1?MsUgFkxl-${+z6@HM!|p{maaTS$2z(X(rJ@qE+#z(_`nk2fH$oI@HkXoAKq{7 z5UV_>ma4>V6ISbDl#$B>The1;3!ThbMCV~u&H%n$i~WXEceh|?p71^k_~I0Lz^jNH zV~s&m>R`NCsC0#%{BYjCcH@PS1AaPouZLaXL#Gh+t1~U%pG)2=S3ZZ>M^p>oB);I} zTuAu(c1-$y7ecv!@6VDd2uE=;^P`j+{1roc*nIRF($6D)q=KnofOD1u!izG-uQ zL6X5q4ScL0AOcUxNZ6~K>%S&GNl6hXZ{J$*5pW&aK4FABR;r;{v|M(g{NBTU~ z|0mM;$Ncv1h5S#XKU!~p&-g!w`aF35CzRudBKkYjA7l8>_|LQBfAH5It*gKB|4foU zgFm+m{{eG-SUvs*|M3q#<3Bgb{=xfwm>vGc|GRDW8UDGy^bb4?@82Dg|8$!^!#}rF z{(&#x{~P|l4VKRVKDRdh2@p;2U+(^uq@UrR)$2d-6QY0p{(rRWXY^;K^$*&UDeV=>Gcl Fe*neLO(_5X diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 89121da..feb7267 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -1,5 +1,5 @@ use std::fs::{self, File}; -use std::io::{self, BufWriter, Write as _}; +use std::io::{self, BufWriter, Read as _, Write as _}; use std::path::{Path, PathBuf}; use memmap2::Mmap; @@ -171,19 +171,43 @@ impl PackedBitMatrix { } } +/// Reads just the `n_cols` field from an existing packed matrix's header, +/// without mapping the file. 
Used by `pack_bit_matrix` to tell a genuinely +/// complete pack from a stale one that predates a later column-widening. +fn packed_bit_matrix_n_cols(path: &Path) -> io::Result { + let mut f = File::open(path)?; + let mut header = [0u8; PBMX_HEADER]; + f.read_exact(&mut header)?; + Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize) +} + /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files. pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pbmx"); - if packed_path.exists() { - // Matrix complete; remove any leftover column files from a killed cleanup. - if let Ok(meta) = MatrixMeta::load(dir) { - for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } - let _ = fs::remove_file(dir.join("meta.json")); + + let meta = match MatrixMeta::load(dir) { + Ok(meta) => meta, + Err(e) => { + // No columnar data pending: either this layer was already + // packed and cleaned up (matrix.pbmx complete, nothing left to + // do), or genuinely nothing was ever written here. + return if packed_path.exists() { Ok(()) } else { Err(e) }; } + }; + + // A `matrix.pbmx` can already exist here even though columnar data is + // still pending — e.g. copied verbatim from a merge's base source + // before this layer was widened with more genome columns (see + // `obikpartitionner::merge_partition`). Only skip (re-)packing if the + // existing file already reflects the current column count; otherwise + // the columnar files are newer and must be (re-)packed, overwriting the + // stale one — never silently discarded as "leftover cleanup". + if packed_bit_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) { + for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } + let _ = fs::remove_file(dir.join("meta.json")); return Ok(()); } - let meta = MatrixMeta::load(dir)?; let n_cols = meta.n_cols; // Compute offsets from file sizes — no column data loaded into RAM. 
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index b2fa97e..6b030e0 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,5 +1,5 @@ use std::fs::{self, File}; -use std::io::{self, BufWriter, Write as _}; +use std::io::{self, BufWriter, Read as _, Write as _}; use std::path::{Path, PathBuf}; use memmap2::Mmap; @@ -228,17 +228,44 @@ impl PackedCompactIntMatrix { } } +/// Reads just the `n_cols` field from an existing packed matrix's header, +/// without mapping the file. Used by `pack_compact_int_matrix` to tell a +/// genuinely complete pack from a stale one that predates a later +/// column-widening. +fn packed_int_matrix_n_cols(path: &Path) -> io::Result { + let mut f = File::open(path)?; + let mut header = [0u8; PCMX_HEADER]; + f.read_exact(&mut header)?; + Ok(u64::from_le_bytes(header[16..24].try_into().unwrap()) as usize) +} + /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files. pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pcmx"); - if packed_path.exists() { - if let Ok(meta) = MatrixMeta::load(dir) { - for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } - let _ = fs::remove_file(dir.join("meta.json")); + + let meta = match MatrixMeta::load(dir) { + Ok(meta) => meta, + Err(e) => { + // No columnar data pending: either this layer was already + // packed and cleaned up (matrix.pcmx complete, nothing left to + // do), or genuinely nothing was ever written here. + return if packed_path.exists() { Ok(()) } else { Err(e) }; } + }; + + // A `matrix.pcmx` can already exist here even though columnar data is + // still pending — e.g. copied verbatim from a merge's base source + // before this layer was widened with more genome columns (see + // `obikpartitionner::merge_partition`). 
Only skip (re-)packing if the + // existing file already reflects the current column count; otherwise + // the columnar files are newer and must be (re-)packed, overwriting the + // stale one — never silently discarded as "leftover cleanup". + if packed_int_matrix_n_cols(&packed_path).ok() == Some(meta.n_cols) { + for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } + let _ = fs::remove_file(dir.join("meta.json")); return Ok(()); } - let meta = MatrixMeta::load(dir)?; + let n_cols = meta.n_cols; let col_sizes: Vec = (0..n_cols) .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len())) diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index b164593..a38559e 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "obikmer" -version = "1.1.34" +version = "1.1.35" edition = "2024" [[bin]] diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs index 433183e..4747af8 100644 --- a/src/obilayeredmap/src/layered_store.rs +++ b/src/obilayeredmap/src/layered_store.rs @@ -96,162 +96,5 @@ impl BitPartials for LayeredStore { // ── Tests ───────────────────────────────────────────────────────────────────── #[cfg(test)] -mod tests { - use super::*; - use obicompactvec::{ - PersistentBitMatrix, PersistentBitMatrixBuilder, - PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, - }; - use tempfile::tempdir; - - fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { - let n = cols.first().map_or(0, |c| c.len()); - let dir = tempdir().unwrap(); - let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap(); - for &col in cols { - let mut cb = b.add_col().unwrap(); - for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } - cb.close().unwrap(); - } - b.close().unwrap(); - let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); - (dir, m) - } - - fn make_bit_matrix(cols: &[&[bool]]) -> 
(tempfile::TempDir, PersistentBitMatrix) { - let n = cols.first().map_or(0, |c| c.len()); - let dir = tempdir().unwrap(); - let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap(); - for &col in cols { - let mut cb = b.add_col().unwrap(); - for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } - cb.close().unwrap(); - } - b.close().unwrap(); - let m = PersistentBitMatrix::open(dir.path()).unwrap(); - (dir, m) - } - - // ── ColumnWeights ───────────────────────────────────────────────────────── - - #[test] - fn col_weights_sums_across_layers() { - // layer 0: col0=[1,2], col1=[3,4] → weights [3, 7] - // layer 1: col0=[10,0], col1=[0,10] → weights [10, 10] - // combined: [13, 17] - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]); - let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]); - let store = LayeredStore::new(vec![m0, m1]); - let w = store.col_weights(); - assert_eq!(w[0], 13); - assert_eq!(w[1], 17); - } - - #[test] - fn col_weights_bit_sums_across_layers() { - // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2] - // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2] - // combined: [3, 4] - let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]); - let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]); - let store = LayeredStore::new(vec![m0, m1]); - let w = store.col_weights(); - assert_eq!(w[0], 3); - assert_eq!(w[1], 4); - } - - // ── CountPartials — layered (one partition) ─────────────────────────────── - - #[test] - fn layered_bray_matches_combined() { - // Split [1,2,3,4,5] across two layers; bray dist should equal direct computation - // on [1,2,3,4,5] for each column pair. 
- // col0=[1,2,3,4,5], col1=[5,4,3,2,1] - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1 - let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4 - let store = LayeredStore::new(vec![m0, m1]); - - // direct on full data - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let expected = CountPartials::bray_dist_matrix(&mf); - let got = CountPartials::bray_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]"); - assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]"); - } - - #[test] - fn layered_relfreq_bray_matches_combined() { - let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); - let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let expected = CountPartials::relfreq_bray_dist_matrix(&mf); - let got = CountPartials::relfreq_bray_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]"); - } - - #[test] - fn layered_euclidean_matches_combined() { - let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]); - let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]); - let expected = CountPartials::euclidean_dist_matrix(&mf); - let got = CountPartials::euclidean_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]"); - } - - // ── CountPartials — partitioned (LayeredStore>) ─────────── - - #[test] - fn partitioned_bray_matches_combined() { - // partition 0: slots [1,2,3,4,5] col0 vs col1 - // partition 1: slots [10,20] col0 vs col1 - let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); - let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]); - - let partitioned = LayeredStore::new(vec![ - 
LayeredStore::new(vec![p0]), - LayeredStore::new(vec![p1]), - ]); - - let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]); - let expected = CountPartials::bray_dist_matrix(&mf); - let got = CountPartials::bray_dist_matrix(&partitioned); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]"); - } - - // ── BitPartials ─────────────────────────────────────────────────────────── - - #[test] - fn layered_jaccard_matches_combined() { - let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); - let (_d1, m1) = make_bit_matrix(&[&[true, true], &[true, false]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_bit_matrix(&[ - &[true, false, true, true], - &[false, true, true, false], - ]); - let expected = BitPartials::jaccard_dist_matrix(&mf); - let got = BitPartials::jaccard_dist_matrix(&store); - assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "jaccard [0,1]"); - } - - #[test] - fn layered_hamming_matches_combined() { - let (_d0, m0) = make_bit_matrix(&[&[true, false], &[false, true]]); - let (_d1, m1) = make_bit_matrix(&[&[true, true], &[false, false]]); - let store = LayeredStore::new(vec![m0, m1]); - - let (_df, mf) = make_bit_matrix(&[ - &[true, false, true, true], - &[false, true, false, false], - ]); - let expected = BitPartials::hamming_dist_matrix(&mf); - let got = BitPartials::hamming_dist_matrix(&store); - assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]"); - } -} +#[path = "tests/layered_store.rs"] +mod tests; diff --git a/src/obilayeredmap/src/tests/layered_store.rs b/src/obilayeredmap/src/tests/layered_store.rs new file mode 100644 index 0000000..1387013 --- /dev/null +++ b/src/obilayeredmap/src/tests/layered_store.rs @@ -0,0 +1,381 @@ +use super::*; +use obicompactvec::{ + PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, +}; +use tempfile::tempdir; + +fn make_int_matrix(cols: 
&[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(n, &dir.path().join("presence")).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +// ── ColumnWeights ───────────────────────────────────────────────────────── + +#[test] +fn col_weights_sums_across_layers() { + // layer 0: col0=[1,2], col1=[3,4] → weights [3, 7] + // layer 1: col0=[10,0], col1=[0,10] → weights [10, 10] + // combined: [13, 17] + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[3, 4]]); + let (_d1, m1) = make_int_matrix(&[&[10, 0], &[0, 10]]); + let store = LayeredStore::new(vec![m0, m1]); + let w = store.col_weights(); + assert_eq!(w[0], 13); + assert_eq!(w[1], 17); +} + +#[test] +fn col_weights_bit_sums_across_layers() { + // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2] + // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2] + // combined: [3, 4] + let (_d0, m0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]); + let (_d1, m1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]); + let store = LayeredStore::new(vec![m0, m1]); + let w = store.col_weights(); + assert_eq!(w[0], 3); + assert_eq!(w[1], 4); +} + +// ── CountPartials 
— layered (one partition) ─────────────────────────────── + +#[test] +fn layered_bray_matches_combined() { + // Split [1,2,3,4,5] across two layers; bray dist should equal direct computation + // on [1,2,3,4,5] for each column pair. + // col0=[1,2,3,4,5], col1=[5,4,3,2,1] + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1 + let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4 + let store = LayeredStore::new(vec![m0, m1]); + + // direct on full data + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let expected = CountPartials::bray_dist_matrix(&mf); + let got = CountPartials::bray_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "bray [0,1]"); + assert!((got[[1, 0]] - expected[[1, 0]]).abs() < 1e-12, "bray [1,0]"); +} + +#[test] +fn layered_relfreq_bray_matches_combined() { + let (_d0, m0) = make_int_matrix(&[&[1, 2], &[5, 4]]); + let (_d1, m1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let expected = CountPartials::relfreq_bray_dist_matrix(&mf); + let got = CountPartials::relfreq_bray_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "relfreq_bray [0,1]"); +} + +#[test] +fn layered_euclidean_matches_combined() { + let (_d0, m0) = make_int_matrix(&[&[3, 0], &[0, 4]]); + let (_d1, m1) = make_int_matrix(&[&[1, 1], &[2, 2]]); + let store = LayeredStore::new(vec![m0, m1]); + + let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]); + let expected = CountPartials::euclidean_dist_matrix(&mf); + let got = CountPartials::euclidean_dist_matrix(&store); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "euclidean [0,1]"); +} + +// ── CountPartials — partitioned (LayeredStore>) ─────────── + +#[test] +fn partitioned_bray_matches_combined() { + // partition 0: slots [1,2,3,4,5] col0 vs col1 + // partition 1: 
slots [10,20] col0 vs col1 + let (_d0, p0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let (_d1, p1) = make_int_matrix(&[&[10, 20], &[20, 10]]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + ]); + + let (_df, mf) = make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]); + let expected = CountPartials::bray_dist_matrix(&mf); + let got = CountPartials::bray_dist_matrix(&partitioned); + assert!((got[[0, 1]] - expected[[0, 1]]).abs() < 1e-12, "partitioned bray [0,1]"); +} + +#[test] +fn partitioned_threshold_jaccard_off_diagonal_is_pairwise() { + // 3 genomes, 2 partitions, 1 layer each — mirrors distance.rs's + // LayeredStore> shape. + // partition 0: col0=[3,0], col1=[0,3], col2=[3,3] + // partition 1: col0=[1,1], col1=[1,0], col2=[0,1] + let (_d0, p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]); + let (_d1, p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]); + + let partitioned = LayeredStore::new(vec![ + LayeredStore::new(vec![p0]), + LayeredStore::new(vec![p1]), + ]); + + let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]); + let threshold = 1u32; + let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold); + let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold); + + let n = 3; + for i in 0..n { + for j in 0..n { + assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]"); + assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]"); + } + } +} + +#[test] +fn partitioned_threshold_jaccard_packed_off_diagonal_is_pairwise() { + // Same as `partitioned_threshold_jaccard_off_diagonal_is_pairwise` but + // each partition matrix is packed into a single .pcmx file first — + // the on-disk format actually used in production after `pack_matrices`. 
+    use obicompactvec::pack_compact_int_matrix;
+
+    let (d0, _p0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
+    pack_compact_int_matrix(&d0.path().join("counts")).unwrap();
+    let p0 = PersistentCompactIntMatrix::open(d0.path()).unwrap();
+
+    let (d1, _p1) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
+    pack_compact_int_matrix(&d1.path().join("counts")).unwrap();
+    let p1 = PersistentCompactIntMatrix::open(d1.path()).unwrap();
+
+    let partitioned = LayeredStore::new(vec![
+        LayeredStore::new(vec![p0]),
+        LayeredStore::new(vec![p1]),
+    ]);
+
+    let (_df, mf) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 3, 1, 0], &[3, 3, 0, 1]]);
+    let threshold = 1u32;
+    let (inter_p, union_p) = CountPartials::partial_threshold_jaccard(&partitioned, threshold);
+    let (inter_f, union_f) = CountPartials::partial_threshold_jaccard(&mf, threshold);
+
+    let n = 3;
+    for i in 0..n {
+        for j in 0..n {
+            assert_eq!(inter_p[[i, j]], inter_f[[i, j]], "inter[{i},{j}]");
+            assert_eq!(union_p[[i, j]], union_f[[i, j]], "union[{i},{j}]");
+        }
+    }
+}
+
+#[test]
+fn partitioned_multilayer_threshold_jaccard_off_diagonal_is_pairwise() {
+    // 2 partitions x 2 layers — mirrors the shape of real production indexes,
+    // where a partition carries MPHF collision layers.
+    //   partition 0, layer 0: col0=[3,0], col1=[0,3], col2=[3,3]
+    //   partition 0, layer 1: col0=[2,0], col1=[0,0], col2=[2,0]
+    //   partition 1, layer 0: col0=[1,1], col1=[1,0], col2=[0,1]
+    //   partition 1, layer 1: col0=[0,5], col1=[5,5], col2=[0,0]
+    let (_dir_p0_l0, p0_l0) = make_int_matrix(&[&[3, 0], &[0, 3], &[3, 3]]);
+    let (_dir_p0_l1, p0_l1) = make_int_matrix(&[&[2, 0], &[0, 0], &[2, 0]]);
+    let (_dir_p1_l0, p1_l0) = make_int_matrix(&[&[1, 1], &[1, 0], &[0, 1]]);
+    let (_dir_p1_l1, p1_l1) = make_int_matrix(&[&[0, 5], &[5, 5], &[0, 0]]);
+
+    // Outer store = partitions, inner stores = layers of one partition.
+    let partition0 = LayeredStore::new(vec![p0_l0, p0_l1]);
+    let partition1 = LayeredStore::new(vec![p1_l0, p1_l1]);
+    let nested = LayeredStore::new(vec![partition0, partition1]);
+
+    // Reference: one matrix holding every layer's slots side by side.
+    let (_dir_flat, flat) = make_int_matrix(&[
+        &[3, 0, 2, 0, 1, 1, 0, 5],
+        &[0, 3, 0, 0, 1, 0, 5, 5],
+        &[3, 3, 2, 0, 0, 1, 0, 0],
+    ]);
+    let min_count = 1u32;
+    let (inter_got, union_got) = CountPartials::partial_threshold_jaccard(&nested, min_count);
+    let (inter_ref, union_ref) = CountPartials::partial_threshold_jaccard(&flat, min_count);
+
+    // Compare all 3 x 3 cells in row-major order, so the off-diagonal
+    // (pairwise) entries are checked as well as the diagonal.
+    let n = 3;
+    for (i, j) in (0..n).flat_map(|i| (0..n).map(move |j| (i, j))) {
+        assert_eq!(inter_got[[i, j]], inter_ref[[i, j]], "inter[{i},{j}]");
+        assert_eq!(union_got[[i, j]], union_ref[[i, j]], "union[{i},{j}]");
+    }
+}
+
+// ── BitPartials ───────────────────────────────────────────────────────────
+
+#[test]
+fn layered_jaccard_matches_combined() {
+    // Two layers in one store vs. one matrix with both layers' slots.
+    let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
+    let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[true, false]]);
+    let layered = LayeredStore::new(vec![layer0, layer1]);
+    let (_dir_flat, flat) = make_bit_matrix(&[
+        &[true, false, true, true],
+        &[false, true, true, false],
+    ]);
+    let reference = BitPartials::jaccard_dist_matrix(&flat);
+    let observed = BitPartials::jaccard_dist_matrix(&layered);
+    assert!((observed[[0, 1]] - reference[[0, 1]]).abs() < 1e-12, "jaccard [0,1]");
+}
+
+#[test]
+fn layered_hamming_matches_combined() {
+    // Two layers in one store vs. one matrix with both layers' slots.
+    let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
+    let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[false, false]]);
+    let layered = LayeredStore::new(vec![layer0, layer1]);
+    let (_dir_flat, flat) = make_bit_matrix(&[
+        &[true, false, true, true],
+        &[false, true, false, false],
+    ]);
+    let reference = BitPartials::hamming_dist_matrix(&flat);
+    let observed = BitPartials::hamming_dist_matrix(&layered);
+    assert_eq!(observed[[0, 1]], reference[[0, 1]], "hamming [0,1]");
+}
+
+#[test]
+fn partitioned_bit_jaccard_off_diagonal_is_pairwise() {
+    // Presence/bit counterpart of the count-based
+    // `partitioned_multilayer_threshold_jaccard_*` tests (`with_counts = false`,
+    // the path `all_specifics` uses in production).
+    // 4 genomes, 3 partitions; only the last partition has 2 layers.
+    let (_dir0, part0) = make_bit_matrix(&[
+        &[true, false, true],
+        &[false, true, true],
+        &[true, true, false],
+        &[false, false, true],
+    ]);
+    let (_dir1, part1) = make_bit_matrix(&[
+        &[true, true],
+        &[false, true],
+        &[true, false],
+        &[true, true],
+    ]);
+    let (_dir2_l0, part2_l0) = make_bit_matrix(&[
+        &[false, true],
+        &[true, true],
+        &[false, false],
+        &[true, false],
+    ]);
+    let (_dir2_l1, part2_l1) = make_bit_matrix(&[
+        &[true],
+        &[false],
+        &[true],
+        &[true],
+    ]);
+
+    // Single-layer partitions 0 and 1, then the two-layer partition 2.
+    let partition0 = LayeredStore::new(vec![part0]);
+    let partition1 = LayeredStore::new(vec![part1]);
+    let partition2 = LayeredStore::new(vec![part2_l0, part2_l1]);
+    let nested = LayeredStore::new(vec![partition0, partition1, partition2]);
+
+    // Reference: the slots of every partition/layer laid out side by side.
+    let (_dir_flat, flat) = make_bit_matrix(&[
+        &[true, false, true, true, true, false, true, true],
+        &[false, true, true, false, true, true, true, false],
+        &[true, true, false, true, false, false, false, true],
+        &[false, false, true, true, true, true, false, true],
+    ]);
+
+    let (inter_got, union_got) = BitPartials::partial_jaccard(&nested);
+    let (inter_ref, union_ref) = BitPartials::partial_jaccard(&flat);
+
+    // Compare all 4 x 4 cells in row-major order, so the off-diagonal
+    // (pairwise) entries are checked as well as the diagonal.
+    let n = 4;
+    for (i, j) in (0..n).flat_map(|i| (0..n).map(move |j| (i, j))) {
+        assert_eq!(inter_got[[i, j]], inter_ref[[i, j]], "inter[{i},{j}]");
+        assert_eq!(union_got[[i, j]], union_ref[[i, j]], "union[{i},{j}]");
+    }
+}
+
+#[test]
+fn partitioned_bit_jaccard_packed_off_diagonal_is_pairwise() {
+    // Mirror of `partitioned_bit_jaccard_off_diagonal_is_pairwise`, except each
+    // presence matrix is first packed into a single .pbmx file (the on-disk
+    // format production uses after `pack_matrices`) and then reopened.
+    use obicompactvec::pack_bit_matrix;
+
+    let (dir0, _raw0) = make_bit_matrix(&[
+        &[true, false, true],
+        &[false, true, true],
+        &[true, true, false],
+        &[false, false, true],
+    ]);
+    pack_bit_matrix(&dir0.path().join("presence")).unwrap();
+    let packed0 = PersistentBitMatrix::open(dir0.path()).unwrap();
+
+    let (dir1, _raw1) = make_bit_matrix(&[
+        &[true, true],
+        &[false, true],
+        &[true, false],
+        &[true, true],
+    ]);
+    pack_bit_matrix(&dir1.path().join("presence")).unwrap();
+    let packed1 = PersistentBitMatrix::open(dir1.path()).unwrap();
+
+    let (dir2_l0, _raw2_l0) = make_bit_matrix(&[
+        &[false, true],
+        &[true, true],
+        &[false, false],
+        &[true, false],
+    ]);
+    pack_bit_matrix(&dir2_l0.path().join("presence")).unwrap();
+    let packed2_l0 = PersistentBitMatrix::open(dir2_l0.path()).unwrap();
+
+    let (dir2_l1, _raw2_l1) = make_bit_matrix(&[
+        &[true],
+        &[false],
+        &[true],
+        &[true],
+    ]);
+    pack_bit_matrix(&dir2_l1.path().join("presence")).unwrap();
+    let packed2_l1 = PersistentBitMatrix::open(dir2_l1.path()).unwrap();
+
+    // Single-layer partitions 0 and 1, then the two-layer partition 2.
+    let partition0 = LayeredStore::new(vec![packed0]);
+    let partition1 = LayeredStore::new(vec![packed1]);
+    let partition2 = LayeredStore::new(vec![packed2_l0, packed2_l1]);
+    let nested = LayeredStore::new(vec![partition0, partition1, partition2]);
+
+    let (_dir_flat, flat) = make_bit_matrix(&[
+        &[true, false, true, true, true, false, true, true],
+        &[false, true, true, false, true, true, true, false],
+        &[true, true, false, true, false, false, false, true],
+        &[false, false, true, true, true, true, false, true],
+    ]);
+
+    let (inter_got, union_got) = BitPartials::partial_jaccard(&nested);
+    let (inter_ref, union_ref) = BitPartials::partial_jaccard(&flat);
+
+    // Compare all 4 x 4 cells in row-major order, so the off-diagonal
+    // (pairwise) entries are checked as well as the diagonal.
+    let n = 4;
+    for (i, j) in (0..n).flat_map(|i| (0..n).map(move |j| (i, j))) {
+        assert_eq!(inter_got[[i, j]], inter_ref[[i, j]], "inter[{i},{j}]");
+        assert_eq!(union_got[[i, j]], union_ref[[i, j]], "union[{i},{j}]");
+    }
+}