2026-04-16 22:38:20 +02:00
<!doctype html>
< html lang = "en" class = "no-js" >
< head >
< meta charset = "utf-8" >
< meta name = "viewport" content = "width=device-width,initial-scale=1" >
2026-04-29 22:52:42 +02:00
< link rel = "prev" href = "../obipipeline/" >
2026-04-16 22:38:20 +02:00
< link rel = "next" href = "../mphf/" >
< link rel = "icon" href = "../../assets/images/favicon.png" >
< meta name = "generator" content = "mkdocs-1.6.1, mkdocs-material-9.7.6" >
< title > On-disk storage - obikmer< / title >
< link rel = "stylesheet" href = "../../assets/stylesheets/main.484c7ddc.min.css" >
< link rel = "preconnect" href = "https://fonts.gstatic.com" crossorigin >
< link rel = "stylesheet" href = "https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback" >
< style > : root { --md-text-font : "Roboto" ; --md-code-font : "Roboto Mono" } < / style >
<!--
  Storage helpers, defined inline in the head before the body markup:
  - the scope is the URL "../.." resolved against the current location;
  - the hash helper folds the character codes of a string into a single integer using
    (acc << 5) - acc + code, starting from 0;
  - the getter JSON-parses the item stored under scope pathname + "." + key
    (localStorage and the scope above are the default arguments);
  - the setter JSON-stringifies a value under the same key and silently ignores any
    exception thrown while storing it.
-->
< script > _ _md _scope = new URL ( "../.." , location ) , _ _md _hash = e => [ ... e ] . reduce ( ( ( e , _ ) => ( e << 5 ) - e + _ . charCodeAt ( 0 ) ) , 0 ) , _ _md _get = ( e , _ = localStorage , t = _ _md _scope ) => JSON . parse ( _ . getItem ( t . pathname + "." + e ) ) , _ _md _set = ( e , _ , t = localStorage , a = _ _md _scope ) => { try { t . setItem ( a . pathname + "." + e , JSON . stringify ( _ ) ) } catch ( e ) { } } < / script >
< / head >
< body dir = "ltr" >
< input class = "md-toggle" data-md-toggle = "drawer" type = "checkbox" id = "__drawer" autocomplete = "off" >
< input class = "md-toggle" data-md-toggle = "search" type = "checkbox" id = "__search" autocomplete = "off" >
< label class = "md-overlay" for = "__drawer" > < / label >
< div data-md-component = "skip" >
2026-06-04 21:27:01 +02:00
< a href = "#on-disk-index-layout" class = "md-skip" >
2026-04-16 22:38:20 +02:00
Skip to content
< / a >
< / div >
< div data-md-component = "announce" >
< / div >
< header class = "md-header md-header--shadow" data-md-component = "header" >
< nav class = "md-header__inner md-grid" aria-label = "Header" >
< a href = "../.." title = "obikmer" class = "md-header__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
< label class = "md-header__button md-icon" for = "__drawer" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z" / > < / svg >
< / label >
< div class = "md-header__title" data-md-component = "header-title" >
< div class = "md-header__ellipsis" >
< div class = "md-header__topic" >
< span class = "md-ellipsis" >
obikmer
< / span >
< / div >
< div class = "md-header__topic" data-md-component = "header-topic" >
< span class = "md-ellipsis" >
On-disk storage
< / span >
< / div >
< / div >
< / div >
<!--
  Restores the stored colour palette. Reads the "__palette" entry through the storage
  getter and, when it carries a color object, copies each of its key/value pairs onto
  the body element as data-md-color-KEY attributes.
  If the stored media value is the generic "(prefers-color-scheme)", the light or dark
  palette input is looked up first (chosen by the matchMedia result for
  "(prefers-color-scheme: light)") and its media, scheme, primary and accent attributes
  overwrite the stored values before they are applied.
  NOTE(review): the querySelector result is dereferenced without a null check, so this
  branch presumably relies on palette inputs being present in the page; none are visible
  in this file - confirm against the theme configuration.
-->
< script > var palette = _ _md _get ( "__palette" ) ; if ( palette && palette . color ) { if ( "(prefers-color-scheme)" === palette . color . media ) { var media = matchMedia ( "(prefers-color-scheme: light)" ) , input = document . querySelector ( media . matches ? "[data-md-color-media='(prefers-color-scheme: light)']" : "[data-md-color-media='(prefers-color-scheme: dark)']" ) ; palette . color . media = input . getAttribute ( "data-md-color-media" ) , palette . color . scheme = input . getAttribute ( "data-md-color-scheme" ) , palette . color . primary = input . getAttribute ( "data-md-color-primary" ) , palette . color . accent = input . getAttribute ( "data-md-color-accent" ) } for ( var [ key , value ] of Object . entries ( palette . color ) ) document . body . setAttribute ( "data-md-color-" + key , value ) } < / script >
< / nav >
< / header >
< div class = "md-container" data-md-component = "container" >
< main class = "md-main" data-md-component = "main" >
< div class = "md-main__inner md-grid" >
< div class = "md-sidebar md-sidebar--primary" data-md-component = "sidebar" data-md-type = "navigation" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--primary" aria-label = "Navigation" data-md-level = "0" >
< label class = "md-nav__title" for = "__drawer" >
< a href = "../.." title = "obikmer" class = "md-nav__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
obikmer
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "../.." class = "md-nav__link" >
< span class = "md-ellipsis" >
Home
< / span >
< / a >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_2" >
< label class = "md-nav__link" for = "__nav_2" id = "__nav_2_label" tabindex = "0" >
< span class = "md-ellipsis" >
Theory
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_2_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_2" >
< span class = "md-nav__icon md-icon" > < / span >
Theory
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../../kmers/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Kmers and super-kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../../theory/encoding/" class = "md-nav__link" >
< span class = "md-ellipsis" >
DNA encoding
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../../theory/entropy/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Entropy filter
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../../theory/minimizer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer selection
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "../../theory/indexing/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Partitioning architecture
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--active md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_3" checked >
< label class = "md-nav__link" for = "__nav_3" id = "__nav_3_label" tabindex = "0" >
< span class = "md-ellipsis" >
Implementation
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_3_label" aria-expanded = "true" >
< label class = "md-nav__title" for = "__nav_3" >
< span class = "md-nav__icon md-icon" > < / span >
Implementation
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "../superkmer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
SuperKmer
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../kmer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../chunkreader/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Chunk reader
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../pipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Construction pipeline
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../obipipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obipipeline library
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item md-nav__item--active" >
< input class = "md-nav__toggle md-toggle" type = "checkbox" id = "__toc" >
2026-06-04 21:27:01 +02:00
< label class = "md-nav__link md-nav__link--active" for = "__toc" >
< span class = "md-ellipsis" >
On-disk storage
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
2026-04-16 22:38:20 +02:00
< a href = "./" class = "md-nav__link md-nav__link--active" >
< span class = "md-ellipsis" >
On-disk storage
< / span >
< / a >
2026-06-04 21:27:01 +02:00
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#directory-tree" class = "md-nav__link" >
< span class = "md-ellipsis" >
Directory tree
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#state-machine-sentinels" class = "md-nav__link" >
< span class = "md-ellipsis" >
State machine (sentinels)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#indexmeta-indexmeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
index.meta (IndexMeta)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#layer-files" class = "md-nav__link" >
< span class = "md-ellipsis" >
Layer files
< / span >
< / a >
< nav class = "md-nav" aria-label = "Layer files" >
< ul class = "md-nav__list" >
< li class = "md-nav__item" >
< a href = "#unitigsbin" class = "md-nav__link" >
< span class = "md-ellipsis" >
unitigs.bin
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#unitigsbinidx-exact-only" class = "md-nav__link" >
< span class = "md-ellipsis" >
unitigs.bin.idx (Exact only)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#mphfbin" class = "md-nav__link" >
< span class = "md-ellipsis" >
mphf.bin
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#layer_metajson-layermeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
layer_meta.json (LayerMeta)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#evidencebin-exact" class = "md-nav__link" >
< span class = "md-ellipsis" >
evidence.bin (Exact)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#fingerprintbin-approx" class = "md-nav__link" >
< span class = "md-ellipsis" >
fingerprint.bin (Approx)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#counts-persistentcompactintmatrix" class = "md-nav__link" >
< span class = "md-ellipsis" >
counts/ (PersistentCompactIntMatrix)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#presence-persistentbitmatrix" class = "md-nav__link" >
< span class = "md-ellipsis" >
presence/ (PersistentBitMatrix)
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item" >
< a href = "#metajson-partitionmeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
meta.json (PartitionMeta)
< / span >
< / a >
< / li >
< / ul >
< / nav >
2026-04-16 22:38:20 +02:00
< / li >
< li class = "md-nav__item" >
< a href = "../mphf/" class = "md-nav__link" >
< span class = "md-ellipsis" >
MPHF selection
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../unitig_evidence/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Unitig evidence encoding
< / span >
< / a >
< / li >
2026-05-15 21:07:23 +08:00
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "../evidence_elimination/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Evidence elimination (discussion)
< / span >
< / a >
< / li >
2026-05-15 21:07:23 +08:00
< li class = "md-nav__item" >
< a href = "../obilayeredmap/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obilayeredmap crate
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../persistent_compact_int_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentCompactIntVec
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../persistent_bit_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentBitVec
< / span >
< / a >
< / li >
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "../merge/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Merge command
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../rebuild_filter/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer filtering (rebuild/dump/unitig)
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_4" >
< label class = "md-nav__link" for = "__nav_4" id = "__nav_4_label" tabindex = "0" >
< span class = "md-ellipsis" >
Architecture
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_4_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_4" >
< span class = "md-nav__icon md-icon" > < / span >
Architecture
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "../../architecture/sequences/invariant/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Sequences
2026-05-15 21:07:23 +08:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../../architecture/index_architecture/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer index
2026-04-16 22:38:20 +02:00
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-sidebar md-sidebar--secondary" data-md-component = "sidebar" data-md-type = "toc" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
2026-06-04 21:27:01 +02:00
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#directory-tree" class = "md-nav__link" >
< span class = "md-ellipsis" >
Directory tree
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#state-machine-sentinels" class = "md-nav__link" >
< span class = "md-ellipsis" >
State machine (sentinels)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#indexmeta-indexmeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
index.meta (IndexMeta)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#layer-files" class = "md-nav__link" >
< span class = "md-ellipsis" >
Layer files
< / span >
< / a >
< nav class = "md-nav" aria-label = "Layer files" >
< ul class = "md-nav__list" >
< li class = "md-nav__item" >
< a href = "#unitigsbin" class = "md-nav__link" >
< span class = "md-ellipsis" >
unitigs.bin
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#unitigsbinidx-exact-only" class = "md-nav__link" >
< span class = "md-ellipsis" >
unitigs.bin.idx (Exact only)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#mphfbin" class = "md-nav__link" >
< span class = "md-ellipsis" >
mphf.bin
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#layer_metajson-layermeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
layer_meta.json (LayerMeta)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#evidencebin-exact" class = "md-nav__link" >
< span class = "md-ellipsis" >
evidence.bin (Exact)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#fingerprintbin-approx" class = "md-nav__link" >
< span class = "md-ellipsis" >
fingerprint.bin (Approx)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#counts-persistentcompactintmatrix" class = "md-nav__link" >
< span class = "md-ellipsis" >
counts/ (PersistentCompactIntMatrix)
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#presence-persistentbitmatrix" class = "md-nav__link" >
< span class = "md-ellipsis" >
presence/ (PersistentBitMatrix)
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item" >
< a href = "#metajson-partitionmeta" class = "md-nav__link" >
< span class = "md-ellipsis" >
meta.json (PartitionMeta)
< / span >
< / a >
< / li >
< / ul >
2026-04-16 22:38:20 +02:00
< / nav >
< / div >
< / div >
< / div >
< div class = "md-content" data-md-component = "content" >
< article class = "md-content__inner md-typeset" >
2026-06-04 21:27:01 +02:00
< h1 id = "on-disk-index-layout" > On-disk index layout< / h1 >
< h2 id = "directory-tree" > Directory tree< / h2 >
< div class = "highlight" > < pre > < span > < / span > < code > < index_root> /
index.meta ← JSON: IndexMeta
scatter.done ← sentinel: scatter phase complete
count.done ← sentinel: dereplicate + count complete
index.done ← sentinel: MPHF index fully built
spectrums/
< label> .json ← kmer frequency spectrum per genome
partitions/
part_00000/ ← one dir per partition (zero-padded 5 digits, 0..2^n_bits − 1)
index/
meta.json ← PartitionMeta { n_layers }
layer_0/
unitigs.bin ← binary unitig sequences (2-bit packed)
unitigs.bin.idx ← block-sampled offset index (exact evidence only)
mphf.bin ← serialised PtrHash MPHF
layer_meta.json ← LayerMeta { evidence: EvidenceKind }
evidence.bin ← chunk_id:rank per MPHF slot (Exact only)
fingerprint.bin ← b-bit fingerprints per MPHF slot (Approx only)
counts/ ← PersistentCompactIntMatrix (if with_counts=true)
presence/ ← PersistentBitMatrix (if presence mode, merge)
layer_1/ ← added by merge; same structure as layer_0
layer_2/ …
part_00001/ …
< / code > < / pre > < / div >
< h2 id = "state-machine-sentinels" > State machine (sentinels)< / h2 >
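< p > Each sentinel is touched atomically at the end of its pipeline stage.
An interrupted stage (e.g. scatter interrupted) leaves no sentinel of its own, so the
state is detected as the most advanced sentinel present (< code > index.done< / code > , then
< code > count.done< / code > , then < code > scatter.done< / code > ); with no sentinel at all the
state is < code > Empty< / code > .< / p >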
< table >
< thead >
< tr >
< th > State< / th >
< th > Sentinel present< / th >
< th > Meaning< / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > < code > Empty< / code > < / td >
< td > —< / td >
< td > < code > index.meta< / code > exists; scatter not started or interrupted< / td >
< / tr >
< tr >
< td > < code > Scattered< / code > < / td >
< td > < code > scatter.done< / code > < / td >
< td > All super-kmers routed to partition files< / td >
< / tr >
< tr >
< td > < code > Counted< / code > < / td >
< td > < code > count.done< / code > < / td >
< td > Partitions dereplicated; < code > spectrums/< / code > written< / td >
< / tr >
< tr >
< td > < code > Indexed< / code > < / td >
< td > < code > index.done< / code > < / td >
< td > All MPHF layers built; index ready for queries< / td >
< / tr >
< / tbody >
< / table >
< h2 id = "indexmeta-indexmeta" > index.meta (IndexMeta)< / h2 >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "p" > {< / span >
< span class = "w" > < / span > < span class = "nt" > " version" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 1< / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " config" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "p" > {< / span >
< span class = "w" > < / span > < span class = "nt" > " kmer_size" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 31< / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " minimizer_size" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 11< / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " n_bits" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " with_counts" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kc" > false< / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " evidence" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "s2" > " Exact" < / span > < span class = "p" > ,< / span >
< span class = "w" > < / span > < span class = "nt" > " block_bits" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 0< / span >
< span class = "w" > < / span > < span class = "p" > },< / span >
< span class = "w" > < / span > < span class = "nt" > " genomes" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "p" > [< / span >
< span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " label" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "s2" > " genome_A" < / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "nt" > " meta" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " species" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "s2" > " Homo sapiens" < / span > < span class = "w" > < / span > < span class = "p" > }< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "w" > < / span > < span class = "p" > ]< / span >
< span class = "p" > }< / span >
< / code > < / pre > < / div >
< p > < code > n_bits< / code > determines the partition count: < code > 2^n_bits< / code > directories under < code > partitions/< / code > .< / p >
< p > < code > evidence< / code > is either the string < code > "Exact"< / code > or < code > {"Approx": {"b": 8, "z": 1}}< / code > .< / p >
< p > < code > block_bits< / code > controls the < code > .idx< / code > granularity: one offset entry every < code > 2^block_bits< / code >
chunks. < code > block_bits=0< / code > stores one entry per chunk (O(1) random access, largest < code > .idx< / code > ).< / p >
< p > < code > GenomeInfo.meta< / code > is a free-form string→string map for categorical metadata (e.g.
taxonomy, sample origin). It is optional; defaults to empty.< / p >
< h2 id = "layer-files" > Layer files< / h2 >
< h3 id = "unitigsbin" > unitigs.bin< / h3 >
< p > 2-bit packed binary unitig sequences. Each record: 1 byte < code > seql_minus_k< / code >
(nucleotide length − k), followed by < code > ceil((seql_minus_k + k) / 4)< / code > bytes of
packed sequence. Long unitigs are transparently split into overlapping chunks
(k − 1 nucleotide overlap) so every k-mer lies entirely within a single chunk.< / p >
< h3 id = "unitigsbinidx-exact-only" > unitigs.bin.idx (Exact only)< / h3 >
< p > Magic < code > UIX3< / code > , little-endian header: < code > block_bits< / code > (u32), < code > n_unitigs< / code > (u32),
< code > n_kmers< / code > (u64), then < code > ceil(n_unitigs / 2^block_bits) + 1< / code > byte-offset entries
(u32 each, last entry is a sentinel past-end offset). Absent for Approx layers.< / p >
< h3 id = "mphfbin" > mphf.bin< / h3 >
< p > PtrHash MPHF serialised with epserde. Maps canonical kmer (u64, left-aligned
2-bit) to a slot index in < code > [0, n_kmers)< / code > .< / p >
< h3 id = "layer_metajson-layermeta" > layer_meta.json (LayerMeta)< / h3 >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " evidence" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " type" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "s2" > " exact" < / span > < span class = "w" > < / span > < span class = "p" > }< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< / code > < / pre > < / div >
< p > or< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " evidence" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " type" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "s2" > " approx" < / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "nt" > " b" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "nt" > " z" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 1< / span > < span class = "w" > < / span > < span class = "p" > }< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< / code > < / pre > < / div >
< h3 id = "evidencebin-exact" > evidence.bin (Exact)< / h3 >
< p > One < code > (chunk_id: u32, rank: u8)< / code > record per MPHF slot, packed. Used to verify
that the kmer mapped to a slot is actually present: < code > unitigs.bin[chunk_id][rank]< / code >
is re-read and compared against the query.< / p >
< h3 id = "fingerprintbin-approx" > fingerprint.bin (Approx)< / h3 >
< p > < code > b< / code > -bit fingerprint per MPHF slot derived from the kmer's sequence hash.
False-positive rate per query ≈ < code > 1/2^b< / code > . With Findere parameter < code > z ≥ 2< / code > ,
< code > z< / code > consecutive k-mers must all match, reducing the effective FP rate to
approximately < code > W / 2^(b·z)< / code > per read of length < code > L< / code >
(where < code > W = L − k − z + 2< / code > ).< / p >
< h3 id = "counts-persistentcompactintmatrix" > counts/ (PersistentCompactIntMatrix)< / h3 >
< p > Present when < code > with_counts=true< / code > . One column per genome; each row holds the
per-genome k-mer count for the corresponding MPHF slot. Appended column-by-column
during indexing and merge.< / p >
< h3 id = "presence-persistentbitmatrix" > presence/ (PersistentBitMatrix)< / h3 >
< p > Present when the layer was built in presence/absence mode (merge path).
One bit per genome per MPHF slot. Written during merge; never present on a
freshly indexed single-genome layer.< / p >
< h2 id = "metajson-partitionmeta" > meta.json (PartitionMeta)< / h2 >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "nt" > " n_layers" < / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "mi" > 2< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< / code > < / pre > < / div >
< p > Records how many < code > layer_N/< / code > directories exist under < code > index/< / code > . Incremented by
each merge that adds a layer.< / p >
2026-04-16 22:38:20 +02:00
< / article >
< / div >
<!--
  Runs inline as the page is parsed: looks up the element whose id equals the URL
  fragment (location.hash without its leading "#"). When such an element exists and has
  a name, its checked property is set to whether that name starts with "__tabbed_";
  presumably this pre-selects a content tab targeted by the fragment - confirm against
  the theme.
-->
< script > var target = document . getElementById ( location . hash . slice ( 1 ) ) ; target && target . name && ( target . checked = target . name . startsWith ( "__tabbed_" ) ) < / script >
< / div >
< / main >
< footer class = "md-footer" >
< div class = "md-footer-meta md-typeset" >
< div class = "md-footer-meta__inner md-grid" >
< div class = "md-copyright" >
Made with
< a href = "https://squidfunk.github.io/mkdocs-material/" target = "_blank" rel = "noopener" >
Material for MkDocs
< / a >
< / div >
< / div >
< / div >
< / footer >
< / div >
< div class = "md-dialog" data-md-component = "dialog" >
< div class = "md-dialog__inner md-typeset" > < / div >
< / div >
< script id = "__config" type = "application/json" > { "annotate" : null , "base" : "../.." , "features" : [ ] , "search" : "../../assets/javascripts/workers/search.2c215733.min.js" , "tags" : null , "translations" : { "clipboard.copied" : "Copied to clipboard" , "clipboard.copy" : "Copy to clipboard" , "search.result.more.one" : "1 more on this page" , "search.result.more.other" : "# more on this page" , "search.result.none" : "No matching documents" , "search.result.one" : "1 matching document" , "search.result.other" : "# matching documents" , "search.result.placeholder" : "Type to start searching" , "search.result.term.missing" : "Missing" , "select.version" : "Select version" } , "version" : null } < / script >
< script src = "../../assets/javascripts/bundle.79ae519e.min.js" > < / script >
< script src = "https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js" > < / script >
< / body >
< / html >