<!doctype html>
< html lang = "en" class = "no-js" >
< head >
< meta charset = "utf-8" >
< meta name = "viewport" content = "width=device-width,initial-scale=1" >
<link rel="next" href="kmers/">
< link rel = "icon" href = "assets/images/favicon.png" >
< meta name = "generator" content = "mkdocs-1.6.1, mkdocs-material-9.7.6" >
< title > obikmer</ title >
< link rel = "stylesheet" href = "assets/stylesheets/main.484c7ddc.min.css" >
< link rel = "preconnect" href = "https://fonts.gstatic.com" crossorigin >
< link rel = "stylesheet" href = "https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback" >
< style >: root { --md-text-font : "Roboto" ; --md-code-font : "Roboto Mono" }</ style >
<script>
  // Site scope: "." resolved against the current page location.
  __md_scope = new URL(".", location);

  // Fold a string into a number, one iterated character at a time:
  // acc = (acc << 5) - acc + (char code of the character), starting from 0.
  __md_hash = text => {
    let acc = 0;
    for (const ch of text) {
      acc = (acc << 5) - acc + ch.charCodeAt(0);
    }
    return acc;
  };

  // Read and JSON-decode the value stored under "<scope pathname>.<key>".
  // Storage and scope default to localStorage and __md_scope.
  __md_get = (key, storage = localStorage, scope = __md_scope) => {
    const raw = storage.getItem(scope.pathname + "." + key);
    return JSON.parse(raw);
  };

  // JSON-encode a value and store it under "<scope pathname>.<key>".
  // Any error raised while encoding or storing is swallowed.
  __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
    try {
      storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
    } catch (err) {
      // Intentionally empty: a failed write is ignored.
    }
  };
</script>
</ head >
< body dir = "ltr" >
< input class = "md-toggle" data-md-toggle = "drawer" type = "checkbox" id = "__drawer" autocomplete = "off" >
< input class = "md-toggle" data-md-toggle = "search" type = "checkbox" id = "__search" autocomplete = "off" >
< label class = "md-overlay" for = "__drawer" ></ label >
< div data-md-component = "skip" >
< a href = "#obikmer" class = "md-skip" >
Skip to content
</ a >
</ div >
< div data-md-component = "announce" >
</ div >
< header class = "md-header md-header--shadow" data-md-component = "header" >
< nav class = "md-header__inner md-grid" aria-label = "Header" >
< a href = "." title = "obikmer" class = "md-header__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" >< path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" /></ svg >
</ a >
< label class = "md-header__button md-icon" for = "__drawer" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" >< path d = "M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z" /></ svg >
</ label >
< div class = "md-header__title" data-md-component = "header-title" >
< div class = "md-header__ellipsis" >
< div class = "md-header__topic" >
< span class = "md-ellipsis" >
obikmer
</ span >
</ div >
< div class = "md-header__topic" data-md-component = "header-topic" >
< span class = "md-ellipsis" >
Home
</ span >
</ div >
</ div >
</ div >
<script>
  // Restore the stored colour palette and mirror it onto <body>.
  var palette = __md_get("__palette");
  if (palette && palette.color) {
    // A stored media value of "(prefers-color-scheme)" means the concrete
    // values are taken from the input matching the current light/dark state.
    if (palette.color.media === "(prefers-color-scheme)") {
      var media = matchMedia("(prefers-color-scheme: light)");
      var input = document.querySelector(
        media.matches
          ? "[data-md-color-media='(prefers-color-scheme: light)']"
          : "[data-md-color-media='(prefers-color-scheme: dark)']"
      );
      palette.color.media = input.getAttribute("data-md-color-media");
      palette.color.scheme = input.getAttribute("data-md-color-scheme");
      palette.color.primary = input.getAttribute("data-md-color-primary");
      palette.color.accent = input.getAttribute("data-md-color-accent");
    }
    // Copy every palette.color entry to a data-md-color-<key> attribute on <body>.
    for (var [key, value] of Object.entries(palette.color)) {
      document.body.setAttribute("data-md-color-" + key, value);
    }
  }
</script>
</ nav >
</ header >
< div class = "md-container" data-md-component = "container" >
< main class = "md-main" data-md-component = "main" >
< div class = "md-main__inner md-grid" >
< div class = "md-sidebar md-sidebar--primary" data-md-component = "sidebar" data-md-type = "navigation" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--primary" aria-label = "Navigation" data-md-level = "0" >
< label class = "md-nav__title" for = "__drawer" >
< a href = "." title = "obikmer" class = "md-nav__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" >< path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" /></ svg >
</ a >
obikmer
</ label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item md-nav__item--active" >
< input class = "md-nav__toggle md-toggle" type = "checkbox" id = "__toc" >
< label class = "md-nav__link md-nav__link--active" for = "__toc" >
< span class = "md-ellipsis" >
Home
</ span >
< span class = "md-nav__icon md-icon" ></ span >
</ label >
< a href = "." class = "md-nav__link md-nav__link--active" >
< span class = "md-ellipsis" >
Home
</ span >
</ a >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" ></ span >
Table of contents
</ label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "#subcommands" class = "md-nav__link" >
< span class = "md-ellipsis" >
Subcommands
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "#constraints" class = "md-nav__link" >
< span class = "md-ellipsis" >
Constraints
</ span >
</ a >
2026-06-04 21:27:01 +02:00
</ li >
< li class = "md-nav__item" >
< a href = "#parameter-constraints-enforced-at-cli" class = "md-nav__link" >
< span class = "md-ellipsis" >
Parameter constraints (enforced at CLI)
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "#genome-label-constraints" class = "md-nav__link" >
< span class = "md-ellipsis" >
Genome label constraints
</ span >
</ a >
2026-04-16 22:38:20 +02:00
</ li >
< li class = "md-nav__item" >
< a href = "#priority-operations" class = "md-nav__link" >
< span class = "md-ellipsis" >
Priority operations
</ span >
</ a >
</ li >
</ ul >
</ nav >
</ li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_2" >
< label class = "md-nav__link" for = "__nav_2" id = "__nav_2_label" tabindex = "0" >
< span class = "md-ellipsis" >
Theory
</ span >
< span class = "md-nav__icon md-icon" ></ span >
</ label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_2_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_2" >
< span class = "md-nav__icon md-icon" ></ span >
Theory
</ label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "kmers/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Kmers and super-kmers
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "theory/encoding/" class = "md-nav__link" >
< span class = "md-ellipsis" >
DNA encoding
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "theory/entropy/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Entropy filter
</ span >
</ a >
</ li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "theory/minimizer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer selection
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "theory/indexing/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Partitioning architecture
</ span >
</ a >
</ li >
</ ul >
</ nav >
</ li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_3" >
< label class = "md-nav__link" for = "__nav_3" id = "__nav_3_label" tabindex = "0" >
< span class = "md-ellipsis" >
Implementation
</ span >
< span class = "md-nav__icon md-icon" ></ span >
</ label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_3_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_3" >
< span class = "md-nav__icon md-icon" ></ span >
Implementation
</ label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "implementation/superkmer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
SuperKmer
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/kmer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/chunkreader/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Chunk reader
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/pipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Construction pipeline
</ span >
</ a >
</ li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "implementation/obipipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obipipeline library
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "implementation/storage/" class = "md-nav__link" >
< span class = "md-ellipsis" >
On-disk storage
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/mphf/" class = "md-nav__link" >
< span class = "md-ellipsis" >
MPHF selection
</ span >
</ a >
</ li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "implementation/unitig_evidence/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Unitig evidence encoding
</ span >
</ a >
</ li >
2026-05-15 21:07:23 +08:00
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "implementation/evidence_elimination/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Evidence elimination (discussion)
</ span >
</ a >
</ li >
2026-05-15 21:07:23 +08:00
< li class = "md-nav__item" >
< a href = "implementation/obilayeredmap/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obilayeredmap crate
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/persistent_compact_int_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentCompactIntVec
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/persistent_bit_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentBitVec
</ span >
</ a >
</ li >
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "implementation/merge/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Merge command
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "implementation/rebuild_filter/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer filtering (rebuild/dump/unitig)
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
</ ul >
</ nav >
</ li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_4" >
< label class = "md-nav__link" for = "__nav_4" id = "__nav_4_label" tabindex = "0" >
< span class = "md-ellipsis" >
Architecture
</ span >
< span class = "md-nav__icon md-icon" ></ span >
</ label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_4_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_4" >
< span class = "md-nav__icon md-icon" ></ span >
Architecture
</ label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "architecture/sequences/invariant/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Sequences
</ span >
</ a >
</ li >
2026-05-15 21:07:23 +08:00
< li class = "md-nav__item" >
< a href = "architecture/index_architecture/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer index
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
</ ul >
</ nav >
</ li >
</ ul >
</ nav >
</ div >
</ div >
</ div >
< div class = "md-sidebar md-sidebar--secondary" data-md-component = "sidebar" data-md-type = "toc" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" ></ span >
Table of contents
</ label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "#subcommands" class = "md-nav__link" >
< span class = "md-ellipsis" >
Subcommands
</ span >
</ a >
</ li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "#constraints" class = "md-nav__link" >
< span class = "md-ellipsis" >
Constraints
</ span >
</ a >
2026-06-04 21:27:01 +02:00
</ li >
< li class = "md-nav__item" >
< a href = "#parameter-constraints-enforced-at-cli" class = "md-nav__link" >
< span class = "md-ellipsis" >
Parameter constraints (enforced at CLI)
</ span >
</ a >
</ li >
< li class = "md-nav__item" >
< a href = "#genome-label-constraints" class = "md-nav__link" >
< span class = "md-ellipsis" >
Genome label constraints
</ span >
</ a >
2026-04-16 22:38:20 +02:00
</ li >
< li class = "md-nav__item" >
< a href = "#priority-operations" class = "md-nav__link" >
< span class = "md-ellipsis" >
Priority operations
</ span >
</ a >
</ li >
</ ul >
</ nav >
</ div >
</ div >
</ div >
< div class = "md-content" data-md-component = "content" >
< article class = "md-content__inner md-typeset" >
< h1 id = "obikmer" > obikmer</ h1 >
< p >< code > obikmer</ code > is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.</ p >
<h2 id="subcommands">Subcommands</h2>
< table >
< thead >
< tr >
< th > Subcommand</ th >
< th > Purpose</ th >
</ tr >
</ thead >
< tbody >
< tr >
< td >< code > superkmer</ code ></ td >
< td > Extract super-kmers from a sequence file and write to stdout</ td >
</ tr >
< tr >
< td >< code > index</ code ></ td >
< td > Build a complete genome index (scatter → dereplicate → count → layered MPHF)</ td >
</ tr >
< tr >
< td >< code > merge</ code ></ td >
< td > Merge multiple built indexes into one</ td >
</ tr >
< tr >
< td >< code > rebuild</ code ></ td >
< td > Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata</ td >
</ tr >
< tr >
< td >< code > query</ code ></ td >
< td > Query an index with sequences and annotate matches</ td >
</ tr >
< tr >
< td >< code > dump</ code ></ td >
< td > Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as < code > rebuild</ code ></ td >
</ tr >
< tr >
< td >< code > annotate</ code ></ td >
< td > Add or update genome metadata from a CSV file; or dump metadata as CSV</ td >
</ tr >
< tr >
< td >< code > distance</ code ></ td >
< td > Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees</ td >
</ tr >
< tr >
< td >< code > unitig</ code ></ td >
< td > Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as < code > rebuild</ code ></ td >
</ tr >
< tr >
< td >< code > estimate</ code ></ td >
< td > Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing</ td >
</ tr >
< tr >
< td >< code > reindex</ code ></ td >
< td > Convert an index's evidence in-place: exact ↔ approx</ td >
</ tr >
< tr >
< td >< code > utils</ code ></ td >
< td > Miscellaneous index utilities: < code > --new-label NEW=OLD</ code > renames a genome label; < code > --upgrade-index</ code > adds missing < code > layer_meta.json</ code > to old indexes</ td >
</ tr >
< tr >
< td >< code > pack</ code ></ td >
< td > Pack per-column matrix files into single-file format to reduce query I/O</ td >
</ tr >
</ tbody >
</ table >
<h2 id="constraints">Constraints</h2>
<ul>
<li>Target scale: individual genome datasets, tens of Gbases</li>
<li>Maximum efficiency in computation, memory, and disk usage</li>
<li>k odd, k ∈ [11, 31], fixed at runtime; kmer fits in a u64 (2 bits/base)</li>
<li>Canonical form: <code>min(kmer, revcomp(kmer))</code> reduces strand-symmetric space by half</li>
<li>Input formats for <code>index</code>/<code>superkmer</code>: FASTA (<code>.fa</code>, <code>.fasta</code>), FASTQ (<code>.fq</code>, <code>.fastq</code>), GenBank flat file (<code>.gb</code>, <code>.gbk</code>, <code>.gbff</code>), all optionally gzip-compressed; directories expanded recursively; streaming stdin via <code>-</code></li>
<li>Input formats for <code>query</code>: FASTA, FASTQ, optionally gzip-compressed; streaming stdin via <code>-</code></li>
</ul>
<h2 id="parameter-constraints-enforced-at-cli">Parameter constraints (enforced at CLI)</h2>
<p>All constraints below are checked by <code>CommonArgs::validate()</code> at the start of <code>superkmer</code> and <code>index</code>. Invalid values cause an immediate exit with an error.</p>
< table >
< thead >
< tr >
< th > Parameter</ th >
< th > Constraint</ th >
< th > Reason</ th >
</ tr >
</ thead >
< tbody >
< tr >
< td > k (< code > --kmer-size</ code > )</ td >
< td > odd</ td >
< td > even k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant</ td >
</ tr >
< tr >
< td > k (< code > --kmer-size</ code > )</ td >
< td > k ∈ [11, 31]</ td >
<td>k &gt; 31 overflows u64 at 2 bits/base; k &lt; 11 gives insufficient specificity</td>
</ tr >
< tr >
< td > m (< code > --minimizer-size</ code > )</ td >
< td > odd</ td >
< td > same palindrome argument as k</ td >
</ tr >
< tr >
< td > m (< code > --minimizer-size</ code > )</ td >
<td>3 ≤ m ≤ k − 1</td>
< td > minimizer must be strictly shorter than the kmer</ td >
</ tr >
< tr >
< td > z (< code > -z</ code > , Findere, < code > index --approx</ code > only)</ td >
<td>z ≤ k − 1</td>
<td>effective indexed kmer size is k − z + 1; z ≥ k would make it ≤ 1</td>
</ tr >
</ tbody >
</ table >
< h2 id = "genome-label-constraints" > Genome label constraints</ h2 >
< p > Genome labels are arbitrary Unicode strings with the following restrictions:</ p >
< table >
< thead >
< tr >
< th > Character</ th >
< th > Forbidden</ th >
< th > Reason</ th >
</ tr >
</ thead >
< tbody >
< tr >
< td >< code > /</ code ></ td >
< td > yes</ td >
< td > filesystem path separator</ td >
</ tr >
< tr >
< td >< code > =</ code ></ td >
< td > yes</ td >
< td >< code > --new-label</ code > parser separator</ td >
</ tr >
< tr >
< td >< code > \0</ code ></ td >
< td > yes</ td >
< td > null byte</ td >
</ tr >
< tr >
< td >< code > \n</ code > < code > \r</ code > < code > \t</ code ></ td >
< td > yes</ td >
< td > break CSV output</ td >
</ tr >
< tr >
< td > spaces</ td >
< td >< strong > allowed</ strong ></ td >
< td > use shell quoting: < code > --new-label 'new label=old label'</ code ></ td >
</ tr >
</ tbody >
</ table >
< p > Empty labels are also rejected. Labels derived automatically from the index directory name (when < code > --label</ code > is omitted) are not validated since they come from the filesystem and are already safe.</ p >
<h2 id="priority-operations">Priority operations</h2>
< ul >
< li > Kmer counting (frequencies)</ li >
< li > Fast search / query</ li >
< li > Set operations: union, intersection, difference</ li >
</ ul >
</ article >
</ div >
<script>
  // Look up the element whose id matches the URL fragment (hash without "#").
  var target = document.getElementById(location.hash.slice(1));
  // If it exists and has a name, check it only when the name starts with "__tabbed_".
  if (target && target.name) {
    target.checked = target.name.startsWith("__tabbed_");
  }
</script>
</ div >
</ main >
< footer class = "md-footer" >
< div class = "md-footer-meta md-typeset" >
< div class = "md-footer-meta__inner md-grid" >
< div class = "md-copyright" >
Made with
< a href = "https://squidfunk.github.io/mkdocs-material/" target = "_blank" rel = "noopener" >
Material for MkDocs
</ a >
</ div >
</ div >
</ div >
</ footer >
</ div >
< div class = "md-dialog" data-md-component = "dialog" >
< div class = "md-dialog__inner md-typeset" ></ div >
</ div >
< script id = "__config" type = "application/json" >{ "annotate" : null , "base" : "." , "features" : [], "search" : "assets/javascripts/workers/search.2c215733.min.js" , "tags" : null , "translations" : { "clipboard.copied" : "Copied to clipboard" , "clipboard.copy" : "Copy to clipboard" , "search.result.more.one" : "1 more on this page" , "search.result.more.other" : "# more on this page" , "search.result.none" : "No matching documents" , "search.result.one" : "1 matching document" , "search.result.other" : "# matching documents" , "search.result.placeholder" : "Type to start searching" , "search.result.term.missing" : "Missing" , "select.version" : "Select version" }, "version" : null }</ script >
< script src = "assets/javascripts/bundle.79ae519e.min.js" ></ script >
< script src = "https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js" ></ script >
</ body >
</ html >