2026-04-16 22:38:20 +02:00
<!doctype html>
< html lang = "en" class = "no-js" >
< head >
< meta charset = "utf-8" >
< meta name = "viewport" content = "width=device-width,initial-scale=1" >
2026-04-29 22:52:42 +02:00
< link rel = "prev" href = ".." >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
< link rel = "next" href = "../theory/encoding/" >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
< link rel = "icon" href = "../assets/images/favicon.png" >
2026-04-16 22:38:20 +02:00
< meta name = "generator" content = "mkdocs-1.6.1, mkdocs-material-9.7.6" >
< title > Kmers and super-kmers - obikmer< / title >
2026-04-29 22:52:42 +02:00
< link rel = "stylesheet" href = "../assets/stylesheets/main.484c7ddc.min.css" >
2026-04-16 22:38:20 +02:00
< link rel = "preconnect" href = "https://fonts.gstatic.com" crossorigin >
< link rel = "stylesheet" href = "https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback" >
< style > : root { --md-text-font : "Roboto" ; --md-code-font : "Roboto Mono" } < / style >
2026-04-29 22:52:42 +02:00
< script > _ _md _scope = new URL ( ".." , location ) , _ _md _hash = e => [ ... e ] . reduce ( ( ( e , _ ) => ( e << 5 ) - e + _ . charCodeAt ( 0 ) ) , 0 ) , _ _md _get = ( e , _ = localStorage , t = _ _md _scope ) => JSON . parse ( _ . getItem ( t . pathname + "." + e ) ) , _ _md _set = ( e , _ , t = localStorage , a = _ _md _scope ) => { try { t . setItem ( a . pathname + "." + e , JSON . stringify ( _ ) ) } catch ( e ) { } } < / script >
2026-04-16 22:38:20 +02:00
< / head >
< body dir = "ltr" >
< input class = "md-toggle" data-md-toggle = "drawer" type = "checkbox" id = "__drawer" autocomplete = "off" >
< input class = "md-toggle" data-md-toggle = "search" type = "checkbox" id = "__search" autocomplete = "off" >
< label class = "md-overlay" for = "__drawer" > < / label >
< div data-md-component = "skip" >
< a href = "#kmers-and-super-kmers" class = "md-skip" >
Skip to content
< / a >
< / div >
< div data-md-component = "announce" >
< / div >
< header class = "md-header md-header--shadow" data-md-component = "header" >
< nav class = "md-header__inner md-grid" aria-label = "Header" >
2026-04-29 22:52:42 +02:00
< a href = ".." title = "obikmer" class = "md-header__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
2026-04-16 22:38:20 +02:00
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
< label class = "md-header__button md-icon" for = "__drawer" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z" / > < / svg >
< / label >
< div class = "md-header__title" data-md-component = "header-title" >
< div class = "md-header__ellipsis" >
< div class = "md-header__topic" >
< span class = "md-ellipsis" >
obikmer
< / span >
< / div >
< div class = "md-header__topic" data-md-component = "header-topic" >
< span class = "md-ellipsis" >
Kmers and super-kmers
< / span >
< / div >
< / div >
< / div >
< script > var palette = _ _md _get ( "__palette" ) ; if ( palette && palette . color ) { if ( "(prefers-color-scheme)" === palette . color . media ) { var media = matchMedia ( "(prefers-color-scheme: light)" ) , input = document . querySelector ( media . matches ? "[data-md-color-media='(prefers-color-scheme: light)']" : "[data-md-color-media='(prefers-color-scheme: dark)']" ) ; palette . color . media = input . getAttribute ( "data-md-color-media" ) , palette . color . scheme = input . getAttribute ( "data-md-color-scheme" ) , palette . color . primary = input . getAttribute ( "data-md-color-primary" ) , palette . color . accent = input . getAttribute ( "data-md-color-accent" ) } for ( var [ key , value ] of Object . entries ( palette . color ) ) document . body . setAttribute ( "data-md-color-" + key , value ) } < / script >
< / nav >
< / header >
< div class = "md-container" data-md-component = "container" >
< main class = "md-main" data-md-component = "main" >
< div class = "md-main__inner md-grid" >
< div class = "md-sidebar md-sidebar--primary" data-md-component = "sidebar" data-md-type = "navigation" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--primary" aria-label = "Navigation" data-md-level = "0" >
< label class = "md-nav__title" for = "__drawer" >
2026-04-29 22:52:42 +02:00
< a href = ".." title = "obikmer" class = "md-nav__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
2026-04-16 22:38:20 +02:00
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
obikmer
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = ".." class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Home
< / span >
< / a >
< / li >
< li class = "md-nav__item md-nav__item--active md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_2" checked >
< label class = "md-nav__link" for = "__nav_2" id = "__nav_2_label" tabindex = "0" >
< span class = "md-ellipsis" >
Theory
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_2_label" aria-expanded = "true" >
< label class = "md-nav__title" for = "__nav_2" >
< span class = "md-nav__icon md-icon" > < / span >
Theory
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item md-nav__item--active" >
< input class = "md-nav__toggle md-toggle" type = "checkbox" id = "__toc" >
< label class = "md-nav__link md-nav__link--active" for = "__toc" >
< span class = "md-ellipsis" >
Kmers and super-kmers
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< a href = "./" class = "md-nav__link md-nav__link--active" >
< span class = "md-ellipsis" >
Kmers and super-kmers
< / span >
< / a >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#super-kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Super-kmers
< / span >
< / a >
< nav class = "md-nav" aria-label = "Super-kmers" >
< ul class = "md-nav__list" >
< li class = "md-nav__item" >
< a href = "#canonical-super-kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Canonical super-kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#expected-length-of-a-super-kmer" class = "md-nav__link" >
< span class = "md-ellipsis" >
Expected length of a super-kmer
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../theory/encoding/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
DNA encoding
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../theory/entropy/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Entropy filter
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../theory/minimizer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer selection
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../theory/indexing/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Partitioning architecture
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_3" >
< label class = "md-nav__link" for = "__nav_3" id = "__nav_3_label" tabindex = "0" >
< span class = "md-ellipsis" >
Implementation
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_3_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_3" >
< span class = "md-nav__icon md-icon" > < / span >
Implementation
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/superkmer/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
SuperKmer
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/kmer/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Kmer
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/chunkreader/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Chunk reader
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/pipeline/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Construction pipeline
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/obipipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obipipeline library
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../implementation/storage/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
On-disk storage
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../implementation/mphf/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
MPHF selection
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../implementation/unitig_evidence/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Unitig evidence encoding
< / span >
< / a >
< / li >
2026-05-15 21:07:23 +08:00
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "../implementation/evidence_elimination/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Evidence elimination (discussion)
< / span >
< / a >
< / li >
2026-05-15 21:07:23 +08:00
< li class = "md-nav__item" >
< a href = "../implementation/obilayeredmap/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obilayeredmap crate
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../implementation/persistent_compact_int_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentCompactIntVec
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../implementation/persistent_bit_vec/" class = "md-nav__link" >
< span class = "md-ellipsis" >
PersistentBitVec
< / span >
< / a >
< / li >
2026-06-04 21:27:01 +02:00
< li class = "md-nav__item" >
< a href = "../implementation/merge/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Merge command
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../implementation/rebuild_filter/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer filtering (rebuild/dump/unitig)
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_4" >
< label class = "md-nav__link" for = "__nav_4" id = "__nav_4_label" tabindex = "0" >
< span class = "md-ellipsis" >
Architecture
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_4_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_4" >
< span class = "md-nav__icon md-icon" > < / span >
Architecture
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../architecture/sequences/invariant/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Sequences
< / span >
< / a >
< / li >
2026-05-15 21:07:23 +08:00
< li class = "md-nav__item" >
< a href = "../architecture/index_architecture/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer index
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< / ul >
< / nav >
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-sidebar md-sidebar--secondary" data-md-component = "sidebar" data-md-type = "toc" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#super-kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Super-kmers
< / span >
< / a >
< nav class = "md-nav" aria-label = "Super-kmers" >
< ul class = "md-nav__list" >
< li class = "md-nav__item" >
< a href = "#canonical-super-kmers" class = "md-nav__link" >
< span class = "md-ellipsis" >
Canonical super-kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#expected-length-of-a-super-kmer" class = "md-nav__link" >
< span class = "md-ellipsis" >
Expected length of a super-kmer
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-content" data-md-component = "content" >
< article class = "md-content__inner md-typeset" >
< h1 id = "kmers-and-super-kmers" > Kmers and super-kmers< / h1 >
< h2 id = "kmers" > Kmers< / h2 >
< p > A < strong > kmer< / strong > is a DNA subsequence of fixed length k. Two constraints govern the choice of k:< / p >
< ul >
2026-06-04 21:27:01 +02:00
< li > < strong > k ∈ [11, 31]< / strong > : the lower bound keeps the kmer long enough to be specific (k &lt; 11 yields insufficient specificity); the upper bound lets it fit in a single machine word (a u64 at 2 bits/base holds at most 32 bases, and 31 is the largest odd value that fits).< / li >
2026-04-16 22:38:20 +02:00
< li > < strong > k is odd< / strong > : an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form < code > min(kmer, revcomp(kmer))< / code > is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.< / li >
< / ul >
2026-06-04 21:27:01 +02:00
< p > Both constraints are < strong > enforced at CLI entry< / strong > by < code > CommonArgs::validate()< / code > in < code > superkmer< / code > and < code > index< / code > . Passing an invalid k exits immediately with an error message.< / p >
2026-04-16 22:38:20 +02:00
< h2 id = "super-kmers" > Super-kmers< / h2 >
2026-06-04 21:27:01 +02:00
< p > A < strong > super-kmer< / strong > is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k − 1 nucleotides, that all share the same < strong > canonical minimizer< / strong > . The < strong > canonical minimizer< / strong > of a kmer is the m-mer (m &lt; k) whose canonical hash < code > hash_kmer(min(m-mer, revcomp(m-mer)))< / code > is smallest over all m-mers in the kmer window. The hash function is a < code > mix64< / code > -based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.< / p >
2026-04-16 22:38:20 +02:00
< h3 id = "canonical-super-kmers" > Canonical super-kmers< / h3 >
< p > A < strong > canonical super-kmer< / strong > is the lexicographic minimum of a super-kmer and its reverse complement:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
< / code > < / pre > < / div >
< p > When a read and its reverse complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.< / p >
< h3 id = "expected-length-of-a-super-kmer" > Expected length of a super-kmer< / h3 >
< p > For a random minimizer of length m over kmers of length k, the density of minimizer positions (the fraction of positions at which a new minimizer is selected) is approximately 2/(k − m + 2) (Zheng < em > et al.< / em > 2020; Golan &amp; Shur 2025)< sup id = "fnref:Zheng2020-ji" > < a class = "footnote-ref" href = "#fn:Zheng2020-ji" > 2< / a > < / sup > < sup id = "fnref:Golan2025-xf" > < a class = "footnote-ref" href = "#fn:Golan2025-xf" > 3< / a > < / sup > , so the expected number of consecutive kmers per super-kmer is (k − m + 2)/2. A run of n kmers spans n + k − 1 nucleotides, giving the expected super-kmer length in nucleotides:< / p >
< div class = "arithmatex" > \[L_{\text{nt}} = \frac{k-m+2}{2} + k - 1\]< / div >
< p > For k = 31 and m = 13, this gives (31 − 13 + 2)/2 + 31 − 1 = 10 + 30 = 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.< sup id = "fnref:superkmer_length" > < a class = "footnote-ref" href = "#fn:superkmer_length" > 1< / a > < / sup > < / p >
< div class = "footnote" >
< hr / >
< ol >
< li id = "fn:superkmer_length" >
< p > The expected length formula and the density approximation 2/(k − m + 2) should be verified against the values reported in (Zheng < em > et al.< / em > 2020)< sup id = "fnref2:Zheng2020-ji" > < a class = "footnote-ref" href = "#fn:Zheng2020-ji" > 2< / a > < / sup > and (Golan &amp; Shur 2025)< sup id = "fnref2:Golan2025-xf" > < a class = "footnote-ref" href = "#fn:Golan2025-xf" > 3< / a > < / sup > .&nbsp;< a class = "footnote-backref" href = "#fnref:superkmer_length" title = "Jump back to footnote 1 in the text" > ↩ < / a > < / p >
< / li >
< li id = "fn:Zheng2020-ji" >
< p > Zheng, H., Kingsford, C. &amp; Marçais, G. (2020). < a href = "https://doi.org/10.1093/bioinformatics/btaa472" > Improved design and analysis of practical minimizers< / a > . < em > Bioinformatics (Oxford, England)< / em > , 36, i119–i127.&nbsp;< a class = "footnote-backref" href = "#fnref:Zheng2020-ji" title = "Jump back to footnote 2 in the text" > ↩ < / a > < a class = "footnote-backref" href = "#fnref2:Zheng2020-ji" title = "Jump back to footnote 2 in the text" > ↩ < / a > < / p >
< / li >
< li id = "fn:Golan2025-xf" >
< p > Golan, S. &amp; Shur, A.M. (2025). < a href = "https://doi.org/10.1007/978-3-031-82670-2_25" > Expected density of random minimizers< / a > . In: < em > Lecture notes in computer science< / em > . Springer Nature Switzerland, Cham, pp. 347–360.&nbsp;< a class = "footnote-backref" href = "#fnref:Golan2025-xf" title = "Jump back to footnote 3 in the text" > ↩ < / a > < a class = "footnote-backref" href = "#fnref2:Golan2025-xf" title = "Jump back to footnote 3 in the text" > ↩ < / a > < / p >
< / li >
< / ol >
< / div >
< / article >
< / div >
< script > var target = document . getElementById ( location . hash . slice ( 1 ) ) ; target && target . name && ( target . checked = target . name . startsWith ( "__tabbed_" ) ) < / script >
< / div >
< / main >
< footer class = "md-footer" >
< div class = "md-footer-meta md-typeset" >
< div class = "md-footer-meta__inner md-grid" >
< div class = "md-copyright" >
Made with
< a href = "https://squidfunk.github.io/mkdocs-material/" target = "_blank" rel = "noopener" >
Material for MkDocs
< / a >
< / div >
< / div >
< / div >
< / footer >
< / div >
< div class = "md-dialog" data-md-component = "dialog" >
< div class = "md-dialog__inner md-typeset" > < / div >
< / div >
2026-04-29 22:52:42 +02:00
< script id = "__config" type = "application/json" > { "annotate" : null , "base" : ".." , "features" : [ ] , "search" : "../assets/javascripts/workers/search.2c215733.min.js" , "tags" : null , "translations" : { "clipboard.copied" : "Copied to clipboard" , "clipboard.copy" : "Copy to clipboard" , "search.result.more.one" : "1 more on this page" , "search.result.more.other" : "# more on this page" , "search.result.none" : "No matching documents" , "search.result.one" : "1 matching document" , "search.result.other" : "# matching documents" , "search.result.placeholder" : "Type to start searching" , "search.result.term.missing" : "Missing" , "select.version" : "Select version" } , "version" : null } < / script >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
< script src = "../assets/javascripts/bundle.79ae519e.min.js" > < / script >
2026-04-16 22:38:20 +02:00
< script src = "https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js" > < / script >
< / body >
< / html >