2026-04-16 22:38:20 +02:00
<!doctype html>
< html lang = "en" class = "no-js" >
< head >
< meta charset = "utf-8" >
< meta name = "viewport" content = "width=device-width,initial-scale=1" >
< link rel = "prev" href = "../../theory/indexing/" >
< link rel = "next" href = "../kmer/" >
< link rel = "icon" href = "../../assets/images/favicon.png" >
< meta name = "generator" content = "mkdocs-1.6.1, mkdocs-material-9.7.6" >
< title > SuperKmer - obikmer< / title >
< link rel = "stylesheet" href = "../../assets/stylesheets/main.484c7ddc.min.css" >
< link rel = "preconnect" href = "https://fonts.gstatic.com" crossorigin >
< link rel = "stylesheet" href = "https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback" >
< style > : root { --md-text-font : "Roboto" ; --md-code-font : "Roboto Mono" } < / style >
< script > _ _md _scope = new URL ( "../.." , location ) , _ _md _hash = e => [ ... e ] . reduce ( ( ( e , _ ) => ( e << 5 ) - e + _ . charCodeAt ( 0 ) ) , 0 ) , _ _md _get = ( e , _ = localStorage , t = _ _md _scope ) => JSON . parse ( _ . getItem ( t . pathname + "." + e ) ) , _ _md _set = ( e , _ , t = localStorage , a = _ _md _scope ) => { try { t . setItem ( a . pathname + "." + e , JSON . stringify ( _ ) ) } catch ( e ) { } } < / script >
< / head >
< body dir = "ltr" >
< input class = "md-toggle" data-md-toggle = "drawer" type = "checkbox" id = "__drawer" autocomplete = "off" >
< input class = "md-toggle" data-md-toggle = "search" type = "checkbox" id = "__search" autocomplete = "off" >
< label class = "md-overlay" for = "__drawer" > < / label >
< div data-md-component = "skip" >
< a href = "#superkmer-implementation" class = "md-skip" >
Skip to content
< / a >
< / div >
< div data-md-component = "announce" >
< / div >
< header class = "md-header md-header--shadow" data-md-component = "header" >
< nav class = "md-header__inner md-grid" aria-label = "Header" >
< a href = "../.." title = "obikmer" class = "md-header__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
< label class = "md-header__button md-icon" for = "__drawer" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z" / > < / svg >
< / label >
< div class = "md-header__title" data-md-component = "header-title" >
< div class = "md-header__ellipsis" >
< div class = "md-header__topic" >
< span class = "md-ellipsis" >
obikmer
< / span >
< / div >
< div class = "md-header__topic" data-md-component = "header-topic" >
< span class = "md-ellipsis" >
SuperKmer
< / span >
< / div >
< / div >
< / div >
< script > var palette = _ _md _get ( "__palette" ) ; if ( palette && palette . color ) { if ( "(prefers-color-scheme)" === palette . color . media ) { var media = matchMedia ( "(prefers-color-scheme: light)" ) , input = document . querySelector ( media . matches ? "[data-md-color-media='(prefers-color-scheme: light)']" : "[data-md-color-media='(prefers-color-scheme: dark)']" ) ; palette . color . media = input . getAttribute ( "data-md-color-media" ) , palette . color . scheme = input . getAttribute ( "data-md-color-scheme" ) , palette . color . primary = input . getAttribute ( "data-md-color-primary" ) , palette . color . accent = input . getAttribute ( "data-md-color-accent" ) } for ( var [ key , value ] of Object . entries ( palette . color ) ) document . body . setAttribute ( "data-md-color-" + key , value ) } < / script >
< / nav >
< / header >
< div class = "md-container" data-md-component = "container" >
< main class = "md-main" data-md-component = "main" >
< div class = "md-main__inner md-grid" >
< div class = "md-sidebar md-sidebar--primary" data-md-component = "sidebar" data-md-type = "navigation" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--primary" aria-label = "Navigation" data-md-level = "0" >
< label class = "md-nav__title" for = "__drawer" >
< a href = "../.." title = "obikmer" class = "md-nav__button md-logo" aria-label = "obikmer" data-md-component = "logo" >
< svg xmlns = "http://www.w3.org/2000/svg" viewBox = "0 0 24 24" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" / > < / svg >
< / a >
obikmer
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "../.." class = "md-nav__link" >
< span class = "md-ellipsis" >
Home
< / span >
< / a >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_2" >
< label class = "md-nav__link" for = "__nav_2" id = "__nav_2_label" tabindex = "0" >
< span class = "md-ellipsis" >
Theory
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_2_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_2" >
< span class = "md-nav__icon md-icon" > < / span >
Theory
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
2026-04-29 22:52:42 +02:00
< a href = "../../kmers/" class = "md-nav__link" >
2026-04-16 22:38:20 +02:00
< span class = "md-ellipsis" >
Kmers and super-kmers
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../../theory/encoding/" class = "md-nav__link" >
< span class = "md-ellipsis" >
DNA encoding
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../../theory/entropy/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Entropy filter
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../../theory/minimizer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer selection
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "../../theory/indexing/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Partitioning architecture
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--active md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_3" checked >
< label class = "md-nav__link" for = "__nav_3" id = "__nav_3_label" tabindex = "0" >
< span class = "md-ellipsis" >
Implementation
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_3_label" aria-expanded = "true" >
< label class = "md-nav__title" for = "__nav_3" >
< span class = "md-nav__icon md-icon" > < / span >
Implementation
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item md-nav__item--active" >
< input class = "md-nav__toggle md-toggle" type = "checkbox" id = "__toc" >
< label class = "md-nav__link md-nav__link--active" for = "__toc" >
< span class = "md-ellipsis" >
SuperKmer
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< a href = "./" class = "md-nav__link md-nav__link--active" >
< span class = "md-ellipsis" >
SuperKmer
< / span >
< / a >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#memory-layout" class = "md-nav__link" >
< span class = "md-ellipsis" >
Memory layout
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#ascii-encoding-and-decoding" class = "md-nav__link" >
< span class = "md-ellipsis" >
ASCII encoding and decoding
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#reverse-complement" class = "md-nav__link" >
< span class = "md-ellipsis" >
Reverse complement
< / span >
< / a >
2026-04-29 22:52:42 +02:00
< / li >
< li class = "md-nav__item" >
< a href = "#minimizer-sliding-window" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer sliding window
< / span >
< / a >
2026-04-16 22:38:20 +02:00
< / li >
< li class = "md-nav__item" >
< a href = "#kmer-extraction" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer extraction
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item" >
< a href = "../kmer/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../chunkreader/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Chunk reader
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../pipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Construction pipeline
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../obipipeline/" class = "md-nav__link" >
< span class = "md-ellipsis" >
obipipeline library
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< li class = "md-nav__item" >
< a href = "../storage/" class = "md-nav__link" >
< span class = "md-ellipsis" >
On-disk storage
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "../mphf/" class = "md-nav__link" >
< span class = "md-ellipsis" >
MPHF selection
< / span >
< / a >
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a href = "../unitig_evidence/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Unitig evidence encoding
< / span >
< / a >
< / li >
2026-04-16 22:38:20 +02:00
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle " type = "checkbox" id = "__nav_4" >
< label class = "md-nav__link" for = "__nav_4" id = "__nav_4_label" tabindex = "0" >
< span class = "md-ellipsis" >
Architecture
< / span >
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav class = "md-nav" data-md-level = "1" aria-labelledby = "__nav_4_label" aria-expanded = "false" >
< label class = "md-nav__title" for = "__nav_4" >
< span class = "md-nav__icon md-icon" > < / span >
Architecture
< / label >
< ul class = "md-nav__list" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "../../architecture/sequences/invariant/" class = "md-nav__link" >
< span class = "md-ellipsis" >
Sequences
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-sidebar md-sidebar--secondary" data-md-component = "sidebar" data-md-type = "toc" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav class = "md-nav md-nav--secondary" aria-label = "Table of contents" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
Table of contents
< / label >
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix >
< li class = "md-nav__item" >
< a href = "#memory-layout" class = "md-nav__link" >
< span class = "md-ellipsis" >
Memory layout
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#ascii-encoding-and-decoding" class = "md-nav__link" >
< span class = "md-ellipsis" >
ASCII encoding and decoding
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a href = "#reverse-complement" class = "md-nav__link" >
< span class = "md-ellipsis" >
Reverse complement
< / span >
< / a >
2026-04-29 22:52:42 +02:00
< / li >
< li class = "md-nav__item" >
< a href = "#minimizer-sliding-window" class = "md-nav__link" >
< span class = "md-ellipsis" >
Minimizer sliding window
< / span >
< / a >
2026-04-16 22:38:20 +02:00
< / li >
< li class = "md-nav__item" >
< a href = "#kmer-extraction" class = "md-nav__link" >
< span class = "md-ellipsis" >
Kmer extraction
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-content" data-md-component = "content" >
< article class = "md-content__inner md-typeset" >
< h1 id = "superkmer-implementation" > SuperKmer — implementation< / h1 >
< h2 id = "memory-layout" > Memory layout< / h2 >
2026-04-29 22:52:42 +02:00
< p > A super-kmer is stored as a < strong > 32-bit header< / strong > followed by a < strong > byte-aligned nucleotide sequence< / strong > (2 bits/base, nucleotide 0 at the MSB of the first byte):< / p >
2026-04-16 22:38:20 +02:00
< table >
< thead >
< tr >
< th > Field< / th >
< th > Bits< / th >
< th > Role< / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > COUNT< / td >
< td > 24< / td >
< td > Occurrence count (≤ 16 M)< / td >
< / tr >
< tr >
2026-04-29 22:52:42 +02:00
< td > NKMERS< / td >
2026-04-16 22:38:20 +02:00
< td > 8< / td >
2026-04-29 22:52:42 +02:00
< td > Number of kmers (= seq_length − k + 1, range 1– 255)< / td >
2026-04-16 22:38:20 +02:00
< / tr >
< / tbody >
< / table >
2026-04-29 22:52:42 +02:00
< p > Bit layout (MSB to LSB): < code > [31:8] COUNT [7:0] NKMERS< / code > < / p >
< p > NKMERS is stored as a raw < code > u8< / code > in < strong > kmer units< / strong > , not nucleotides. The nucleotide length is recovered as < code > NKMERS + k − 1< / code > . This avoids the awkward wrapping convention (< code > 0 = 256< / code > ) that would be needed if nucleotide length were stored directly, and gains k − 1 nucleotides of headroom (30 for k = 31, as shown below):< / p >
< table >
< thead >
< tr >
< th > unit< / th >
< th > u8 covers< / th >
< th > max nucleotides< / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > nucleotides< / td >
< td > 225 kmers< / td >
< td > 255 nt< / td >
< / tr >
< tr >
< td > < strong > kmers< / strong > < / td >
< td > < strong > 255 kmers< / strong > < / td >
< td > < strong > 285 nt< / strong > < / td >
< / tr >
< / tbody >
< / table >
< p > The public accessors:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > n_kmers< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "bp" > self< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "kt" > usize< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0xFF< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "k" > as< / span > < span class = "w" > < / span > < span class = "kt" > usize< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > seql< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "bp" > self< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "kt" > usize< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "n" > n_kmers< / span > < span class = "p" > ()< / span > < span class = "w" > < / span > < span class = "o" > +< / span > < span class = "w" > < / span > < span class = "n" > K< / span > < span class = "w" > < / span > < span class = "o" > -< / span > < span class = "w" > < / span > < span class = "mi" > 1< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
2026-04-16 22:38:20 +02:00
< span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > count< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "bp" > self< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "kt" > u32< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > increment< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "k" > mut< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > +=< / span > < span class = "w" > < / span > < span class = "mi" > 1< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "p" > ;< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > add< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "k" > mut< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "n" > n< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > u32< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > +=< / span > < span class = "w" > < / span > < span class = "n" > n< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "p" > ;< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > set_count< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "k" > mut< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "n" > n< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > u32< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "mi" > 0< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0xFF< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > n< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "p" > );< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< / code > < / pre > < / div >
2026-04-29 22:52:42 +02:00
< p > In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k− 1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).< / p >
2026-04-16 22:38:20 +02:00
< p > The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.< / p >
< h2 id = "ascii-encoding-and-decoding" > ASCII encoding and decoding< / h2 >
< p > Two lookup tables handle ASCII ↔ 2-bit conversion:< / p >
< ul >
< li > < strong > < code > ENC: [u8; 32]< / code > < / strong > — indexed by < code > b & 0x1F< / code > (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.< / li >
< li > < strong > < code > DEC4: [u32; 256]< / code > < / strong > — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian < code > u32< / code > . 1 KB total, fits in L1 cache. One lookup per packed input byte yields 4 decoded characters.< / li >
< / ul >
< p > Encoding 4 nucleotides into one byte:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "n" > byte< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > ENC< / span > < span class = "p" > [< / span > < span class = "n" > c0< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x1F< / span > < span class = "p" > ]< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 6< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "n" > ENC< / span > < span class = "p" > [< / span > < span class = "n" > c1< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x1F< / span > < span class = "p" > ]< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 4< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "n" > ENC< / span > < span class = "p" > [< / span > < span class = "n" > c2< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x1F< / span > < span class = "p" > ]< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 2< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "n" > ENC< / span > < span class = "p" > [< / span > < span class = "n" > c3< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x1F< / span > < span class = "p" > ]< / span >
< / code > < / pre > < / div >
< p > Decoding one byte into 4 ASCII characters:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "n" > DEC4< / span > < span class = "p" > [< / span > < span class = "n" > byte< / span > < span class = "p" > ].< / span > < span class = "n" > to_be_bytes< / span > < span class = "p" > ()< / span > < span class = "w" > < / span > < span class = "c1" > // [nuc0, nuc1, nuc2, nuc3] in ASCII< / span >
< / code > < / pre > < / div >
< h2 id = "reverse-complement" > Reverse complement< / h2 >
< p > The reverse complement is computed < strong > in place< / strong > with zero allocation in two steps.< / p >
< p > < strong > Step 1 — byte swap with < code > REVCOMP4< / code > .< / strong > A 256-byte lookup table < code > REVCOMP4< / code > maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying < code > REVCOMP4< / code > to each:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "k" > const< / span > < span class = "w" > < / span > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > revcomp4< / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > u8< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "kt" > u8< / span > < span class = "w" > < / span > < span class = "p" > {< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "o" > !< / span > < span class = "n" > x< / span > < span class = "p" > ;< / span > < span class = "w" > < / span > < span class = "c1" > // complement all bases< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 4< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 4< / span > < span class = "p" > );< / span > < span class = "w" > < / span > < span class = "c1" > // swap nibbles< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "p" > ((< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 2< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x33< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "o" > |< / span > < span class = "w" > < / span > < span class = "p" > ((< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > & < / span > < span class = "w" > < / span > < span class = "mh" > 0x33< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "o" > < < < / span > < span class = "w" > < / span > < span class = "mi" > 2< / span > < span class = "p" > );< / span > < span class = "w" > < / span > < span class = "c1" > // swap 2-bit groups< / span >
< span class = "w" > < / span > < span class = "n" > x< / span >
< span class = "p" > }< / span >
< / code > < / pre > < / div >
< p > < code > REVCOMP4< / code > is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.< / p >
2026-04-29 22:52:42 +02:00
< p > < strong > Step 2 — realignment.< / strong > After step 1, < code > padding = n × 8 − seql × 2< / code > spurious bits (complements of the original padding A's), where < code > n< / code > is the number of bytes in the array, appear at the start of the array. They are rotated out to the tail using < code > BitSlice< u8, Msb0> ::rotate_left(padding)< / code > from the < code > bitvec< / code > crate, which is SIMD-accelerated. The trailing < code > padding< / code > bits are then zeroed:< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > seql< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "bp" > self< / span > < span class = "p" > .< / span > < span class = "n" > n_kmers< / span > < span class = "p" > ()< / span > < span class = "w" > < / span > < span class = "o" > +< / span > < span class = "w" > < / span > < span class = "n" > k< / span > < span class = "w" > < / span > < span class = "o" > -< / span > < span class = "w" > < / span > < span class = "mi" > 1< / span > < span class = "p" > ;< / span >
< span class = "n" > shift< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > n< / span > < span class = "w" > < / span > < span class = "o" > *< / span > < span class = "w" > < / span > < span class = "mi" > 8< / span > < span class = "w" > < / span > < span class = "o" > -< / span > < span class = "w" > < / span > < span class = "n" > seql< / span > < span class = "w" > < / span > < span class = "o" > *< / span > < span class = "w" > < / span > < span class = "mi" > 2< / span > < span class = "w" > < / span > < span class = "c1" > // number of padding bits< / span >
2026-04-16 22:38:20 +02:00
< span class = "n" > bits< / span > < span class = "p" > .< / span > < span class = "n" > rotate_left< / span > < span class = "p" > (< / span > < span class = "n" > shift< / span > < span class = "p" > )< / span >
< span class = "n" > bits< / span > < span class = "p" > [< / span > < span class = "n" > len< / span > < span class = "w" > < / span > < span class = "o" > -< / span > < span class = "w" > < / span > < span class = "n" > shift< / span > < span class = "o" > ..< / span > < span class = "p" > ].< / span > < span class = "n" > fill< / span > < span class = "p" > (< / span > < span class = "kc" > false< / span > < span class = "p" > )< / span >
< / code > < / pre > < / div >
< p > < code > Msb0< / code > ordering makes the bit layout hardware-independent.< / p >
< div class = "admonition abstract" >
< p class = "admonition-title" > Algorithm — Super-kmer canonisation< / p >
< div class = "highlight" > < pre > < span > < / span > < code > procedure SuperKmerCanonical(seq, SEQL):
    for i ← 0 to SEQL − 1:
        fwd ← nucleotide(seq, i)
        rev ← complement(nucleotide(seq, SEQL − 1 − i))
        if fwd < rev: return seq -- forward is canonical
        if fwd > rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
    return seq -- palindrome: either orientation valid
< / code > < / pre > < / div >
< / div >
2026-04-29 22:52:42 +02:00
< h2 id = "minimizer-sliding-window" > Minimizer sliding window< / h2 >
< p > Super-kmers are built by < code > SuperKmerIter< / code > (crate < code > obiskbuilder< / code > ), which maintains the current minimizer with a < strong > monotonic deque< / strong > over a sliding window of W = k − m + 1 m-mer positions.< / p >
< p > Each deque entry stores:< / p >
< table >
< thead >
< tr >
< th > Field< / th >
< th > Type< / th >
< th > Purpose< / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > < code > position< / code > < / td >
< td > usize< / td >
< td > 0-based start of this m-mer in the segment< / td >
< / tr >
< tr >
< td > < code > canonical< / code > < / td >
< td > u64< / td >
< td > right-aligned canonical m-mer value (lex-min of fwd and rc); used as partition key< / td >
< / tr >
< tr >
< td > < code > hash< / code > < / td >
< td > u64< / td >
< td > < span class = "arithmatex" > \(H(\text{canonical})\)< / span > — ordering key for random minimizer selection< / td >
< / tr >
< / tbody >
< / table >
< p > The hash < span class = "arithmatex" > \(H\)< / span > is the seeded splitmix64 finalizer (see < a href = "../../theory/minimizer/" > Minimizer selection< / a > ):< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > hash_mmer< / span > < span class = "p" > (< / span > < span class = "n" > canonical< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > u64< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "kt" > u64< / span > < span class = "w" > < / span > < span class = "p" > {< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > canonical< / span > < span class = "w" > < / span > < span class = "o" > ^< / span > < span class = "w" > < / span > < span class = "mh" > 0x9e3779b97f4a7c15< / span > < span class = "p" > ;< / span > < span class = "w" > < / span > < span class = "c1" > // seed: eliminates fixed point at 0< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > ^< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 30< / span > < span class = "p" > );< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "p" > .< / span > < span class = "n" > wrapping_mul< / span > < span class = "p" > (< / span > < span class = "mh" > 0xbf58476d1ce4e5b9< / span > < span class = "p" > );< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > ^< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 27< / span > < span class = "p" > );< / span >
< span class = "w" > < / span > < span class = "kd" > let< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > x< / span > < span class = "p" > .< / span > < span class = "n" > wrapping_mul< / span > < span class = "p" > (< / span > < span class = "mh" > 0x94d049bb133111eb< / span > < span class = "p" > );< / span >
< span class = "w" > < / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > ^< / span > < span class = "w" > < / span > < span class = "p" > (< / span > < span class = "n" > x< / span > < span class = "w" > < / span > < span class = "o" > > > < / span > < span class = "w" > < / span > < span class = "mi" > 31< / span > < span class = "p" > )< / span >
< span class = "p" > }< / span >
< / code > < / pre > < / div >
< p > On each new nucleotide, once the window is full, the deque is updated:< / p >
< div class = "admonition abstract" >
< p class = "admonition-title" > Algorithm — minimizer deque update< / p >
< div class = "highlight" > < pre > < span > < / span > < code > procedure UpdateMinimizer(deque, position, canonical, hash, k, received):
-- pop dominated entries from the back
while deque is not empty and deque.back.hash ≥ hash:
deque.pop_back()
deque.push_back({position, canonical, hash})
-- evict expired entries from the front
while deque.front.position + k < received:
deque.pop_front()
< / code > < / pre > < / div >
< / div >
< p > Popping dominated entries from the back keeps the deque in strictly increasing hash order from front to back, so the front of the deque is always the current minimizer. Each entry is pushed once and popped at most once, so the update costs O(1) amortized per nucleotide.< / p >
< p > A super-kmer boundary is emitted when the minimizer changes: < code > deque.front.hash ≠ prev_hash< / code > . The < code > canonical< / code > field of the front entry is < strong > not< / strong > used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key < span class = "arithmatex" > \(H(\text{canonical})\)< / span > can be recomputed independently at routing time from the stored < code > minimizer_pos< / code > , without inheriting the minimum-order-statistic bias (see < a href = "../../theory/minimizer/#partition-key-independence" > Minimizer selection — partition key independence< / a > ).< / p >
< h2 id = "kmer-extraction" > Kmer extraction< / h2 >
< p > A k-mer is extracted from a super-kmer with < code > SuperKmer::kmer(i, k)< / code > , which returns a < code > Kmer< / code > — a left-aligned < code > u64< / code > newtype (see < a href = "../kmer/" > Kmer implementation< / a > ):< / p >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "k" > pub< / span > < span class = "w" > < / span > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > kmer< / span > < span class = "p" > (< / span > < span class = "o" > & < / span > < span class = "bp" > self< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "n" > i< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > usize< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "n" > k< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "kt" > usize< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "nb" > Result< / span > < span class = "o" > < < / span > < span class = "n" > Kmer< / span > < span class = "p" > ,< / span > < span class = "w" > < / span > < span class = "n" > KmerError< / span > < span class = "o" > > < / span >
< / code > < / pre > < / div >
< p > The bit slice < code > seq[i*2 .. (i+k)*2]< / code > (Msb0 order) is loaded as a big-endian < code > u64< / code > via the < code > bitvec< / code > crate's < code > BitField::load_be< / code > , then left-shifted to produce the left-aligned < code > Kmer< / code > layout. One call — no loop, no allocation.< / p >
< hr / >
< div class = "admonition abstract" >
< p class = "admonition-title" > Algorithm — Super-kmer reverse complement< / p >
< div class = "highlight" > < pre > < span > < / span > < code > procedure SuperKmerRevcomp(seq, SEQL):
seql ← NKMERS + k − 1 -- nucleotide length
n ← ⌈seql / 4⌉ -- number of bytes
shift ← n × 8 − seql × 2 -- padding bits to flush
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
lo ← 0 ; hi ← n − 1
while lo < hi:
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
lo ← lo + 1 ; hi ← hi − 1
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
if shift > 0:
bits.rotate_left(shift)
bits[n × 8 − shift .. n × 8].fill(0)
< / code > < / pre > < / div >
< / div >
< / article >
< / div >
< script > var target = document . getElementById ( location . hash . slice ( 1 ) ) ; target && target . name && ( target . checked = target . name . startsWith ( "__tabbed_" ) ) < / script >
< / div >
< / main >
< footer class = "md-footer" >
< div class = "md-footer-meta md-typeset" >
< div class = "md-footer-meta__inner md-grid" >
< div class = "md-copyright" >
Made with
< a href = "https://squidfunk.github.io/mkdocs-material/" target = "_blank" rel = "noopener" >
Material for MkDocs
< / a >
< / div >
< / div >
< / div >
< / footer >
< / div >
< div class = "md-dialog" data-md-component = "dialog" >
< div class = "md-dialog__inner md-typeset" > < / div >
< / div >
< script id = "__config" type = "application/json" > { "annotate" : null , "base" : "../.." , "features" : [ ] , "search" : "../../assets/javascripts/workers/search.2c215733.min.js" , "tags" : null , "translations" : { "clipboard.copied" : "Copied to clipboard" , "clipboard.copy" : "Copy to clipboard" , "search.result.more.one" : "1 more on this page" , "search.result.more.other" : "# more on this page" , "search.result.none" : "No matching documents" , "search.result.one" : "1 matching document" , "search.result.other" : "# matching documents" , "search.result.placeholder" : "Type to start searching" , "search.result.term.missing" : "Missing" , "select.version" : "Select version" } , "version" : null } < / script >
< script src = "../../assets/javascripts/bundle.79ae519e.min.js" > < / script >
< script src = "https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js" > < / script >
< / body >
< / html >