2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
<!DOCTYPE html>
< html class = "no-js" lang = "en" >
< head >
< meta charset = "utf-8" / >
< meta content = "width=device-width,initial-scale=1" name = "viewport" / >
< link href = "../kmer/" rel = "prev" / >
< link href = "../pipeline/" rel = "next" / >
< link href = "../../assets/images/favicon.png" rel = "icon" / >
< meta content = "mkdocs-1.6.1, mkdocs-material-9.7.6" name = "generator" / >
< title > Chunk reader - obikmer< / title >
< link href = "../../assets/stylesheets/main.484c7ddc.min.css" rel = "stylesheet" / >
< link crossorigin = "" href = "https://fonts.gstatic.com" rel = "preconnect" / >
< link href = "https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback" rel = "stylesheet" / >
< style > : root { --md-text-font : "Roboto" ; --md-code-font : "Roboto Mono" } < / style >
< script > _ _md _scope = new URL ( "../.." , location ) , _ _md _hash = e => [ ... e ] . reduce ( ( ( e , _ ) => ( e << 5 ) - e + _ . charCodeAt ( 0 ) ) , 0 ) , _ _md _get = ( e , _ = localStorage , t = _ _md _scope ) => JSON . parse ( _ . getItem ( t . pathname + "." + e ) ) , _ _md _set = ( e , _ , t = localStorage , a = _ _md _scope ) => { try { t . setItem ( a . pathname + "." + e , JSON . stringify ( _ ) ) } catch ( e ) { } } < / script >
< / head >
< body dir = "ltr" >
< input autocomplete = "off" class = "md-toggle" data-md-toggle = "drawer" id = "__drawer" type = "checkbox" / >
< input autocomplete = "off" class = "md-toggle" data-md-toggle = "search" id = "__search" type = "checkbox" / >
< label class = "md-overlay" for = "__drawer" > < / label >
< div data-md-component = "skip" >
< a class = "md-skip" href = "#chunk-reader-implementation" >
2026-04-16 22:38:20 +02:00
Skip to content
< / a >
2026-04-29 22:52:42 +02:00
< / div >
< div data-md-component = "announce" >
< / div >
2026-04-16 22:38:20 +02:00
< header class = "md-header md-header--shadow" data-md-component = "header" >
2026-04-29 22:52:42 +02:00
< nav aria-label = "Header" class = "md-header__inner md-grid" >
< a aria-label = "obikmer" class = "md-header__button md-logo" data-md-component = "logo" href = "../.." title = "obikmer" >
< svg viewbox = "0 0 24 24" xmlns = "http://www.w3.org/2000/svg" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" > < / path > < / svg >
< / a >
< label class = "md-header__button md-icon" for = "__drawer" >
< svg viewbox = "0 0 24 24" xmlns = "http://www.w3.org/2000/svg" > < path d = "M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z" > < / path > < / svg >
< / label >
< div class = "md-header__title" data-md-component = "header-title" >
< div class = "md-header__ellipsis" >
< div class = "md-header__topic" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
obikmer
< / span >
2026-04-29 22:52:42 +02:00
< / div >
< div class = "md-header__topic" data-md-component = "header-topic" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Chunk reader
< / span >
2026-04-29 22:52:42 +02:00
< / div >
< / div >
< / div >
< script > var palette = _ _md _get ( "__palette" ) ; if ( palette && palette . color ) { if ( "(prefers-color-scheme)" === palette . color . media ) { var media = matchMedia ( "(prefers-color-scheme: light)" ) , input = document . querySelector ( media . matches ? "[data-md-color-media='(prefers-color-scheme: light)']" : "[data-md-color-media='(prefers-color-scheme: dark)']" ) ; palette . color . media = input . getAttribute ( "data-md-color-media" ) , palette . color . scheme = input . getAttribute ( "data-md-color-scheme" ) , palette . color . primary = input . getAttribute ( "data-md-color-primary" ) , palette . color . accent = input . getAttribute ( "data-md-color-accent" ) } for ( var [ key , value ] of Object . entries ( palette . color ) ) document . body . setAttribute ( "data-md-color-" + key , value ) } < / script >
< / nav >
2026-04-16 22:38:20 +02:00
< / header >
2026-04-29 22:52:42 +02:00
< div class = "md-container" data-md-component = "container" >
< main class = "md-main" data-md-component = "main" >
< div class = "md-main__inner md-grid" >
< div class = "md-sidebar md-sidebar--primary" data-md-component = "sidebar" data-md-type = "navigation" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav aria-label = "Navigation" class = "md-nav md-nav--primary" data-md-level = "0" >
< label class = "md-nav__title" for = "__drawer" >
< a aria-label = "obikmer" class = "md-nav__button md-logo" data-md-component = "logo" href = "../.." title = "obikmer" >
< svg viewbox = "0 0 24 24" xmlns = "http://www.w3.org/2000/svg" > < path d = "M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54" > < / path > < / svg >
< / a >
2026-04-16 22:38:20 +02:00
obikmer
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-scrollfix = "" >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../.." >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Home
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle" id = "__nav_2" type = "checkbox" / >
< label class = "md-nav__link" for = "__nav_2" id = "__nav_2_label" tabindex = "0" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Theory
< / span >
2026-04-29 22:52:42 +02:00
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav aria-expanded = "false" aria-labelledby = "__nav_2_label" class = "md-nav" data-md-level = "1" >
< label class = "md-nav__title" for = "__nav_2" >
< span class = "md-nav__icon md-icon" > < / span >
2026-04-16 22:38:20 +02:00
Theory
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-scrollfix = "" >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../kmers/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Kmers and super-kmers
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../theory/encoding/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
DNA encoding
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../theory/entropy/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Entropy filter
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../theory/minimizer/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
Minimizer selection
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../theory/indexing/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
Partitioning architecture
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--active md-nav__item--nested" >
< input checked = "" class = "md-nav__toggle md-toggle" id = "__nav_3" type = "checkbox" / >
< label class = "md-nav__link" for = "__nav_3" id = "__nav_3_label" tabindex = "0" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Implementation
< / span >
2026-04-29 22:52:42 +02:00
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav aria-expanded = "true" aria-labelledby = "__nav_3_label" class = "md-nav" data-md-level = "1" >
< label class = "md-nav__title" for = "__nav_3" >
< span class = "md-nav__icon md-icon" > < / span >
2026-04-16 22:38:20 +02:00
Implementation
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-scrollfix = "" >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../superkmer/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
SuperKmer
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../kmer/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Kmer
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item md-nav__item--active" >
< input class = "md-nav__toggle md-toggle" id = "__toc" type = "checkbox" / >
< label class = "md-nav__link md-nav__link--active" for = "__toc" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Chunk reader
< / span >
2026-04-29 22:52:42 +02:00
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< a class = "md-nav__link md-nav__link--active" href = "./" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Chunk reader
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< nav aria-label = "Table of contents" class = "md-nav md-nav--secondary" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
2026-04-16 22:38:20 +02:00
Table of contents
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix = "" >
< li class = "md-nav__item" >
2026-06-04 21:27:01 +02:00
< a class = "md-nav__link" href = "#two-reading-paths" >
2026-04-29 22:52:42 +02:00
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-06-04 21:27:01 +02:00
Two reading paths
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
2026-06-04 21:27:01 +02:00
< a class = "md-nav__link" href = "#record-path-chunk-reader" >
2026-04-29 22:52:42 +02:00
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-06-04 21:27:01 +02:00
Record path: chunk reader
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#output-type-rope" >
< span class = "md-ellipsis" >
Output type: Rope
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#seqchunkiter" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
SeqChunkIter
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#boundary-detection-fasta" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Boundary detection — FASTA
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#boundary-detection-fastq" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Boundary detection — FASTQ
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< / ul >
2026-04-16 22:38:20 +02:00
< / nav >
2026-04-29 22:52:42 +02:00
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../pipeline/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Construction pipeline
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../obipipeline/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
obipipeline library
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../storage/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
On-disk storage
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../mphf/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
MPHF selection
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../unitig_evidence/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
Unitig evidence encoding
2026-04-16 22:38:20 +02:00
2026-04-29 22:52:42 +02:00
2026-04-16 22:38:20 +02:00
2026-06-04 21:27:01 +02:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../evidence_elimination/" >
< span class = "md-ellipsis" >
Evidence elimination (discussion)
2026-05-15 21:07:23 +08:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../obilayeredmap/" >
< span class = "md-ellipsis" >
obilayeredmap crate
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../persistent_compact_int_vec/" >
< span class = "md-ellipsis" >
PersistentCompactIntVec
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../persistent_bit_vec/" >
< span class = "md-ellipsis" >
PersistentBitVec
2026-06-04 21:27:01 +02:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../merge/" >
< span class = "md-ellipsis" >
Merge command
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../rebuild_filter/" >
< span class = "md-ellipsis" >
Kmer filtering (rebuild/dump/unitig)
2026-04-29 22:52:42 +02:00
< / span >
< / a >
< / li >
< / ul >
< / nav >
< / li >
< li class = "md-nav__item md-nav__item--nested" >
< input class = "md-nav__toggle md-toggle" id = "__nav_4" type = "checkbox" / >
< label class = "md-nav__link" for = "__nav_4" id = "__nav_4_label" tabindex = "0" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Architecture
< / span >
2026-04-29 22:52:42 +02:00
< span class = "md-nav__icon md-icon" > < / span >
< / label >
< nav aria-expanded = "false" aria-labelledby = "__nav_4_label" class = "md-nav" data-md-level = "1" >
< label class = "md-nav__title" for = "__nav_4" >
< span class = "md-nav__icon md-icon" > < / span >
2026-04-16 22:38:20 +02:00
Architecture
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-scrollfix = "" >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../architecture/sequences/invariant/" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Sequences
2026-05-15 21:07:23 +08:00
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "../../architecture/index_architecture/" >
< span class = "md-ellipsis" >
Kmer index
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
< / li >
< / ul >
2026-04-16 22:38:20 +02:00
< / nav >
2026-04-29 22:52:42 +02:00
< / li >
< / ul >
< / nav >
< / div >
< / div >
< / div >
< div class = "md-sidebar md-sidebar--secondary" data-md-component = "sidebar" data-md-type = "toc" >
< div class = "md-sidebar__scrollwrap" >
< div class = "md-sidebar__inner" >
< nav aria-label = "Table of contents" class = "md-nav md-nav--secondary" >
< label class = "md-nav__title" for = "__toc" >
< span class = "md-nav__icon md-icon" > < / span >
2026-04-16 22:38:20 +02:00
Table of contents
< / label >
2026-04-29 22:52:42 +02:00
< ul class = "md-nav__list" data-md-component = "toc" data-md-scrollfix = "" >
< li class = "md-nav__item" >
2026-06-04 21:27:01 +02:00
< a class = "md-nav__link" href = "#two-reading-paths" >
< span class = "md-ellipsis" >
Two reading paths
< / span >
< / a >
< / li >
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#record-path-chunk-reader" >
2026-04-29 22:52:42 +02:00
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-06-04 21:27:01 +02:00
Record path: chunk reader
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
2026-06-04 21:27:01 +02:00
< a class = "md-nav__link" href = "#output-type-rope" >
2026-04-29 22:52:42 +02:00
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
2026-06-04 21:27:01 +02:00
Output type: Rope
2026-04-16 22:38:20 +02:00
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#seqchunkiter" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
SeqChunkIter
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#boundary-detection-fasta" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Boundary detection — FASTA
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< li class = "md-nav__item" >
< a class = "md-nav__link" href = "#boundary-detection-fastq" >
< span class = "md-ellipsis" >
2026-04-16 22:38:20 +02:00
Boundary detection — FASTQ
< / span >
2026-04-29 22:52:42 +02:00
< / a >
2026-04-16 22:38:20 +02:00
< / li >
2026-04-29 22:52:42 +02:00
< / ul >
2026-04-16 22:38:20 +02:00
< / nav >
2026-04-29 22:52:42 +02:00
< / div >
< / div >
< / div >
< div class = "md-content" data-md-component = "content" >
< article class = "md-content__inner md-typeset" >
2026-04-16 22:38:20 +02:00
< h1 id = "chunk-reader-implementation" > Chunk reader — implementation< / h1 >
2026-06-04 21:27:01 +02:00
< p > < code > obiread< / code > exposes two distinct sequence reading paths, each optimised for a different use case.< / p >
< h2 id = "two-reading-paths" > Two reading paths< / h2 >
2026-04-16 22:38:20 +02:00
< table >
< thead >
< tr >
2026-06-04 21:27:01 +02:00
< th > Path< / th >
< th > API< / th >
< th > Output unit< / th >
< th > Per-record identity< / th >
< th > Use case< / th >
2026-04-16 22:38:20 +02:00
< / tr >
< / thead >
< tbody >
< tr >
2026-06-04 21:27:01 +02:00
< td > < strong > Record path< / strong > < / td >
< td > < code > read_sequence_chunks< / code > → < code > parse_chunk< / code > < / td >
< td > < code > SeqRecord< / code > (id + raw sequence + normalised rope)< / td >
< td > yes< / td >
< td > < code > query< / code > — must read complete records< / td >
2026-04-16 22:38:20 +02:00
< / tr >
< tr >
2026-06-04 21:27:01 +02:00
< td > < strong > Stream path< / strong > < / td >
< td > < code > open_nuc_stream< / code > < / td >
< td > < code > NucPage< / code > (flat normalised byte buffer)< / td >
< td > no< / td >
< td > < code > index< / code > , < code > superkmer< / code > — bulk throughput< / td >
2026-04-16 22:38:20 +02:00
< / tr >
< / tbody >
< / table >
2026-06-04 21:27:01 +02:00
< p > The record path uses < code > Rope< / code > -backed chunks and is described in detail below.
The stream path (< code > NucStream< / code > / < code > NucPage< / code > ) is described in the scatter section of < a href = "../pipeline/" > pipeline< / a > .< / p >
< hr / >
< h2 id = "record-path-chunk-reader" > Record path: chunk reader< / h2 >
< p > The chunk reader reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. < code > parse_chunk< / code > then converts each chunk into a < code > Vec< SeqRecord> < / code > , where each record carries its identifier, raw sequence bytes, and a normalised rope ready for superkmer building.< / p >
< p > This path is mandatory for < code > query< / code > , where superkmers must be tracked back to their originating sequence (id, kmer offset) for output annotation.< / p >
< h2 id = "output-type-rope" > Output type: Rope< / h2 >
< p > Each chunk is a < code > Rope< / code > — a segmented byte sequence: a < code > Vec< / code > of blocks, where each block is a < code > Vec< Cell< u8> > < / code > . The consumer iterates over the blocks via a forward or backward cursor.< / p >
< p > < code > Rope::split_off(pos)< / code > splits at an absolute byte offset in O(log n) (binary search over block-start index). If < code > pos< / code > falls inside a block, that block is split in two via < code > Vec::split_off< / code > — no < code > memcpy< / code > in the common case.< / p >
2026-04-16 22:38:20 +02:00
< h2 id = "seqchunkiter" > SeqChunkIter< / h2 >
< div class = "highlight" > < pre > < span > < / span > < code > < span class = "k" > pub< / span > < span class = "w" > < / span > < span class = "k" > struct< / span > < span class = "w" > < / span > < span class = "nc" > SeqChunkIter< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > Read< / span > < span class = "o" > > < / span > < span class = "w" > < / span > < span class = "p" > {< / span > < span class = "w" > < / span > < span class = "cm" > /* private */< / span > < span class = "w" > < / span > < span class = "p" > }< / span >
< span class = "k" > impl< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > Read< / span > < span class = "o" > > < / span > < span class = "w" > < / span > < span class = "nb" > Iterator< / span > < span class = "w" > < / span > < span class = "k" > for< / span > < span class = "w" > < / span > < span class = "n" > SeqChunkIter< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "o" > > < / span > < span class = "w" > < / span > < span class = "p" > {< / span >
2026-06-04 21:27:01 +02:00
< span class = "w" > < / span > < span class = "k" > type< / span > < span class = "w" > < / span > < span class = "nc" > Item< / span > < span class = "w" > < / span > < span class = "o" > =< / span > < span class = "w" > < / span > < span class = "n" > io< / span > < span class = "p" > ::< / span > < span class = "nb" > Result< / span > < span class = "o" > < < / span > < span class = "n" > Rope< / span > < span class = "o" > > < / span > < span class = "p" > ;< / span >
2026-04-16 22:38:20 +02:00
< span class = "p" > }< / span >
< span class = "k" > pub< / span > < span class = "w" > < / span > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > fasta_chunks< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > Read< / span > < span class = "o" > > < / span > < span class = "p" > (< / span > < span class = "n" > source< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > R< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "nc" > SeqChunkIter< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "o" > > < / span >
< span class = "k" > pub< / span > < span class = "w" > < / span > < span class = "k" > fn< / span > < span class = "w" > < / span > < span class = "nf" > fastq_chunks< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > Read< / span > < span class = "o" > > < / span > < span class = "p" > (< / span > < span class = "n" > source< / span > < span class = "p" > :< / span > < span class = "w" > < / span > < span class = "nc" > R< / span > < span class = "p" > )< / span > < span class = "w" > < / span > < span class = "p" > -> < / span > < span class = "w" > < / span > < span class = "nc" > SeqChunkIter< / span > < span class = "o" > < < / span > < span class = "n" > R< / span > < span class = "o" > > < / span >
< / code > < / pre > < / div >
< p > < code > next()< / code > loop:< / p >
2026-06-04 21:27:01 +02:00
< div class = "highlight" > < pre > < span > < / span > < code > 1. read one block of block_size bytes → push onto Rope
2. call splitter(rope) → Option< abs_offset>
if Some(pos):
tail = rope.split_off(pos) ← O(log n), may split one block
chunk = mem::replace(& mut rope, tail)
return Some(Ok(chunk))
3. if EOF and rope non-empty: return Some(Ok(rope)) as final chunk
4. if EOF and rope empty: return None
2026-04-16 22:38:20 +02:00
< / code > < / pre > < / div >
2026-06-04 21:27:01 +02:00
< p > The < code > Splitter< / code > function signature is < code > fn(& Rope) -> Option< usize> < / code > . It returns the absolute byte offset of the start of the last complete record, or < code > None< / code > if no boundary was found in the accumulated rope (need more data).< / p >
2026-04-16 22:38:20 +02:00
< h2 id = "boundary-detection-fasta" > Boundary detection — FASTA< / h2 >
2026-06-04 21:27:01 +02:00
< p > Backward scan with a 2-state machine. Searches (right to left) for < code > > < / code > followed by < code > \n< / code > or < code > \r< / code > (i.e., a < code > > < / code > that is preceded by a newline in forward order):< / p >
2026-04-29 22:52:42 +02:00
< pre class = "mermaid" > < code > stateDiagram-v2
2026-04-16 22:38:20 +02:00
direction LR
[*] --> Scanning
2026-04-29 22:52:42 +02:00
Scanning --> FoundGt : '> '
2026-04-16 22:38:20 +02:00
FoundGt --> Scanning : other
2026-04-29 22:52:42 +02:00
FoundGt --> [*] : '\\n' / '\\r' ✓< / code > < / pre >
2026-06-04 21:27:01 +02:00
< p > Returns the byte offset of the < code > > < / code > that starts the last complete record. Returns < code > None< / code > if only one < code > > < / code > is found (cannot confirm there is a prior complete record).< / p >
2026-04-16 22:38:20 +02:00
< h2 id = "boundary-detection-fastq" > Boundary detection — FASTQ< / h2 >
< p > FASTQ records have a rigid 4-line structure (< code > @header< / code > , sequence, < code > +< / code > , quality). The < code > @< / code > character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate < code > @< / code > .< / p >
2026-06-04 21:27:01 +02:00
< p > 7-state machine (states 0– 6), scanning from < strong > right to left< / strong > . Each time a < code > +< / code > is found, its position is saved as < code > restart< / code > ; any state mismatch resets the scan to that position.< / p >
2026-04-29 22:52:42 +02:00
< pre class = "mermaid" > < code > stateDiagram-v2
2026-04-16 22:38:20 +02:00
direction LR
[*] --> Scanning
2026-04-29 22:52:42 +02:00
Scanning --> FoundPlus : '+' (save restart)
FoundPlus --> AfterNlPlus : '\\n' / '\\r'
2026-04-16 22:38:20 +02:00
FoundPlus --> Scanning : other → backtrack
AfterNlPlus --> AfterNlPlus : separator
AfterNlPlus --> InSequence : letter / - / . / [ / ]
AfterNlPlus --> Scanning : other → backtrack
2026-04-29 22:52:42 +02:00
InSequence --> AfterSequence : '\\n' / '\\r'
2026-04-16 22:38:20 +02:00
InSequence --> InSequence : letter / - / . / [ / ]
InSequence --> Scanning : other → backtrack
2026-04-29 22:52:42 +02:00
AfterSequence --> AfterSequence : '\\n' / '\\r'
2026-04-16 22:38:20 +02:00
AfterSequence --> InHeader : other
2026-04-29 22:52:42 +02:00
InHeader --> FoundAt : '@' (save cut)
InHeader --> Scanning : '\\n' / '\\r' → backtrack
2026-04-16 22:38:20 +02:00
InHeader --> InHeader : other
2026-04-29 22:52:42 +02:00
FoundAt --> [*] : '\\n' / '\\r' ✓
FoundAt --> InHeader : other< / code > < / pre >
2026-04-16 22:38:20 +02:00
< p > < code > restart< / code > is updated each time a < code > +< / code > is found. When any state fails its expected input, the scan jumps back to < code > restart< / code > and continues from there — guaranteeing that a < code > @< / code > in a quality line cannot be accepted as a record start, because the < code > \n+\n< / code > structure immediately following it (going backward) will not be found.< / p >
< p > Returns the byte offset of the < code > @< / code > that starts the last complete record.< / p >
2026-04-29 22:52:42 +02:00
< / article >
< / div >
2026-04-16 22:38:20 +02:00
< script > var target = document . getElementById ( location . hash . slice ( 1 ) ) ; target && target . name && ( target . checked = target . name . startsWith ( "__tabbed_" ) ) < / script >
2026-04-29 22:52:42 +02:00
< / div >
< / main >
< footer class = "md-footer" >
< div class = "md-footer-meta md-typeset" >
< div class = "md-footer-meta__inner md-grid" >
< div class = "md-copyright" >
2026-04-16 22:38:20 +02:00
Made with
2026-04-29 22:52:42 +02:00
< a href = "https://squidfunk.github.io/mkdocs-material/" rel = "noopener" target = "_blank" >
2026-04-16 22:38:20 +02:00
Material for MkDocs
< / a >
< / div >
2026-04-29 22:52:42 +02:00
< / div >
< / div >
2026-04-16 22:38:20 +02:00
< / footer >
2026-04-29 22:52:42 +02:00
< / div >
< div class = "md-dialog" data-md-component = "dialog" >
< div class = "md-dialog__inner md-typeset" > < / div >
< / div >
< script id = "__config" type = "application/json" > { "annotate" : null , "base" : "../.." , "features" : [ ] , "search" : "../../assets/javascripts/workers/search.2c215733.min.js" , "tags" : null , "translations" : { "clipboard.copied" : "Copied to clipboard" , "clipboard.copy" : "Copy to clipboard" , "search.result.more.one" : "1 more on this page" , "search.result.more.other" : "# more on this page" , "search.result.none" : "No matching documents" , "search.result.one" : "1 matching document" , "search.result.other" : "# matching documents" , "search.result.placeholder" : "Type to start searching" , "search.result.term.missing" : "Missing" , "select.version" : "Select version" } , "version" : null } < / script >
< script src = "../../assets/javascripts/bundle.79ae519e.min.js" > < / script >
< script src = "https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js" > < / script >
< / body >
2026-04-16 22:38:20 +02:00
< / html >