first implementation but far to be optimal

This commit is contained in:
Eric Coissac
2026-04-16 22:38:20 +02:00
commit de3f9b16cf
19336 changed files with 380276 additions and 0 deletions
+2133
View File
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,9 @@
# Manipulated sequences
- We consider sequences only as a compact form of a set of overlapping kmers
- The largest kmers we consider are 31-mers
- We only consider odd k
- all sequences match /^[acgtACGT]+$/
- maximum length 256 nucleotides
- minimum length k
+35
View File
@@ -0,0 +1,35 @@
/* docs/css/extra.css */
/* Main styles for the container and its text */
.ps-root {
font-family: "Courier New", monospace;
font-size: 0.9em;
line-height: 1.5;
}
/* Keyword styles */
.ps-keyword {
font-weight: bold;
color: #d73a49; /* A nice shade of red */
}
/* --- INDENTATION FIX --- */
/* Targets every indentation level (any class containing "ps-indent-") and
   renders it as an inline block; the left margin is set per level below */
[class*="ps-indent-"] {
display: inline-block;
}
.ps-indent-1 {
margin-left: 2em;
}
.ps-indent-2 {
margin-left: 4em;
}
.ps-indent-3 {
margin-left: 6em;
}
.ps-indent-4 {
margin-left: 8em;
}
.ps-indent-5 {
margin-left: 10em;
}
+230
View File
@@ -0,0 +1,230 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" default-locale="en-US">
<info>
<title>Ecology Letters</title>
<id>http://www.zotero.org/styles/ecology-letters</id>
<link href="http://www.zotero.org/styles/ecology-letters" rel="self"/>
<link href="http://www.zotero.org/styles/apa" rel="template"/>
<link href="http://onlinelibrary.wiley.com/journal/10.1111/%28ISSN%291461-0248/homepage/ForAuthors.html" rel="documentation"/>
<author>
<name>David Kaplan</name>
<email>david.kaplan@ird.fr</email>
</author>
<contributor>
<name>Sebastian Karcher</name>
</contributor>
<category citation-format="author-date"/>
<category field="biology"/>
<issn>1461-023X</issn>
<eissn>1461-0248</eissn>
<updated>2023-10-11T10:45:32+00:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
<!-- Container of the cited item. For chapters and conference papers: the capitalized "in" term
     followed by ": ", the italic container title, the collection title (prefixed by ", ") and the
     editors/translators in parentheses. For every other type: the short-form italic container
     title and the collection title, separated by ", ". -->
<macro name="container">
<choose>
<if type="chapter paper-conference" match="any">
<text term="in" text-case="capitalize-first" suffix=": "/>
<text variable="container-title" font-style="italic"/>
<text variable="collection-title" prefix=", "/>
<names variable="editor translator" prefix=" (" delimiter=", " suffix=")">
<!-- Short role label is placed before the names. -->
<label form="short" suffix=" "/>
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
</names>
</if>
<else>
<group delimiter=", ">
<text variable="container-title" font-style="italic" form="short"/>
<text variable="collection-title"/>
</group>
</else>
</choose>
</macro>
<!-- Full author list: every name is written in sort order (family name first), given names are
     reduced to initials followed by ".", and the last name is joined with the "and" symbol.
     When no author is present the editor, then the translator, then the "title" macro is
     substituted. -->
<macro name="author">
<names variable="author">
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
<!-- Role label in parentheses after the names. -->
<label form="short" prefix=" (" suffix=")" text-case="capitalize-first"/>
<et-al font-style="italic"/>
<substitute>
<names variable="editor"/>
<names variable="translator"/>
<text macro="title"/>
</substitute>
</names>
</macro>
<!-- Short author form. When no author is present the editor, then the translator, then the
     short title is substituted: italic for the item types listed below, quoted for all others. -->
<macro name="author-short">
<names variable="author">
<name form="short" and="symbol" delimiter=", " initialize-with=". "/>
<et-al font-style="italic"/>
<substitute>
<names variable="editor"/>
<names variable="translator"/>
<choose>
<if type="bill book graphic legal_case legislation motion_picture report song" match="any">
<text variable="title" form="short" font-style="italic"/>
</if>
<else>
<text variable="title" form="short" quotes="true"/>
</else>
</choose>
</substitute>
</names>
</macro>
<!-- Access details, rendered for webpages only: the capitalized "available at" term followed by
     ": " and the URL, then " Last accessed " and the access date written as day month year. -->
<macro name="access">
<choose>
<if type="webpage">
<group>
<text term="available at" text-case="capitalize-first" suffix=": "/>
<text variable="URL" suffix="."/>
</group>
<!-- The literal label is grouped with the date so that "Last accessed" is suppressed when the
     item has no access date; with a date present the output is unchanged. -->
<group prefix=" " delimiter=" ">
<text value="Last accessed"/>
<date variable="accessed">
<date-part name="day" suffix=" "/>
<date-part name="month" suffix=" "/>
<date-part name="year"/>
</date>
</group>
</if>
</choose>
</macro>
<!-- Title of the item. Reports: italic title followed by the genre and " No. " number in
     parentheses. The other listed types and webpages: italic title. Everything else: plain title. -->
<macro name="title">
<choose>
<if type="report" match="any">
<text variable="title" font-style="italic"/>
<group prefix=" (" suffix=")">
<text variable="genre"/>
<text variable="number" prefix=" No. "/>
</group>
</if>
<!-- "report" is listed again here but is already handled by the branch above. -->
<else-if type="bill book graphic legal_case legislation motion_picture report song speech" match="any">
<text variable="title" font-style="italic"/>
</else-if>
<else-if type="webpage">
<text variable="title" font-style="italic"/>
</else-if>
<else>
<text variable="title"/>
</else>
</choose>
</macro>
<!-- Publisher and place, separated by ", ". For every type except reports the genre, followed
     by ". ", is written first (reports print their genre in the "title" macro instead). -->
<macro name="publisher">
<choose>
<if type="report" match="any">
<group delimiter=", ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</if>
<else>
<text variable="genre" suffix=". "/>
<group delimiter=", ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</else>
</choose>
</macro>
<!-- Event at which the item was presented: the capitalized "presented at" term followed by the
     event name. Renders nothing when the item has no event. -->
<macro name="event">
<choose>
<if variable="event">
<text term="presented at" text-case="capitalize-first" suffix=" "/>
<text variable="event"/>
</if>
</choose>
</macro>
<!-- Year of the item. Uses the issued year when available; otherwise, for webpages that have an
     access date, the access year; in every remaining case the short "no date" term. -->
<macro name="issued">
<choose>
<if variable="issued">
<date variable="issued">
<date-part name="year"/>
</date>
</if>
<else-if variable="accessed">
<choose>
<!-- Only webpages fall back to the access year. -->
<if type="webpage">
<date variable="accessed">
<date-part name="year"/>
</date>
</if>
<else>
<text term="no date" form="short"/>
</else>
</choose>
</else-if>
<else>
<text term="no date" form="short"/>
</else>
</choose>
</macro>
<!-- Edition statement: a numeric edition is written as an ordinal followed by "edn";
     a non-numeric edition is written verbatim with a trailing ".". -->
<macro name="edition">
<choose>
<if is-numeric="edition">
<group delimiter=" ">
<number variable="edition" form="ordinal"/>
<text value="edn"/>
</group>
</if>
<else>
<text variable="edition" suffix="."/>
</else>
</choose>
</macro>
<!-- Locator details appended after the title/container part of a bibliography entry.
     Journal, magazine and newspaper articles: ", " then volume and page separated by ", ".
     Book-like types (including reports and theses): ". " then edition, event and publisher
     separated by ". ".
     Chapters and conference papers: ". " then event, publisher and labelled pages separated by ", ".
     Other types render nothing. -->
<macro name="locators">
<choose>
<if type="article-journal article-magazine article-newspaper" match="any">
<group prefix=", " delimiter=", ">
<group>
<text variable="volume"/>
</group>
<text variable="page"/>
</group>
</if>
<else-if type="bill book graphic legal_case legislation motion_picture report song thesis" match="any">
<group delimiter=". " prefix=". ">
<text macro="edition"/>
<text macro="event"/>
<text macro="publisher"/>
</group>
</else-if>
<else-if type="chapter paper-conference" match="any">
<group delimiter=", " prefix=". ">
<text macro="event"/>
<text macro="publisher"/>
<group>
<!-- Short page label followed by the page range. -->
<label variable="page" form="short" suffix=" "/>
<text variable="page"/>
</group>
</group>
</else-if>
</choose>
</macro>
<!-- In-text citations: author-date in parentheses, cites separated by "; ". Lists of three or
     more names are cut to the first name plus et al. Ambiguous cites get a year suffix, and
     year suffixes are collapsed with ", ". Cites are sorted by the "author" macro, then by the
     "issued" macro. -->
<citation et-al-min="3" et-al-use-first="1" disambiguate-add-year-suffix="true" collapse="year-suffix" year-suffix-delimiter=", ">
<sort>
<key macro="author"/>
<key macro="issued"/>
</sort>
<layout prefix="(" suffix=")" delimiter="; ">
<group delimiter=" ">
<text macro="author-short"/>
<text macro="issued"/>
</group>
</layout>
</citation>
<!-- Reference list: hanging indent, no extra spacing between entries. Lists of seven or more
     names are cut to the first six plus et al. Entries are sorted by author, then year
     (ascending), then title. Each entry is: authors, year in parentheses, title and container
     separated by ". ", locators, access details, and a final ".". -->
<bibliography et-al-min="7" et-al-use-first="6" entry-spacing="0" hanging-indent="true">
<sort>
<key macro="author"/>
<key macro="issued" sort="ascending"/>
<key macro="title"/>
</sort>
<layout>
<group suffix=".">
<text macro="author" suffix="."/>
<text macro="issued" prefix=" (" suffix="). "/>
<group delimiter=". ">
<text macro="title"/>
<text macro="container"/>
</group>
<text macro="locators"/>
<text macro="access" prefix=". "/>
</group>
</layout>
</bibliography>
</style>
+100
View File
@@ -0,0 +1,100 @@
# Chunk reader — implementation
The `obiread` crate provides a streaming iterator that reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. Chunks are consumed in parallel by downstream workers.
## Output type: rope
Each chunk is a `Vec<Bytes>` — a **rope**: a list of reference-counted byte slices that are not necessarily contiguous in memory. The consumer iterates over the slices in order.
Using `bytes::Bytes` means the split at the record boundary is O(1): `Bytes::split_to(n)` adjusts a reference counter, not memory. No `memcpy` in the common case.
## Allocation policy
| Case | Cost |
|------|------|
| Boundary found in the current block (common) | zero extra allocation — `split_to` only |
| Boundary straddles multiple blocks (sequence > block size, rare) | one allocation to pack the rope into a flat buffer |
| EOF flush | zero extra allocation |
## SeqChunkIter
```rust
pub struct SeqChunkIter<R: Read> { /* private */ }
impl<R: Read> Iterator for SeqChunkIter<R> {
type Item = io::Result<Vec<Bytes>>;
}
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R>
pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R>
```
`next()` loop:
```text
1. read one block of block_size bytes → push onto rope
2. probe check: if the boundary marker ("\n>" or "\n@") is absent from the
last block, skip the splitter (avoids a full backward scan for nothing)
3. call splitter on last block
if found at offset n:
remainder = last_block.split_to(n) ← O(1), zero copy
return std::mem::take(&mut self.rope) ← the chunk
4. if rope.len() > 1 (multi-block accumulation):
pack rope into one flat buffer ← one alloc
retry splitter on flat buffer
5. if EOF: flush remaining rope as final chunk
```
## Boundary detection — FASTA
Backward scan with a 2-state machine. Searches for `>` immediately preceded by `\n` or `\r`:
```mermaid
stateDiagram-v2
direction LR
[*] --> Scanning
Scanning --> FoundGt : '>'
FoundGt --> Scanning : other
FoundGt --> [*] : '\\n' / '\\r' ✓
```
Returns the byte offset of the `>` that starts the last complete record.
## Boundary detection — FASTQ
FASTQ records have a rigid 4-line structure (`@header`, sequence, `+`, quality). The `@` character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate `@`.
7-state machine (port of Go's `EndOfLastFastqEntry`), scanning from **right to left**. Each time a `+` is found, its position is saved as `restart`; any state mismatch resets the scan to that position.
```mermaid
stateDiagram-v2
direction LR
[*] --> Scanning
Scanning --> FoundPlus : '+' (save restart)
FoundPlus --> AfterNlPlus : '\\n' / '\\r'
FoundPlus --> Scanning : other → backtrack
AfterNlPlus --> AfterNlPlus : separator
AfterNlPlus --> InSequence : letter / - / . / [ / ]
AfterNlPlus --> Scanning : other → backtrack
InSequence --> AfterSequence : '\\n' / '\\r'
InSequence --> InSequence : letter / - / . / [ / ]
InSequence --> Scanning : other → backtrack
AfterSequence --> AfterSequence : '\\n' / '\\r'
AfterSequence --> InHeader : other
InHeader --> FoundAt : '@' (save cut)
InHeader --> Scanning : '\\n' / '\\r' → backtrack
InHeader --> InHeader : other
FoundAt --> [*] : '\\n' / '\\r' ✓
FoundAt --> InHeader : other
```
`restart` is updated each time a `+` is found. When any state fails its expected input, the scan jumps back to `restart` and continues from there — guaranteeing that a `@` in a quality line cannot be accepted as a record start, because the `\n+\n` structure immediately following it (going backward) will not be found.
Returns the byte offset of the `@` that starts the last complete record.
+60
View File
@@ -0,0 +1,60 @@
# Kmer — implementation
## Memory layout
`Kmer` is a `#[repr(transparent)]` newtype over `u64`:
```rust
#[repr(transparent)]
pub struct Kmer(u64);
```
Nucleotides are packed 2 bits each, **left-aligned**, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is **not stored** — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.
| 63–62 | 61–60 | … | 63−2(k−1)−1 to 63−2(k−1) | 63−2k down to 0 |
|-------|-------|---|--------------------------|-----------------|
| nt 0 | nt 1 | … | nt k−1 | zero padding |
## Encoding
`Kmer::from_ascii(ascii, k)` encodes the first k bytes of an ASCII slice using the shared `ENC` table (see [SuperKmer — ASCII encoding](superkmer.md#ascii-encoding-and-decoding)):
```rust
for i in 0..k {
val = (val << 2) | encode_base(ascii[i]) as u64;
}
Kmer(val << (64 - 2 * k))
```
Zero allocation — result lives on the stack.
## Decoding
`write_ascii(k, buf)` appends k ASCII characters to a caller-supplied `Vec<u8>` using the shared `DEC4` table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.
`to_ascii(k)` is a convenience wrapper that allocates and returns a `Vec<u8>`; intended for tests and display only.
## Reverse complement
Computed as pure arithmetic — no lookup table, no memory access:
```rust
let x = !self.0; // complement
let x = x.swap_bytes(); // reverse bytes
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
Kmer(x << (64 - 2 * k))
```
After complementing, bytes are reversed (`swap_bytes`), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.
## Canonical form
```rust
/// Returns the canonical orientation of this k-mer of length `k`: whichever of the
/// forward form and its reverse complement has the smaller raw `u64` value.
/// A tie keeps the forward form.
pub fn canonical(&self, k: usize) -> Self {
    let reverse = self.revcomp(k);
    // Only a strictly smaller reverse complement replaces the forward form.
    if reverse.0 < self.0 {
        return reverse;
    }
    *self
}
Lexicographic minimum of forward and reverse-complement, comparing the raw `u64` values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.
+48
View File
@@ -0,0 +1,48 @@
# MPHF selection — analysis in progress
The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated.
## Candidates
**boomphf** (BBHash algorithm, maintained by 10X Genomics):
- ~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)
- Parallel construction; well-tested with DNA kmer data at scale
- Drawback: largest space footprint of the three
**ptr_hash** (PtrHash algorithm, Groot Koerkamp, SEA 2025):
- ~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×)
- Theoretical foundation solid; paper and Rust crate from the same author
- Drawback: published February 2025 — very young, no production track record
**FMPHGO** (`ph` crate, Beling, ACM JEA 2023):
- ~2.1 bits/key — most compact of the three; good query speed; parallelisable construction
- More established than ptr_hash; actively maintained
- Currently preferred candidate
## Space at scale
For 1 024 partitions × 100 M kmers/partition:
| MPHF | bits/key | Total MPHF size |
|---------|----------|-----------------|
| boomphf | 3.7 | ~47 GB |
| ptr_hash | 2.4 | ~31 GB |
| FMPHGO | 2.1 | ~27 GB |
In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 3–30 M kmers → 1–8 MB per MPHF, well within RAM.
## On-disk and mmap considerations
All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the `ph` crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 1–8 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.
No established Rust crate provides a natively on-disk MPHF. **SSHash** (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.
## Open questions
- Confirm actual partition sizes on representative metagenomic datasets before fixing the choice.
- Evaluate whether ptr_hash's query speed advantage (2.1–3.3×) justifies adopting a crate that is less than a year old.
- Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary.
- Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.
+161
View File
@@ -0,0 +1,161 @@
# Construction pipeline
All phases after scatter are embarrassingly parallel across partitions.
## Phase 0 — Parameter estimation
The construction parameters p, n, and min_count depend on the kmer frequency spectrum of the dataset. Estimating this spectrum before construction avoids costly re-partitioning if p is badly chosen.
Two approaches are supported:
- **External estimation (preferred):** run [NT-CARD](https://github.com/bcgsc/ntCard) on the input files and pass its histogram output to `obikmer build`. NT-CARD produces a kmer frequency histogram in a single streaming pass using ntHash and a Flajolet-Martin-style estimator; obikmer reads this file and derives p, n, and min_count automatically.
- **Internal estimation (future):** an `obikmer estimate` subcommand for users who prefer a single-tool workflow. The implementation would combine two components: (1) **ntHash**, a rolling hash that updates the kmer hash in O(1) per nucleotide by incrementally adding the incoming base and removing the outgoing one — Rust crates exist; (2) a **Flajolet-Martin-style streaming estimator** that maintains a small table of minimum hash values and infers the frequency histogram from their statistical distribution, as described in the NT-CARD paper [@Mohamadi2017-ok].
The histogram gives:
- **F0** (number of distinct kmers) → sets p (target ~10M kmers/partition → p = ⌈log₂(F0 / 10M)⌉)
- **frequency distribution** → sets n (choose n so that fewer than 1% of kmers overflow)
- **error valley** → suggests min_count (typically the local minimum between the error peak and the coverage peak)
## Phase 1 — Scatter
Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:
1. **Ambiguous base filter**: cut at any non-ACGT base; discard fragments shorter than k.
2. **Entropy filter**: scan each fragment with a sliding window of size k. When the kmer $K_i = S[i \mathinner{..} i+k-1]$ ended by nucleotide $S[j]$ (with $j = i+k-1$) has entropy below threshold $\theta$, emit the current segment and start a new one (see algorithm below). $K_i$ belongs to neither segment, and no valid kmer is lost.
3. **Length filter**: discard any segment shorter than k produced by step 2.
4. **Super-kmer extraction**: for each clean segment, slide a minimizer window and group consecutive kmers sharing the same canonical minimizer; canonise each super-kmer by lexicographic comparison with its reverse complement (early exit).
5. **Partition routing**: `hash(canonical_minimizer) → PART` → append super-kmer to `partition/superkmers.bin.gz`.
**Segmentation behavior:**
When $K_i$ (ended by $S[j]$, $j = i+k-1$) fails the entropy threshold:
- Current segment $S[\textit{seg_start} \mathinner{..} j-1]$ is emitted (last valid kmer = $K_{i-1}$)
- New segment starts at $S[i+1]$ (first new kmer = $K_{i+1}$)
- $K_i$ is excluded: current segment lacks $S[j]$, new segment lacks $S[i]$
- Overlap = $S[i+1 \mathinner{..} j-1]$ = $k-2$ nucleotides
!!! abstract "Algorithm — Entropy filter: sliding window segmentation"
```text
procedure EntropyFilter(S, N, k, θ):
seg_start ← 0
window ← []
for j ← 0 to N−1:
window.push(S[j])
if |window| < k: continue
i ← j − k + 1
if entropy(window) ≤ θ:
emit S[seg_start .. j−1]
seg_start ← i + 1
window ← S[i+1 .. j]
else:
window.pop_front()
emit S[seg_start .. N−1]
```
Writes are sequential and append-only — IO-friendly. Gzip applied at write time. Data volume ≈ raw genome size (2 bits/nt compaction offsets header overhead).
## Phase 2 — Dereplication
Performed independently per partition. Identical super-kmers are consolidated and their COUNT accumulated — analogous to amplicon dereplication in metabarcoding. Uses external bucket sort to stay within RAM bounds:
**Pass 1** (streaming): hash the nucleotide payload of each super-kmer, route to one of B bucket files:
```
hash(sequence) % B → bucket_i.bin
```
B ≈ 100 is tunable; RAM needed ≈ partition_size / B.
**Pass 2**: for each bucket, load into an in-memory `HashMap<sequence, COUNT>`, dereplicate by summing COUNT values, write consolidated super-kmers.
After dereplication: at N× coverage the partition shrinks by a factor of ~N (errors aside). The COUNT field in each super-kmer header = number of times that exact super-kmer sequence was observed across all input reads.
**Important:** super-kmer COUNT ≠ individual kmer count. A kmer can appear in multiple distinct super-kmers (same partition, different flanking context); its true count = sum of COUNT of all super-kmers containing it. A super-kmer with COUNT=1 may contain only high-abundance kmers, each appearing in many other super-kmers. Abundance filtering therefore cannot be applied at this phase.
## Phase 3 — Per-kmer count aggregation and quorum filtering
For each dereplicated super-kmer, enumerate its kmers and accumulate counts:
```
for each super-kmer (sequence, COUNT):
for each kmer in sequence:
kmer_counts[canonical(kmer)] += COUNT
```
Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.
Abundance filter applied here: kmers with `total_count < q` are discarded. `q` is a collection parameter (0 = keep all, including singletons for ≤1x data).
No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.
## Phase 4 — Super-kmer compaction
The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:
```
for each dereplicated super-kmer:
scan kmer by kmer
kmer not in valid set → break point (terminates current super-kmer)
kmer in valid set → extend current super-kmer
```
Three cases per super-kmer:
- **All kmers valid** → copied as-is
- **No kmer valid** → discarded
- **Mixed** → split into sub-super-kmers at invalid boundaries; each sub-super-kmer inherits the original COUNT
After splitting, re-apply dereplication (bucket sort, phase 2 method) — splitting can produce new identical super-kmers. This re-dereplication is cheap: the volume is already greatly reduced.
Output: a clean super-kmer file where every kmer passes quorum. This file feeds phase 5.
## Phase 5 — Local de Bruijn graph and unitig construction
Within each partition, build a **local de Bruijn graph** from the valid kmer set and compute its unitigs. All operations are local to the partition — no cross-partition communication.
```
valid kmers → HashSet<u64>
for each kmer K:
out_degree = |{K[1:]+b | b ∈ {A,C,G,T}} ∩ HashSet|
in_degree = |{b+K[:-1] | b ∈ {A,C,G,T}} ∩ HashSet|
internal node ↔ in_degree=1 AND out_degree=1
branching / dead-end → unitig start or end
```
Traverse non-branching paths to assemble unitigs. Kmers whose neighbours fall in other partitions appear as dead ends locally — they terminate the unitig. The result: **each kmer appears in exactly one unitig** within the partition.
The partition size (controlled by p) must be calibrated so that the HashSet fits in RAM during this phase.
Output: `unitigs.bin` — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.
**Scope of local unitigs:** these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose k-1 successor or predecessor falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.
## Phase 6 — MPHF construction and index finalisation
Built once on the definitive kmer set (all kmers in all unitigs of the partition):
```
kmers from unitigs → MPHF → mphf.bin
→ counts.bin : packed n-bit array (or 1-bit for presence mode)
→ refs.bin : u32 nucleotide offset into unitigs.bin per kmer
```
The MPHF is built once — no rebuild. The n-bit width for `counts.bin` is chosen from the observed count distribution (n=5 covers ~97% of kmers at 15x; n=1 for presence mode). Counts exceeding 2ⁿ−2 (the value 2ⁿ−1 is reserved as the overflow marker) go into `overflow.bin` as sorted `(mphf_index: u32, count: u32)` pairs.
**Exact verification via unitig evidence:**
`unitigs.bin` serves as the evidence structure: for any query kmer, the stored unitig provides the ground truth to confirm or deny its presence. The MPHF maps every input to [0, N) including absent kmers — the unitig read-back is the only way to guarantee exactness.
```
query kmer q
→ canonical_minimizer(q) → hash → PART → part_XXXX/
→ MPHF(q) → index i
→ refs[i] = (unitig_id, kmer_offset)
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
→ match : return counts[i] ← exact hit
→ no match: kmer absent ← MPHF collision on absent kmer
```
One random disk access into `unitigs.bin` per query; the unitig is the minimal, non-redundant evidence (each kmer stored once). `superkmers.bin.gz` is no longer needed at this point and can be deleted.
+61
View File
@@ -0,0 +1,61 @@
# On-disk collection structure
Collections are too large to hold in RAM (hundreds of genomes, billions of kmers). The collection lives on disk as a directory of memory-mapped files:
```
collection/
metadata.toml — collection parameters (see below)
part_XXXX/
superkmers.bin.gz — dereplicated super-kmers for this partition (construction artifact)
mphf.bin — minimal perfect hash function for this partition
counts.bin — packed n-bit count array (or 1-bit presence array)
refs.bin — back-references u32 nucleotide offset into unitigs.bin per kmer
unitigs.bin — local de Bruijn unitigs (permanent evidence structure)
overflow.bin — counts exceeding the packed range (optional)
```
`superkmers.bin.gz` is produced during phase 1 and consumed through phases 2–4. It can be deleted after phase 5 — it is not needed for querying. The permanent query structure is `mphf.bin + counts.bin + refs.bin + unitigs.bin`.
## Collection parameters
Stored in `metadata.toml`:
| Parameter | Role |
|-----------|------|
| k | kmer length |
| m | minimizer length (odd, < k) |
| p | partition bits (0 ≤ p ≤ min(14, 2m−16)) |
| mode | `presence` (1 bit/kmer) or `count` (n bits/kmer) |
| n | bits per kmer in count mode (chosen at construction) |
| min_count | singleton filtering threshold (0 = keep all) |
| hash_fn | hash function identifier |
| hash_seed | seed for the hash function |
## Count storage
**refs.bin capacity:** `unitigs.bin` is a flat 2-bit-packed nucleotide stream with no separators. Each entry in `refs.bin` is a u32 nucleotide offset pointing to the first base of the kmer. A u32 covers 4 billion nucleotide positions = 1 GB of sequence per partition. In the worst case (all unitigs of length 1 kmer, offsets spaced k apart), this supports 4 billion / k ≈ 130 million kmers per partition at k=31. In the typical case (long unitigs, consecutive kmers at offset +1), the limit approaches 4 billion kmers — well beyond any realistic partition size.
*Presence mode* (coverage ≤ 1x, or when only presence/absence matters):
- `counts.bin` is a packed 1-bit array — all bits set to 1 for indexed kmers
- Singletons are the signal, not filtered
*Count mode* (coverage > 1x):
- `counts.bin` is a packed n-bit array; n chosen at construction from the observed distribution
- Value 0: absent sentinel; values 1..2ⁿ−2: direct count; value 2ⁿ−1: overflow
- Overflow counts stored in a separate `overflow.bin` as sorted `(index: u32, count: u32)` pairs
- Empirically (k=31, 15x coverage): n=5 covers 97% of real kmers, n=6 covers 99%
- min_count threshold filters low-frequency kmers (errors) before indexing; for ≤1x, min_count=0
## Query protocol
```
query kmer q
→ canonical_minimizer(q) → hash → PART → part_XXXX/
→ MPHF(q) → index i
→ refs[i] = (unitig_id, kmer_offset)
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
→ match : return counts[i]
→ no match: kmer absent
```
+114
View File
@@ -0,0 +1,114 @@
# SuperKmer — implementation
## Memory layout
A super-kmer is stored as a **32-bit header** followed by a **byte-aligned nucleotide sequence** (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):
| Field | Bits | Role |
|-------|------|------|
| COUNT | 24 | Occurrence count (≤ 16 M) |
| SEQL | 8 | Sequence length in nucleotides (1–256) |
Bit layout (MSB to LSB): `[31:8] COUNT [7:0] SEQL`
SEQL is stored as a raw `u8`: values 1–255 represent lengths 1–255; **0 represents 256** (wrapping convention). The public accessor returns a `usize` and performs the conversion:
```rust
/// Sequence length in nucleotides, read from the low 8 bits of the header.
/// A stored value of 0 encodes a length of 256.
fn seql(&self) -> usize {
    let s = self.0 & 0xFF; // bits [7:0] hold SEQL
    if s == 0 { 256 } else { s as usize }
}
/// Occurrence count, read from bits [31:8] of the header.
fn count(&self) -> u32 { self.0 >> 8 }
/// Adds one to COUNT without touching SEQL.
fn increment(&mut self) { self.0 += 1 << 8; }
/// Adds `n` to COUNT without touching SEQL.
fn add(&mut self, n: u32) { self.0 += n << 8; }
/// Replaces COUNT with `n`, keeping the SEQL byte.
fn set_count(&mut self, n: u32) { self.0 = (self.0 & 0xFF) | (n << 8); }
```
The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k−1 overlap, preserving all kmers without duplication.
The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.
## ASCII encoding and decoding
Two lookup tables handle ASCII ↔ 2-bit conversion:
- **`ENC: [u8; 32]`** — indexed by `b & 0x1F` (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.
- **`DEC4: [u32; 256]`** — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian `u32`. 1 KB total, fits in L1 cache. One lookup per output byte yields 4 decoded characters.
Encoding 4 nucleotides into one byte:
```rust
byte = ENC[c0 & 0x1F] << 6 | ENC[c1 & 0x1F] << 4 | ENC[c2 & 0x1F] << 2 | ENC[c3 & 0x1F]
```
Decoding one byte into 4 ASCII characters:
```rust
DEC4[byte].to_be_bytes() // [nuc0, nuc1, nuc2, nuc3] in ASCII
```
## Reverse complement
The reverse complement is computed **in place** with zero allocation in two steps.
**Step 1 — byte swap with `REVCOMP4`.** A 256-byte lookup table `REVCOMP4` maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying `REVCOMP4` to each:
```rust
/// Reverse-complements the four 2-bit nucleotides packed in one byte.
const fn revcomp4(x: u8) -> u8 {
    // Complement every base, then exchange the two nibbles.
    let swapped = (!x).rotate_left(4);
    // Exchange the two 2-bit groups inside each nibble to finish the reversal.
    ((swapped & 0xCC) >> 2) | ((swapped & 0x33) << 2)
}
```
`REVCOMP4` is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.
**Step 2 — realignment.** After step 1, `padding = n × 8 − SEQL × 2` spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using `BitSlice<u8, Msb0>::rotate_left(padding)` from the `bitvec` crate, which is SIMD-accelerated. The trailing `padding` bits are then zeroed:
```rust
shift = n * 8 - SEQL * 2 // number of padding bits
bits.rotate_left(shift)
bits[len - shift..].fill(false)
```
`Msb0` ordering makes the bit layout hardware-independent.
!!! abstract "Algorithm — Super-kmer canonisation"
```text
procedure SuperKmerCanonical(seq, SEQL):
for i ← 0 to SEQL − 1:
fwd ← nucleotide(seq, i)
rev ← complement(nucleotide(seq, SEQL − 1 − i))
if fwd < rev: return seq -- forward is canonical
if fwd > rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
return seq -- palindrome: either orientation valid
```
## Kmer extraction
A k-mer is extracted from a super-kmer with `SuperKmer::kmer(i, k)`, which returns a `Kmer` — a left-aligned `u64` newtype (see [Kmer implementation](kmer.md)):
```rust
pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError>
```
The bit slice `seq[i*2 .. (i+k)*2]` (Msb0 order) is loaded as a big-endian `u64` via `bitvec::load_be`, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.
---
!!! abstract "Algorithm — Super-kmer reverse complement"
```text
procedure SuperKmerRevcomp(seq, SEQL):
n ← ⌈SEQL / 4⌉ -- number of bytes
shift ← n × 8 − SEQL × 2 -- padding bits to flush
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
lo ← 0 ; hi ← n − 1
while lo < hi:
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
lo ← lo + 1 ; hi ← hi − 1
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
if shift > 0:
bits.rotate_left(shift)
bits[n×8 − shift .. n×8].fill(0)
```
+15
View File
@@ -0,0 +1,15 @@
# obikmer
`obikmer` is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.
## Constraints
- Target scale: metagenomic data, tens of Gbases, billions of kmers
- Maximum efficiency in computation, memory, and disk usage
- Input formats: FASTA, FASTQ, gzip, streaming stdin
## Priority operations
- Kmer counting (frequencies)
- Fast search / query
- Set operations: union, intersection, difference
+243
View File
@@ -0,0 +1,243 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
%% Saved with string encoding Unicode (UTF-8)
@article{Zheng2020-ji,
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
string, with the guarantee that similar set of k-mers will be
chosen on similar strings. It is parameterized by the k-mer
length k, a window length w and an order on the k-mers.
Minimizers are used in a large number of softwares and pipelines
to improve computation efficiency and decrease memory usage.
Despite the method's popularity, many theoretical questions
regarding its performance remain open. The core metric for
measuring performance of a minimizer is the density, which
measures the sparsity of sampled k-mers. The theoretical optimal
density for a minimizer is 1/w, provably not achievable in
general. For given k and w, little is known about asymptotically
optimal minimizers, that is minimizers with density O(1/w).
RESULTS: We derive a necessary and sufficient condition for
existence of asymptotically optimal minimizers. We also provide a
randomized algorithm, called the Miniception, to design
minimizers with the best theoretical guarantee to date on density
in practical scenarios. Constructing and using the Miniception is
as easy as constructing and using a random minimizer, which
allows the design of efficient minimizers that scale to the
values of k and w used in current bioinformatics software
programs. AVAILABILITY AND IMPLEMENTATION: Reference
implementation of the Miniception and the codes for analysis can
be found at https://github.com/kingsford-group/miniception.
SUPPLEMENTARY INFORMATION: Supplementary data are available at
Bioinformatics online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btaa472},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl\_1},
pages = {i119--i127},
pmc = {PMC8248892},
pmid = 32657376,
publisher = {Oxford University Press (OUP)},
title = {Improved design and analysis of practical minimizers},
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
volume = 36,
year = 2020,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
@article{Zheng2021-cc,
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
from genomic sequences that unconditionally preserve sufficiently
long matches between sequences. Well-established methods to
construct efficient minimizers focus on sampling fewer k-mers on
a random sequence and use universal hitting sets (sets of k-mers
that appear frequently enough) to upper bound the sketch size. In
contrast, the problem of sequence-specific minimizers, which is
to construct efficient minimizers to sample fewer k-mers on a
specific sequence such as the reference genome, is less studied.
Currently, the theoretical understanding of this problem is
lacking, and existing methods do not specialize well to sketch
specific sequences. RESULTS: We propose the concept of polar
sets, complementary to the existing idea of universal hitting
sets. Polar sets are k-mer sets that are spread out enough on the
reference, and provably specialize well to specific sequences.
Link energy measures how well spread out a polar set is, and with
it, the sketch size can be bounded from above and below in a
theoretically sound way. This allows for direct optimization of
sketch size. We propose efficient heuristics to construct polar
sets, and via experiments on the human reference genome, show
their practical superiority in designing efficient
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
reference implementation and code for analyses under an
open-source license are at
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
INFORMATION: Supplementary data are available at Bioinformatics
online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btab313},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl\_1},
pages = {i187--i195},
pmc = {PMC8686682},
pmid = 34252928,
publisher = {Oxford University Press (OUP)},
title = {Sequence-specific minimizers via polar sets},
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
volume = 37,
year = 2021,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
@article{Pan2024-hb,
abstract = {MOTIVATION: The minimizer concept is a data structure for
sequence sketching. The standard canonical minimizer selects a
subset of k-mers from the given DNA sequence by comparing the
forward and reverse k-mers in a window simultaneously according
to a predefined selection scheme. It is widely employed by
sequence analysis such as read mapping and assembly. k-mer
density, k-mer repetitiveness (e.g. k-mer bias), and
computational efficiency are three critical measurements for
minimizer selection schemes. However, there exist trade-offs
between kinds of minimizer variants. Generic, effective, and
efficient are always the requirements for high-performance
minimizer algorithms. RESULTS: We propose a simple minimizer
operator as a refinement of the standard canonical minimizer. It
takes only a few operations to compute. However, it can improve
the k-mer repetitiveness, especially for the lexicographic order.
It applies to other selection schemes of total orders (e.g.
random orders). Moreover, it is computationally efficient and the
density is close to that of the standard minimizer. The refined
minimizer may benefit high-performance applications like binning
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
code of the benchmark in this work is available at the github
repository https://github.com/xp3i4/mini\_benchmark.},
author = {Pan, Chenxu and Reinert, Knut},
doi = {10.1093/bioinformatics/btae045},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = feb,
number = 2,
pmc = {PMC10868324},
pmid = 38269626,
publisher = {Oxford University Press (OUP)},
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
volume = 40,
year = 2024,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
@article{Kille2023-px,
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
a convenient proxy for sequence identity. By avoiding expensive
base-level alignments and comparing reduced sequence
representations, tools such as MashMap can scale to massive
numbers of pairwise comparisons while still providing useful
similarity estimates. However, due to their reliance on minimizer
winnowing, previous versions of MashMap were shown to be biased
and inconsistent estimators of Jaccard similarity. This directly
impacts downstream tools that rely on the accuracy of these
estimates. RESULTS: To address this, we propose the minmer
winnowing scheme, which generalizes the minimizer scheme by use
of a rolling minhash with multiple sampled k-mers per window. We
show both theoretically and empirically that minmers yield an
unbiased estimator of local Jaccard similarity, and we implement
this scheme in an updated version of MashMap. The minmer-based
implementation is over 10 times faster than the minimizer-based
version under the default ANI threshold, making it well-suited
for large-scale comparative genomics applications. AVAILABILITY
AND IMPLEMENTATION: MashMap3 is available at
https://github.com/marbl/MashMap.},
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
doi = {10.1093/bioinformatics/btad512},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = sep,
number = 9,
pmc = {PMC10505501},
pmid = 37603771,
publisher = {Oxford University Press (OUP)},
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
volume = 39,
year = 2023,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
@incollection{Golan2025-xf,
address = {Cham},
author = {Golan, Shay and Shur, Arseny M},
booktitle = {Lecture Notes in Computer Science},
doi = {10.1007/978-3-031-82670-2\_25},
isbn = {9783031826696,9783031826702},
issn = {0302-9743,1611-3349},
language = {en},
pages = {347--360},
publisher = {Springer Nature Switzerland},
series = {Lecture Notes in Computer Science},
title = {Expected density of random minimizers},
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
year = 2025,
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
@article{Mohamadi2017-ok,
abstract = {Motivation: Many bioinformatics algorithms are designed for the
analysis of sequences of some uniform length, conventionally
referred to as k -mers. These include de Bruijn graph assembly
methods and sequence alignment tools. An efficient algorithm to
enumerate the number of unique k -mers, or even better, to build
a histogram of k -mer frequencies would be desirable for these
tools and their downstream analysis pipelines. Among other
applications, estimated frequencies can be used to predict genome
sizes, measure sequencing error rates, and tune runtime
parameters for analysis tools. However, calculating a k -mer
histogram from large volumes of sequencing data is a challenging
task. Results: Here, we present ntCard, a streaming algorithm for
estimating the frequencies of k -mers in genomics datasets. At
its core, ntCard uses the ntHash algorithm to efficiently compute
hash values for streamed sequences. It then samples the
calculated hash values to build a reduced representation
multiplicity table describing the sample distribution. Finally,
it uses a statistical model to reconstruct the population
distribution from the sample distribution. We have compared the
performance of ntCard and other cardinality estimation
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
in size, where the first two representing whole genome shotgun
sequencing experiments on the human genome and the last one on
the white spruce genome. Results show ntCard estimates k -mer
coverage frequencies >15× faster than the state-of-the-art
algorithms, using similar amount of memory, and with higher
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
potentially enabling technology for large-scale genomics
applications. Availability and Implementation: ntCard is written
in C ++ and is released under the GPL license. It is freely
available at https://github.com/bcgsc/ntCard. Contact:
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
Supplementary data are available at Bioinformatics online.},
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
date-modified = {2026-04-18 08:19:36 +0200},
doi = {10.1093/bioinformatics/btw832},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = may,
number = 9,
pages = {1324--1330},
pmc = {PMC5408799},
pmid = 28453674,
publisher = {Oxford University Press (OUP)},
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
volume = 33,
year = 2017,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}
+38
View File
@@ -0,0 +1,38 @@
# DNA encoding
## 2-bit nucleotide encoding
All nucleotides are encoded on 2 bits, MSB-first within each word. Nucleotides are numbered 0-based from the 5′ end across all sequence types:
| Base | Encoding |
|------|----------|
| A | `00` |
| C | `01` |
| G | `10` |
| T | `11` |
The Watson-Crick complement of any base is its bitwise NOT on 2 bits: `complement(base) = ~base & 0b11`.
## Kmer encoding
A kmer fits in a single `u64`. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): `(kmer >> (62 - 2*i)) & 0b11`.
Reverse complement is computed via a **16-bit lookup table** (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.
!!! abstract "Algorithm — Kmer reverse complement"
```text
procedure KmerRevcomp(kmer, k):
raw ← TABLE16[kmer & 0xFFFF] << 48
| TABLE16[(kmer >> 16) & 0xFFFF] << 32
| TABLE16[(kmer >> 32) & 0xFFFF] << 16
| TABLE16[(kmer >> 48) & 0xFFFF]
return raw << (64 - 2*k)
```
The **canonical form** is the lexicographic minimum of the kmer and its reverse complement:
```
canonical(kmer) = min(kmer, revcomp(kmer))
```
This halves the kmer space and ensures strand-independent counting.
+68
View File
@@ -0,0 +1,68 @@
# Kmer entropy filter
Low-complexity kmers (polyA, polyT, tandem repeats) are detected and excluded during phase 1. The filter computes a **normalized Shannon entropy** over sub-words of multiple sizes, corrected for two sources of bias: the small number of observations within a single kmer, and the unequal sizes of circular equivalence classes.
## Sub-word frequencies
For a kmer of length k and a sub-word size ws (1 ≤ ws ≤ ws_max, typically ws_max = 6), extract the $n_{\text{words}} = k - ws + 1$ overlapping sub-words by sliding a window of length ws:
$$w_i = \text{kmer}[i \mathinner{..} i+ws-1], \quad i = 0, \ldots, n_{\text{words}}-1$$
Each sub-word is mapped to its **circular canonical form**: the lexicographic minimum among all cyclic rotations of the word **and all cyclic rotations of its reverse complement**. This extended equivalence relation ensures that entropy(K) = entropy(revcomp(K)) — the filter is strand-symmetric. Let $s_j$ be the size of equivalence class $j$ (number of distinct raw words mapping to canonical form $j$), and $f_j$ the count of canonical form $j$ among the $n_{\text{words}}$ sub-words ($\sum_j f_j = n_{\text{words}}$).
## Corrected Shannon entropy
The circular equivalence classes have unequal sizes: under a uniform distribution over all $4^{ws}$ raw words, class $j$ is visited with probability $s_j / 4^{ws}$, not $1/n_a$ (where $n_a$ denotes the number of canonical classes). Computing entropy directly over canonical classes therefore underestimates the entropy of a random sequence.
The correction "unfolds" each canonical class back to its member raw words, redistributing each observation of class $j$ equally among its $s_j$ members:
$$H_{\text{corr}} = \log(n_{\text{words}}) - \frac{1}{n_{\text{words}}} \sum_j f_j \log f_j + \frac{1}{n_{\text{words}}} \sum_j f_j \log s_j$$
The last term is the correction for unequal class sizes. For a uniformly random sequence ($f_j \approx n_{\text{words}} \cdot s_j / 4^{ws}$), this gives $H_{\text{corr}} \approx \log(4^{ws}) = 2 \cdot ws \cdot \log 2$, the maximum entropy over raw words.
## Maximum entropy correction for small samples
With only $n_{\text{words}}$ observations over $4^{ws}$ possible raw words, the achievable maximum entropy is bounded by the most uniform integer distribution over $4^{ws}$ categories.
Let $c = \lfloor n_{\text{words}} / 4^{ws} \rfloor$ and $r = n_{\text{words}} \bmod 4^{ws}$. The most uniform integer distribution assigns frequency $c+1$ to $r$ categories and $c$ to the remaining $4^{ws} - r$, with the convention $0 \log 0 = 0$:
$$H_{\max} = -\left[(4^{ws} - r)\,\frac{c}{n_{\text{words}}}\log\frac{c}{n_{\text{words}}} + r\,\frac{c+1}{n_{\text{words}}}\log\frac{c+1}{n_{\text{words}}}\right]$$
When $n_{\text{words}} < 4^{ws}$: $c=0$, $r=n_{\text{words}}$, and the formula reduces to $H_{\max} = \log(n_{\text{words}})$ — a single unified expression covers both regimes. A truly random sequence achieves $H_{\text{corr}} \approx H_{\max}$.
## Normalized entropy
$$\hat{H}(ws) = \frac{H_{\text{corr}}}{H_{\max}} \in [0, 1]$$
## Final score
The filter computes $\hat{H}(ws)$ for each word size ws from 1 to ws_max and returns the **minimum**:
$$\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)$$
A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if $\text{entropy}(kmer) \leq \theta$, where $\theta$ is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.
## Interpretation as an effective number of classes
$H_{\text{corr}}$ is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: $N_{\text{eff}} = e^{H_{\text{corr}}}$ is the number of equiprobable classes that would yield the same entropy.
For the normalised score $\hat{H}$, dividing by $H_{\text{max}}$ changes the logarithm base:
$$\hat{H} = \frac{\log N_{\text{eff}}}{\log N_{\text{max}}} = \log_{N_{\text{max}}} N_{\text{eff}} \quad \Longleftrightarrow \quad N_{\text{eff}} = N_{\text{max}}^{\,\hat{H}}$$
The property is preserved: $\hat{H}$ is the logarithm (in base $N_{\text{max}}$) of the effective number of equi-represented classes.
In the large-sample limit ($n_{\text{words}} \gg 4^{ws}$), $N_{\text{max}} \approx 4^{ws}$, giving:
$$N_{\text{eff}} \approx 4^{ws \cdot \hat{H}}$$
This has a clean interpretation: $ws \cdot \hat{H}$ is the **effective word length** (in bases) of a perfectly uniform distribution that would produce the same entropy. At $\hat{H} = 1$ the full space of $4^{ws}$ words is used; at $\hat{H} = 0.5$ with ws=2, only $4^1 = 4$ effective classes out of 16 are occupied.
In our actual regime, $n_{\text{words}}$ is small and $4^{ws}$ can exceed $n_{\text{words}}$, so $H_{\text{max}} < \log(4^{ws})$ due to the small-sample correction. The exact effective count is $N_{\text{max}}^{\hat{H}}$, not $4^{ws \cdot \hat{H}}$.
## Properties
The entropy score is a function of the kmer sequence alone — it does not depend on the surrounding context or on the position within any genome. Two consequences:
- **Orientation invariance**: $\text{entropy}(K) = \text{entropy}(\text{revcomp}(K))$, guaranteed by the strand-symmetric canonical form.
- **Context independence**: the same kmer is always rejected or always kept, regardless of which genome it occurs in, where in that genome it appears, or which strand is considered. The filter defines a fixed partition of the kmer space into low-complexity and valid kmers.
+28
View File
@@ -0,0 +1,28 @@
# Partitioning and indexing architecture
The canonical minimizer of a super-kmer is hashed to produce a **p-bit routing value** (p is a collection-level parameter):
```
canonical minimizer → hash(minimizer) → p-bit value → PART → partition directory
```
PART is computed once at phase 1 to open the correct partition file, then discarded. It is recomputed on the fly at query time. It is never stored in the super-kmer header.
Each partition holds one MPHF instance (phase 6) that indexes kmers as plain u64 values — the minimizer plays no role inside the partition.
## Why hashing is necessary
The canonical minimizer is an m-mer (m ∈ {9, 11, 13, 15}), encoded in 2m bits (18 to 30 bits). Its distribution over the $4^m$ possible values is **not uniform**: because the minimizer is the lexicographic minimum of a window of m-mers, small values are systematically over-represented [@Zheng2020-ji; @Zheng2021-cc; @Pan2024-hb; @Kille2023-px; @Golan2025-xf]. Routing directly by the raw minimizer value would produce severely unbalanced partitions.
A hash function with good avalanche properties redistributes this skewed distribution uniformly over the $2^p$ partition slots. The key reason this works well is the **entropy gap**: p is chosen to be much smaller than 2m, so the hash compresses many distinct minimizer values into each partition slot. Even under strong bias in the minimizer distribution, as long as its effective entropy exceeds p bits — which holds comfortably since the set of distinct minimizers in any real dataset is far larger than $2^p$ — the load imbalance across partitions is negligible.
## Parameter choices
| m | 2m (bits) | Typical p | Partitions |
|----|-----------|-----------|------------|
| 9 | 18 | 6–8 | 64–256 |
| 11 | 22 | 8–10 | 256–1 024 |
| 13 | 26 | 10–12 | 1 024–4 096 |
| 15 | 30 | 10–14 | 1 024–16 384 |
The hard constraint is p ≤ 2m: one cannot extract more bits of uniform randomness from a source than it contains. In practice p is chosen well below 2m, leaving a large entropy margin that absorbs the minimizer bias. For k=31, m=13, p=10: 1 024 partitions with comfortable balance.
+32
View File
@@ -0,0 +1,32 @@
# Kmers and super-kmers
## Kmers
A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k:
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.
- **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
## Super-kmers
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd).
### Canonical super-kmers
A **canonical super-kmer** is the lexicographic minimum of a super-kmer and its reverse complement:
```
canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
```
When a read and its reverse-complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.
### Expected length of a super-kmer
For a random minimizer of length m over k-mers of length k, the density of minimizer positions is approximately 2/(k−m+2) [@Zheng2020-ji; @Golan2025-xf], so the expected number of consecutive k-mers per super-kmer is (k−m+2)/2. A run of n k-mers spans n + k − 1 nucleotides, giving:
$$L_{\text{nt}} = \frac{k-m+2}{2} + k - 1$$
For k=31, m=13: expected ≈ 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.[^superkmer_length]
[^superkmer_length]: The expected length formula and the density approximation 2/(k−m+2) should be verified against the values reported in [@Zheng2020-ji] and [@Golan2025-xf].
+377
View File
@@ -0,0 +1,377 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" initialize-with-hyphen="false" page-range-format="minimal">
<info>
<title>Vancouver</title>
<id>http://www.zotero.org/styles/vancouver</id>
<link href="http://www.zotero.org/styles/vancouver" rel="self"/>
<link href="http://www.nlm.nih.gov/bsd/uniform_requirements.html" rel="documentation"/>
<author>
<name>Michael Berkowitz</name>
<email>mberkowi@gmu.edu</email>
</author>
<contributor>
<name>Sean Takats</name>
<email>stakats@gmu.edu</email>
</contributor>
<contributor>
<name>Sebastian Karcher</name>
</contributor>
<category citation-format="numeric"/>
<category field="generic-base"/>
<category field="medicine"/>
<summary>Vancouver style as outlined by International Committee of Medical Journal Editors Uniform Requirements for Manuscripts Submitted to Biomedical Journals: Sample References</summary>
<updated>2025-05-17T20:55:38-04:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
<locale xml:lang="en">
<!-- English overrides: text-form dates are ordered year, short month (periods stripped), day; the terms below replace the locale defaults, notably "available at" is rendered as "available from". -->
<date form="text" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<terms>
<term name="collection-editor" form="long">
<single>editor</single>
<multiple>editors</multiple>
</term>
<term name="presented at">presented at</term>
<term name="available at">available from</term>
<term name="section" form="short">sect.</term>
</terms>
</locale>
<locale xml:lang="fr">
<!-- French override: text-form dates are ordered day, short month (periods stripped), year. -->
<date form="text" delimiter=" ">
<date-part name="day"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="year"/>
</date>
</locale>
<macro name="author">
<!-- Author list: every name in sort order, family name and initials separated by a space, initials without periods, names joined by ", ", followed by a long role label. When there is no author, substitutes the webpage-title macro, then the editor names. -->
<names variable="author">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
<substitute>
<text macro="webpage-title"/>
<names variable="editor"/>
</substitute>
</names>
</macro>
<macro name="editor">
<!-- Editor list: same name formatting as the author macro, followed by a long role label and a closing period. -->
<names variable="editor" suffix=".">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
</names>
</macro>
<macro name="chapter-marker">
<!-- Emits the capitalised "in" term for items that are part of a larger work (chapter, conference paper, dictionary or encyclopedia entry); nothing for other types. -->
<choose>
<if type="chapter paper-conference entry-dictionary entry-encyclopedia" match="any">
<text term="in" text-case="capitalize-first"/>
</if>
</choose>
</macro>
<macro name="webpage-title">
<!-- Used as the author substitute: for a webpage that has a container-title, prints the container-title followed by the bracketed, capitalised "internet" term. Empty for every other item. -->
<!--If a webpage has a container, we're assuming the citation is "part of a website" as per ch. 25 Citing Medicine https://www.ncbi.nlm.nih.gov/books/NBK7274/?report=reader -->
<choose>
<if type="webpage" variable="container-title" match="all">
<group delimiter=" ">
<text variable="container-title"/>
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
</group>
</if>
</choose>
</macro>
<macro name="publisher">
<!-- Publisher block "place: publisher;" for every type except journal, magazine and newspaper articles. The place is bracketed for a thesis and omitted for a speech. -->
<choose>
<!--discard publisher info for articles-->
<if type="article-journal article-magazine article-newspaper" match="none">
<group delimiter=": " suffix=";">
<choose>
<if type="thesis">
<text variable="publisher-place" prefix="[" suffix="]"/>
</if>
<else-if type="speech"/>
<else>
<text variable="publisher-place"/>
</else>
</choose>
<text variable="publisher"/>
</group>
</if>
</choose>
</macro>
<macro name="access">
<!-- When the item has a URL, prints the capitalised "available at" term, then ": ", then the URL. -->
<choose>
<if variable="URL">
<group delimiter=": ">
<text term="available at" text-case="capitalize-first"/>
<text variable="URL"/>
</group>
</if>
</choose>
</macro>
<macro name="accessed-date">
<!-- When the item has a URL, prints the bracketed access note: the lowercase "cited" term followed by the accessed date in text form. -->
<choose>
<if variable="URL">
<group prefix="[" suffix="]" delimiter=" ">
<text term="cited" text-case="lowercase"/>
<date variable="accessed" form="text"/>
</group>
</if>
</choose>
</macro>
<macro name="container-title">
<!-- Container (journal, book, act, event) rendering, chosen by item type:
     1. articles, chapters, conference papers, reviews and dictionary or encyclopedia entries: container-title with periods stripped (short form for journal articles and reviews), plus the bracketed "internet" term when a URL exists, closed by a period and followed by the edition macro;
     2. bill and legislation: container-title, short "section" term with the section, then the number;
     3. speech: genre and the "presented at" term, then ": " and the event, closed by ";";
     4. anything else: volume label and volume (only when there is no collection-title), then the container-title. -->
<choose>
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="any">
<group suffix="." delimiter=" ">
<choose>
<if type="article-journal review review-book" match="any">
<text variable="container-title" form="short" strip-periods="true"/>
</if>
<else>
<text variable="container-title" strip-periods="true"/>
</else>
</choose>
<choose>
<if variable="URL">
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
</if>
</choose>
</group>
<text macro="edition" prefix=" "/>
</if>
<!--add event-name and event-place once they become available-->
<else-if type="bill legislation" match="any">
<group delimiter=", ">
<group delimiter=". ">
<text variable="container-title"/>
<group delimiter=" ">
<text term="section" form="short" text-case="capitalize-first"/>
<text variable="section"/>
</group>
</group>
<text variable="number"/>
</group>
</else-if>
<else-if type="speech">
<group delimiter=": " suffix=";">
<group delimiter=" ">
<text variable="genre" text-case="capitalize-first"/>
<text term="presented at"/>
</group>
<text variable="event"/>
</group>
</else-if>
<else>
<group delimiter=", " suffix=".">
<choose>
<if variable="collection-title" match="none">
<group delimiter=" ">
<label variable="volume" form="short" text-case="capitalize-first"/>
<text variable="volume"/>
</group>
</if>
</choose>
<text variable="container-title"/>
</group>
</else>
</choose>
</macro>
<macro name="title">
<!-- Item title. Suppressed for a webpage with a container-title (the webpage-part macro prints that title instead). For stand-alone types (not articles, chapters, papers, reviews or entries) the title is followed by the bracketed "internet" term when a URL exists, and by the edition macro. A thesis additionally gets its genre in brackets. -->
<choose>
<if type="webpage" variable="container-title" match="all"/>
<else>
<text variable="title"/>
<choose>
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="none">
<choose>
<if variable="URL">
<text term="internet" prefix=" [" suffix="]" text-case="capitalize-first"/>
</if>
</choose>
<text macro="edition" prefix=". "/>
</if>
</choose>
</else>
</choose>
<choose>
<if type="thesis">
<text variable="genre" prefix=" [" suffix="]"/>
</if>
</choose>
</macro>
<macro name="edition">
<!-- Edition statement: a numeric edition is printed as an ordinal followed by the short "edition" term; a non-numeric edition is printed verbatim with a trailing period. -->
<choose>
<if is-numeric="edition">
<group delimiter=" ">
<number variable="edition" form="ordinal"/>
<text term="edition" form="short"/>
</group>
</if>
<else>
<text variable="edition" suffix="."/>
</else>
</choose>
</macro>
<macro name="date">
<!-- Issued date plus the accessed-date macro, with a layout that depends on item type:
     1. articles and reviews: localized text-form date, then the access note, closed by ";";
     2. bill and legislation: short month and day, then ", " and the year (no access note);
     3. report: year and short month, then the access note;
     4. patent: number, then ", " and the year, then the access note, closed by ".";
     5. speech: year, short month, day and the access note, then "; " and the event-place;
     6. anything else: year, then the access note, closed by ".". -->
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
<group suffix=";" delimiter=" ">
<date variable="issued" form="text"/>
<text macro="accessed-date"/>
</group>
</if>
<else-if type="bill legislation" match="any">
<group delimiter=", ">
<date variable="issued" delimiter=" ">
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<date variable="issued">
<date-part name="year"/>
</date>
</group>
</else-if>
<else-if type="report">
<date variable="issued" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
</date>
<text macro="accessed-date" prefix=" "/>
</else-if>
<else-if type="patent">
<group suffix=".">
<group delimiter=", ">
<text variable="number"/>
<date variable="issued">
<date-part name="year"/>
</date>
</group>
<text macro="accessed-date" prefix=" "/>
</group>
</else-if>
<else-if type="speech">
<group delimiter="; ">
<group delimiter=" ">
<date variable="issued" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<text macro="accessed-date"/>
</group>
<text variable="event-place"/>
</group>
</else-if>
<else>
<group suffix=".">
<date variable="issued">
<date-part name="year"/>
</date>
<text macro="accessed-date" prefix=" "/>
</group>
</else>
</choose>
</macro>
<macro name="pages">
<!-- Pagination: articles and reviews print ":" followed by the page range; books print the number of pages, with a short singular label only when that value is numeric; all other types print a short singular page label followed by the page range. -->
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
<text variable="page" prefix=":"/>
</if>
<else-if type="book" match="any">
<text variable="number-of-pages" prefix=" "/>
<choose>
<if is-numeric="number-of-pages">
<label variable="number-of-pages" form="short" prefix=" " plural="never"/>
</if>
</choose>
</else-if>
<else>
<group prefix=" " delimiter=" ">
<label variable="page" form="short" plural="never"/>
<text variable="page"/>
</group>
</else>
</choose>
</macro>
<macro name="journal-location">
<!-- Volume immediately followed by the issue in parentheses, for journal and magazine articles and reviews only (newspaper articles are not in the type list). -->
<choose>
<if type="article-journal article-magazine review review-book" match="any">
<text variable="volume"/>
<text variable="issue" prefix="(" suffix=")"/>
</if>
</choose>
</macro>
<macro name="webpage-part">
<!-- For a webpage with a container-title, prints the page's own title here (the title macro suppresses it for that case). -->
<choose>
<if type="webpage" variable="container-title" match="all">
<text variable="title"/>
</if>
</choose>
</macro>
<macro name="collection-details">
<!-- Series statement in parentheses for non-article items that have a collection-title: collection editors (formatted like authors, with a long label and a period), then the collection-title, then "; " and the short volume label with the volume. -->
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="none">
<choose>
<if variable="collection-title">
<group delimiter=" " prefix="(" suffix=")">
<names variable="collection-editor" suffix=".">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
</names>
<group delimiter="; ">
<text variable="collection-title"/>
<group delimiter=" ">
<label variable="volume" form="short"/>
<text variable="volume"/>
</group>
</group>
</group>
</if>
</choose>
</if>
</choose>
</macro>
<macro name="report-details">
<!-- For reports only, prints the report number prefixed by the literal text "Report No.: ". -->
<choose>
<if type="report">
<text variable="number" prefix="Report No.: "/>
</if>
</choose>
</macro>
<citation collapse="citation-number">
<!-- In-text citations: citation numbers sorted ascending, joined by "," inside parentheses, with numeric collapsing enabled through collapse="citation-number". -->
<sort>
<key variable="citation-number"/>
</sort>
<layout prefix="(" suffix=")" delimiter=",">
<text variable="citation-number"/>
</layout>
</citation>
<bibliography et-al-min="7" et-al-use-first="6" second-field-align="flush">
<!-- Reference list entry. Name lists of seven or more are cut to the first six plus et al. Order of output: citation number with a period; author and title; then chapter marker, editor and container-title, publisher, date with journal location and pages, and the webpage part; then collection details, report details, and finally the access (URL) statement. -->
<layout>
<text variable="citation-number" suffix="."/>
<group delimiter=". " suffix=". ">
<text macro="author"/>
<text macro="title"/>
</group>
<group delimiter=" " suffix=". ">
<group delimiter=": ">
<text macro="chapter-marker"/>
<group delimiter=" ">
<text macro="editor"/>
<text macro="container-title"/>
</group>
</group>
<text macro="publisher"/>
<group>
<text macro="date"/>
<text macro="journal-location"/>
<text macro="pages"/>
</group>
<text macro="webpage-part"/>
</group>
<text macro="collection-details" suffix=". "/>
<text macro="report-details" suffix=". "/>
<text macro="access"/>
</layout>
</bibliography>
</style>