first implementation but far to be optimal
This commit is contained in:
+2133
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,9 @@
|
||||
# Manipulated sequences
|
||||
|
||||
- We consider sequences only as a compact form of a set of overlapping kmers
|
||||
- The largest kmers we consider are 31-mers
|
||||
- We only consider odd k
|
||||
|
||||
- all sequences match /^[acgtACGT]+$/
|
||||
- maximum length 256 nucleotides
|
||||
- minimum length k
|
||||
@@ -0,0 +1,35 @@
|
||||
/* docs/css/extra.css */
|
||||
|
||||
/* Main styles for the container and the text */
|
||||
.ps-root {
|
||||
font-family: "Courier New", monospace;
|
||||
font-size: 0.9em;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* Styles for keywords */
|
||||
.ps-keyword {
|
||||
font-weight: bold;
|
||||
color: #d73a49; /* A nice shade of red */
|
||||
}
|
||||
|
||||
/* --- INDENTATION FIX --- */
|
||||
/* Targets every indentation level and applies a left margin */
|
||||
[class*="ps-indent-"] {
|
||||
display: inline-block;
|
||||
}
|
||||
.ps-indent-1 {
|
||||
margin-left: 2em;
|
||||
}
|
||||
.ps-indent-2 {
|
||||
margin-left: 4em;
|
||||
}
|
||||
.ps-indent-3 {
|
||||
margin-left: 6em;
|
||||
}
|
||||
.ps-indent-4 {
|
||||
margin-left: 8em;
|
||||
}
|
||||
.ps-indent-5 {
|
||||
margin-left: 10em;
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" default-locale="en-US">
|
||||
<info>
|
||||
<title>Ecology Letters</title>
|
||||
<id>http://www.zotero.org/styles/ecology-letters</id>
|
||||
<link href="http://www.zotero.org/styles/ecology-letters" rel="self"/>
|
||||
<link href="http://www.zotero.org/styles/apa" rel="template"/>
|
||||
<link href="http://onlinelibrary.wiley.com/journal/10.1111/%28ISSN%291461-0248/homepage/ForAuthors.html" rel="documentation"/>
|
||||
<author>
|
||||
<name>David Kaplan</name>
|
||||
<email>david.kaplan@ird.fr</email>
|
||||
</author>
|
||||
<contributor>
|
||||
<name>Sebastian Karcher</name>
|
||||
</contributor>
|
||||
<category citation-format="author-date"/>
|
||||
<category field="biology"/>
|
||||
<issn>1461-023X</issn>
|
||||
<eissn>1461-0248</eissn>
|
||||
<updated>2023-10-11T10:45:32+00:00</updated>
|
||||
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
|
||||
</info>
|
||||
<macro name="container">
|
||||
<choose>
|
||||
<if type="chapter paper-conference" match="any">
|
||||
<text term="in" text-case="capitalize-first" suffix=": "/>
|
||||
<text variable="container-title" font-style="italic"/>
|
||||
<text variable="collection-title" prefix=", "/>
|
||||
<names variable="editor translator" prefix=" (" delimiter=", " suffix=")">
|
||||
<label form="short" suffix=" "/>
|
||||
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
|
||||
</names>
|
||||
</if>
|
||||
<else>
|
||||
<group delimiter=", ">
|
||||
<text variable="container-title" font-style="italic" form="short"/>
|
||||
<text variable="collection-title"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="author">
|
||||
<names variable="author">
|
||||
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
|
||||
<label form="short" prefix=" (" suffix=")" text-case="capitalize-first"/>
|
||||
<et-al font-style="italic"/>
|
||||
<substitute>
|
||||
<names variable="editor"/>
|
||||
<names variable="translator"/>
|
||||
<text macro="title"/>
|
||||
</substitute>
|
||||
</names>
|
||||
</macro>
|
||||
<macro name="author-short">
|
||||
<names variable="author">
|
||||
<name form="short" and="symbol" delimiter=", " initialize-with=". "/>
|
||||
<et-al font-style="italic"/>
|
||||
<substitute>
|
||||
<names variable="editor"/>
|
||||
<names variable="translator"/>
|
||||
<choose>
|
||||
<if type="bill book graphic legal_case legislation motion_picture report song" match="any">
|
||||
<text variable="title" form="short" font-style="italic"/>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="title" form="short" quotes="true"/>
|
||||
</else>
|
||||
</choose>
|
||||
</substitute>
|
||||
</names>
|
||||
</macro>
|
||||
<macro name="access">
|
||||
<choose>
|
||||
<if type="webpage">
|
||||
<group>
|
||||
<text term="available at" text-case="capitalize-first" suffix=": "/>
|
||||
<text variable="URL" suffix="."/>
|
||||
</group>
|
||||
<text value="Last accessed" prefix=" " suffix=" "/>
|
||||
<date variable="accessed">
|
||||
<date-part name="day" suffix=" "/>
|
||||
<date-part name="month" suffix=" "/>
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="title">
|
||||
<choose>
|
||||
<if type="report" match="any">
|
||||
<text variable="title" font-style="italic"/>
|
||||
<group prefix=" (" suffix=")">
|
||||
<text variable="genre"/>
|
||||
<text variable="number" prefix=" No. "/>
|
||||
</group>
|
||||
</if>
|
||||
<else-if type="bill book graphic legal_case legislation motion_picture report song speech" match="any">
|
||||
<text variable="title" font-style="italic"/>
|
||||
</else-if>
|
||||
<else-if type="webpage">
|
||||
<text variable="title" font-style="italic"/>
|
||||
</else-if>
|
||||
<else>
|
||||
<text variable="title"/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="publisher">
|
||||
<choose>
|
||||
<if type="report" match="any">
|
||||
<group delimiter=", ">
|
||||
<text variable="publisher"/>
|
||||
<text variable="publisher-place"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="genre" suffix=". "/>
|
||||
<group delimiter=", ">
|
||||
<text variable="publisher"/>
|
||||
<text variable="publisher-place"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="event">
|
||||
<choose>
|
||||
<if variable="event">
|
||||
<text term="presented at" text-case="capitalize-first" suffix=" "/>
|
||||
<text variable="event"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="issued">
|
||||
<choose>
|
||||
<if variable="issued">
|
||||
<date variable="issued">
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</if>
|
||||
<else-if variable="accessed">
|
||||
<choose>
|
||||
<if type="webpage">
|
||||
<date variable="accessed">
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</if>
|
||||
<else>
|
||||
<text term="no date" form="short"/>
|
||||
</else>
|
||||
</choose>
|
||||
</else-if>
|
||||
<else>
|
||||
<text term="no date" form="short"/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="edition">
|
||||
<choose>
|
||||
<if is-numeric="edition">
|
||||
<group delimiter=" ">
|
||||
<number variable="edition" form="ordinal"/>
|
||||
<text value="edn"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="edition" suffix="."/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<macro name="locators">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine article-newspaper" match="any">
|
||||
<group prefix=", " delimiter=", ">
|
||||
<group>
|
||||
<text variable="volume"/>
|
||||
</group>
|
||||
<text variable="page"/>
|
||||
</group>
|
||||
</if>
|
||||
<else-if type="bill book graphic legal_case legislation motion_picture report song thesis" match="any">
|
||||
<group delimiter=". " prefix=". ">
|
||||
<text macro="edition"/>
|
||||
<text macro="event"/>
|
||||
<text macro="publisher"/>
|
||||
</group>
|
||||
</else-if>
|
||||
<else-if type="chapter paper-conference" match="any">
|
||||
<group delimiter=", " prefix=". ">
|
||||
<text macro="event"/>
|
||||
<text macro="publisher"/>
|
||||
<group>
|
||||
<label variable="page" form="short" suffix=" "/>
|
||||
<text variable="page"/>
|
||||
</group>
|
||||
</group>
|
||||
</else-if>
|
||||
</choose>
|
||||
</macro>
|
||||
<citation et-al-min="3" et-al-use-first="1" disambiguate-add-year-suffix="true" collapse="year-suffix" year-suffix-delimiter=", ">
|
||||
<sort>
|
||||
<key macro="author"/>
|
||||
<key macro="issued"/>
|
||||
</sort>
|
||||
<layout prefix="(" suffix=")" delimiter="; ">
|
||||
<group delimiter=" ">
|
||||
<text macro="author-short"/>
|
||||
<text macro="issued"/>
|
||||
</group>
|
||||
</layout>
|
||||
</citation>
|
||||
<bibliography et-al-min="7" et-al-use-first="6" entry-spacing="0" hanging-indent="true">
|
||||
<sort>
|
||||
<key macro="author"/>
|
||||
<key macro="issued" sort="ascending"/>
|
||||
<key macro="title"/>
|
||||
</sort>
|
||||
<layout>
|
||||
<group suffix=".">
|
||||
<text macro="author" suffix="."/>
|
||||
<text macro="issued" prefix=" (" suffix="). "/>
|
||||
<group delimiter=". ">
|
||||
<text macro="title"/>
|
||||
<text macro="container"/>
|
||||
</group>
|
||||
<text macro="locators"/>
|
||||
<text macro="access" prefix=". "/>
|
||||
</group>
|
||||
</layout>
|
||||
</bibliography>
|
||||
</style>
|
||||
@@ -0,0 +1,100 @@
|
||||
# Chunk reader — implementation
|
||||
|
||||
The `obiread` crate provides a streaming iterator that reads FASTA or FASTQ files in fixed-size blocks and yields self-contained chunks, each ending on a complete sequence record boundary. Chunks are consumed in parallel by downstream workers.
|
||||
|
||||
## Output type: rope
|
||||
|
||||
Each chunk is a `Vec<Bytes>` — a **rope**: a list of reference-counted byte slices that are not necessarily contiguous in memory. The consumer iterates over the slices in order.
|
||||
|
||||
Using `bytes::Bytes` means the split at the record boundary is O(1): `Bytes::split_to(n)` adjusts a reference counter, not memory. No `memcpy` in the common case.
|
||||
|
||||
## Allocation policy
|
||||
|
||||
| Case | Cost |
|
||||
|------|------|
|
||||
| Boundary found in the current block (common) | zero extra allocation — `split_to` only |
|
||||
| Boundary straddles multiple blocks (sequence > block size, rare) | one allocation to pack the rope into a flat buffer |
|
||||
| EOF flush | zero extra allocation |
|
||||
|
||||
## SeqChunkIter
|
||||
|
||||
```rust
|
||||
pub struct SeqChunkIter<R: Read> { /* private */ }
|
||||
|
||||
impl<R: Read> Iterator for SeqChunkIter<R> {
|
||||
type Item = io::Result<Vec<Bytes>>;
|
||||
}
|
||||
|
||||
pub fn fasta_chunks<R: Read>(source: R) -> SeqChunkIter<R>
|
||||
pub fn fastq_chunks<R: Read>(source: R) -> SeqChunkIter<R>
|
||||
```
|
||||
|
||||
`next()` loop:
|
||||
|
||||
```text
|
||||
1. read one block of block_size bytes → push onto rope
|
||||
2. probe check: if the boundary marker ("\n>" or "\n@") is absent from the
|
||||
last block, skip the splitter (avoids a full backward scan for nothing)
|
||||
3. call splitter on last block
|
||||
if found at offset n:
|
||||
remainder = last_block.split_to(n) ← O(1), zero copy
|
||||
return std::mem::take(&mut self.rope) ← the chunk
|
||||
4. if rope.len() > 1 (multi-block accumulation):
|
||||
pack rope into one flat buffer ← one alloc
|
||||
retry splitter on flat buffer
|
||||
5. if EOF: flush remaining rope as final chunk
|
||||
```
|
||||
|
||||
## Boundary detection — FASTA
|
||||
|
||||
Backward scan with a 2-state machine. Searches for `>` immediately preceded by `\n` or `\r`:
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
direction LR
|
||||
[*] --> Scanning
|
||||
Scanning --> FoundGt : '>'
|
||||
FoundGt --> Scanning : other
|
||||
FoundGt --> [*] : '\\n' / '\\r' ✓
|
||||
```
|
||||
|
||||
Returns the byte offset of the `>` that starts the last complete record.
|
||||
|
||||
## Boundary detection — FASTQ
|
||||
|
||||
FASTQ records have a rigid 4-line structure (`@header`, sequence, `+`, quality). The `@` character (ASCII 64, Phred score 31) can appear legitimately in quality lines, making any forward heuristic unreliable. The backward scanner verifies the full structural context before accepting a candidate `@`.
|
||||
|
||||
7-state machine (port of Go's `EndOfLastFastqEntry`), scanning from **right to left**. Each time a `+` is found, its position is saved as `restart`; any state mismatch resets the scan to that position.
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
direction LR
|
||||
|
||||
[*] --> Scanning
|
||||
|
||||
Scanning --> FoundPlus : '+' (save restart)
|
||||
FoundPlus --> AfterNlPlus : '\\n' / '\\r'
|
||||
FoundPlus --> Scanning : other → backtrack
|
||||
|
||||
AfterNlPlus --> AfterNlPlus : separator
|
||||
AfterNlPlus --> InSequence : letter / - / . / [ / ]
|
||||
AfterNlPlus --> Scanning : other → backtrack
|
||||
|
||||
InSequence --> AfterSequence : '\\n' / '\\r'
|
||||
InSequence --> InSequence : letter / - / . / [ / ]
|
||||
InSequence --> Scanning : other → backtrack
|
||||
|
||||
AfterSequence --> AfterSequence : '\\n' / '\\r'
|
||||
AfterSequence --> InHeader : other
|
||||
|
||||
InHeader --> FoundAt : '@' (save cut)
|
||||
InHeader --> Scanning : '\\n' / '\\r' → backtrack
|
||||
InHeader --> InHeader : other
|
||||
|
||||
FoundAt --> [*] : '\\n' / '\\r' ✓
|
||||
FoundAt --> InHeader : other
|
||||
```
|
||||
|
||||
`restart` is updated each time a `+` is found. When any state fails its expected input, the scan jumps back to `restart` and continues from there — guaranteeing that a `@` in a quality line cannot be accepted as a record start, because the `\n+\n` structure immediately following it (going backward) will not be found.
|
||||
|
||||
Returns the byte offset of the `@` that starts the last complete record.
|
||||
@@ -0,0 +1,60 @@
|
||||
# Kmer — implementation
|
||||
|
||||
## Memory layout
|
||||
|
||||
`Kmer` is a `#[repr(transparent)]` newtype over `u64`:
|
||||
|
||||
```rust
|
||||
#[repr(transparent)]
|
||||
pub struct Kmer(u64);
|
||||
```
|
||||
|
||||
Nucleotides are packed 2 bits each, **left-aligned**, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is **not stored** — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.
|
||||
|
||||
| 63–62 | 61–60 | … | 63−2(k−1)−1 to 63−2(k−1) | 63−2k down to 0 |
|
||||
|-------|-------|---|--------------------------|-----------------|
|
||||
| nt 0 | nt 1 | … | nt k−1 | zero padding |
|
||||
|
||||
## Encoding
|
||||
|
||||
`Kmer::from_ascii(ascii, k)` encodes the first k bytes of an ASCII slice using the shared `ENC` table (see [SuperKmer — ASCII encoding](superkmer.md#ascii-encoding-and-decoding)):
|
||||
|
||||
```rust
|
||||
for i in 0..k {
|
||||
val = (val << 2) | encode_base(ascii[i]) as u64;
|
||||
}
|
||||
Kmer(val << (64 - 2 * k))
|
||||
```
|
||||
|
||||
Zero allocation — result lives on the stack.
|
||||
|
||||
## Decoding
|
||||
|
||||
`write_ascii(k, buf)` appends k ASCII characters to a caller-supplied `Vec<u8>` using the shared `DEC4` table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.
|
||||
|
||||
`to_ascii(k)` is a convenience wrapper that allocates and returns a `Vec<u8>`; intended for tests and display only.
|
||||
|
||||
## Reverse complement
|
||||
|
||||
Computed as pure arithmetic — no lookup table, no memory access:
|
||||
|
||||
```rust
|
||||
let x = !self.0; // complement
|
||||
let x = x.swap_bytes(); // reverse bytes
|
||||
let x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4); // swap nibbles
|
||||
let x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2); // swap 2-bit groups
|
||||
Kmer(x << (64 - 2 * k))
|
||||
```
|
||||
|
||||
After complementing, bytes are reversed (`swap_bytes`), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.
|
||||
|
||||
## Canonical form
|
||||
|
||||
```rust
|
||||
pub fn canonical(&self, k: usize) -> Self {
|
||||
let rc = self.revcomp(k);
|
||||
if self.0 <= rc.0 { *self } else { rc }
|
||||
}
|
||||
```
|
||||
|
||||
Lexicographic minimum of forward and reverse-complement, comparing the raw `u64` values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.
|
||||
@@ -0,0 +1,48 @@
|
||||
# MPHF selection — analysis in progress
|
||||
|
||||
The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated.
|
||||
|
||||
## Candidates
|
||||
|
||||
**boomphf** (BBHash algorithm, maintained by 10X Genomics):
|
||||
|
||||
- ~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)
|
||||
- Parallel construction; well-tested with DNA kmer data at scale
|
||||
- Drawback: largest space footprint of the three
|
||||
|
||||
**ptr_hash** (PtrHash algorithm, Groot Koerkamp, SEA 2025):
|
||||
|
||||
- ~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×)
|
||||
- Theoretical foundation solid; paper and Rust crate from the same author
|
||||
- Drawback: published February 2025 — very young, no production track record
|
||||
|
||||
**FMPHGO** (`ph` crate, Beling, ACM JEA 2023):
|
||||
|
||||
- ~2.1 bits/key — most compact of the three; good query speed; parallelisable construction
|
||||
- More established than ptr_hash; actively maintained
|
||||
- Currently preferred candidate
|
||||
|
||||
## Space at scale
|
||||
|
||||
For 1 024 partitions × 100 M kmers/partition:
|
||||
|
||||
| MPHF | bits/key | Total MPHF size |
|
||||
|---------|----------|-----------------|
|
||||
| boomphf | 3.7 | ~47 GB |
|
||||
| ptr_hash | 2.4 | ~31 GB |
|
||||
| FMPHGO | 2.1 | ~27 GB |
|
||||
|
||||
In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 3–30 M kmers → 1–8 MB per MPHF, well within RAM.
|
||||
|
||||
## On-disk and mmap considerations
|
||||
|
||||
All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the `ph` crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 1–8 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.
|
||||
|
||||
No established Rust crate provides a natively on-disk MPHF. **SSHash** (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.
|
||||
|
||||
## Open questions
|
||||
|
||||
- Confirm actual partition sizes on representative metagenomic datasets before fixing the choice.
|
||||
- Evaluate whether ptr_hash's query speed advantage (2.1–3.3×) justifies adopting a crate that is less than a year old.
|
||||
- Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary.
|
||||
- Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.
|
||||
@@ -0,0 +1,161 @@
|
||||
# Construction pipeline
|
||||
|
||||
All phases after scatter are embarrassingly parallel across partitions.
|
||||
|
||||
## Phase 0 — Parameter estimation
|
||||
|
||||
The construction parameters p, n, and min_count depend on the kmer frequency spectrum of the dataset. Estimating this spectrum before construction avoids costly re-partitioning if p is badly chosen.
|
||||
|
||||
Two approaches are supported:
|
||||
|
||||
- **External estimation (preferred):** run [NT-CARD](https://github.com/bcgsc/ntCard) on the input files and pass its histogram output to `obikmer build`. NT-CARD produces a kmer frequency histogram in a single streaming pass using ntHash and a Flajolet-Martin-style estimator; obikmer reads this file and derives p, n, and min_count automatically.
|
||||
- **Internal estimation (future):** an `obikmer estimate` subcommand for users who prefer a single-tool workflow. The implementation would combine two components: (1) **ntHash**, a rolling hash that updates the kmer hash in O(1) per nucleotide by incrementally adding the incoming base and removing the outgoing one — Rust crates exist; (2) a **Flajolet-Martin-style streaming estimator** that maintains a small table of minimum hash values and infers the frequency histogram from their statistical distribution, as described in the NT-CARD paper [@Mohamadi2017-ok].
|
||||
|
||||
The histogram gives:
|
||||
|
||||
- **F0** (number of distinct kmers) → sets p (target ~10M kmers/partition → p = ⌈log₂(F0 / 10M)⌉)
|
||||
- **frequency distribution** → sets n (choose n so that fewer than 1% of kmers overflow)
|
||||
- **error valley** → suggests min_count (typically the local minimum between the error peak and the coverage peak)
|
||||
|
||||
## Phase 1 — Scatter
|
||||
|
||||
Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:
|
||||
|
||||
1. **Ambiguous base filter**: cut at any non-ACGT base; discard fragments shorter than k.
|
||||
2. **Entropy filter**: scan each fragment with a sliding window of size k. When the kmer $K_i = S[i \mathinner{..} i+k-1]$ ended by nucleotide $S[j]$ (with $j = i+k-1$) has entropy below threshold $\theta$, emit the current segment and start a new one (see algorithm below). $K_i$ belongs to neither segment, and no valid kmer is lost.
|
||||
3. **Length filter**: discard any segment shorter than k produced by step 2.
|
||||
4. **Super-kmer extraction**: for each clean segment, slide a minimizer window and group consecutive kmers sharing the same canonical minimizer; canonise each super-kmer by lexicographic comparison with its reverse complement (early exit).
|
||||
5. **Partition routing**: `hash(canonical_minimizer) → PART` → append super-kmer to `partition/superkmers.bin.gz`.
|
||||
|
||||
**Segmentation behavior:**
|
||||
|
||||
When $K_i$ (ended by $S[j]$, $j = i+k-1$) fails the entropy threshold:
|
||||
|
||||
- Current segment $S[\textit{seg_start} \mathinner{..} j-1]$ is emitted (last valid kmer = $K_{i-1}$)
|
||||
- New segment starts at $S[i+1]$ (first new kmer = $K_{i+1}$)
|
||||
- $K_i$ is excluded: current segment lacks $S[j]$, new segment lacks $S[i]$
|
||||
- Overlap = $S[i+1 \mathinner{..} j-1]$ = $k-2$ nucleotides
|
||||
|
||||
!!! abstract "Algorithm — Entropy filter: sliding window segmentation"
|
||||
```text
|
||||
procedure EntropyFilter(S, N, k, θ):
|
||||
seg_start ← 0
|
||||
window ← []
|
||||
for j ← 0 to N−1:
|
||||
window.push(S[j])
|
||||
if |window| < k: continue
|
||||
i ← j − k + 1
|
||||
if entropy(window) ≤ θ:
|
||||
emit S[seg_start .. j−1]
|
||||
seg_start ← i + 1
|
||||
window ← S[i+1 .. j]
|
||||
else:
|
||||
window.pop_front()
|
||||
emit S[seg_start .. N−1]
|
||||
```
|
||||
|
||||
Writes are sequential and append-only — IO-friendly. Gzip applied at write time. Data volume ≈ raw genome size (2 bits/nt compaction offsets header overhead).
|
||||
|
||||
## Phase 2 — Dereplication
|
||||
|
||||
Performed independently per partition. Identical super-kmers are consolidated and their COUNT accumulated — analogous to amplicon dereplication in metabarcoding. Uses external bucket sort to stay within RAM bounds:
|
||||
|
||||
**Pass 1** (streaming): hash the nucleotide payload of each super-kmer, route to one of B bucket files:
|
||||
```
|
||||
hash(sequence) % B → bucket_i.bin
|
||||
```
|
||||
B ≈ 100 is tunable; RAM needed ≈ partition_size / B.
|
||||
|
||||
**Pass 2**: for each bucket, load into an in-memory `HashMap<sequence, COUNT>`, dereplicate by summing COUNT values, write consolidated super-kmers.
|
||||
|
||||
After dereplication: at N× coverage the partition shrinks by a factor of ~N (errors aside). The COUNT field in each super-kmer header is the number of times that exact super-kmer sequence was observed across all input reads.
|
||||
|
||||
**Important:** super-kmer COUNT ≠ individual kmer count. A kmer can appear in multiple distinct super-kmers (same partition, different flanking context); its true count = sum of COUNT of all super-kmers containing it. A super-kmer with COUNT=1 may contain only high-abundance kmers, each appearing in many other super-kmers. Abundance filtering therefore cannot be applied at this phase.
|
||||
|
||||
## Phase 3 — Per-kmer count aggregation and quorum filtering
|
||||
|
||||
For each dereplicated super-kmer, enumerate its kmers and accumulate counts:
|
||||
|
||||
```
|
||||
for each super-kmer (sequence, COUNT):
|
||||
for each kmer in sequence:
|
||||
kmer_counts[canonical(kmer)] += COUNT
|
||||
```
|
||||
|
||||
Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.
|
||||
|
||||
Abundance filter applied here: kmers with `total_count < q` are discarded. `q` is a collection parameter (0 = keep all, including singletons for ≤1x data).
|
||||
|
||||
No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.
|
||||
|
||||
## Phase 4 — Super-kmer compaction
|
||||
|
||||
The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:
|
||||
|
||||
```
|
||||
for each dereplicated super-kmer:
|
||||
scan kmer by kmer
|
||||
kmer not in valid set → break point (terminates current super-kmer)
|
||||
kmer in valid set → extend current super-kmer
|
||||
```
|
||||
|
||||
Three cases per super-kmer:
|
||||
|
||||
- **All kmers valid** → copied as-is
|
||||
- **No kmer valid** → discarded
|
||||
- **Mixed** → split into sub-super-kmers at invalid boundaries; each sub-super-kmer inherits the original COUNT
|
||||
|
||||
After splitting, re-apply dereplication (bucket sort, phase 2 method) — splitting can produce new identical super-kmers. This re-dereplication is cheap: the volume is already greatly reduced.
|
||||
|
||||
Output: a clean super-kmer file where every kmer passes quorum. This file feeds phase 5.
|
||||
|
||||
## Phase 5 — Local de Bruijn graph and unitig construction
|
||||
|
||||
Within each partition, build a **local de Bruijn graph** from the valid kmer set and compute its unitigs. All operations are local to the partition — no cross-partition communication.
|
||||
|
||||
```
|
||||
valid kmers → HashSet<u64>
|
||||
|
||||
for each kmer K:
|
||||
out_degree = |{K[1:]+b | b ∈ {A,C,G,T}} ∩ HashSet|
|
||||
in_degree = |{b+K[:-1] | b ∈ {A,C,G,T}} ∩ HashSet|
|
||||
|
||||
internal node ↔ in_degree=1 AND out_degree=1
|
||||
branching / dead-end → unitig start or end
|
||||
```
|
||||
|
||||
Traverse non-branching paths to assemble unitigs. Kmers whose neighbours fall in other partitions appear as dead ends locally — they terminate the unitig. The result: **each kmer appears in exactly one unitig** within the partition.
|
||||
|
||||
The partition size (controlled by p) must be calibrated so that the HashSet fits in RAM during this phase.
|
||||
|
||||
Output: `unitigs.bin` — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.
|
||||
|
||||
**Scope of local unitigs:** these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose k-1 successor or predecessor falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.
|
||||
|
||||
## Phase 6 — MPHF construction and index finalisation
|
||||
|
||||
Built once on the definitive kmer set (all kmers in all unitigs of the partition):
|
||||
|
||||
```
|
||||
kmers from unitigs → MPHF → mphf.bin
|
||||
→ counts.bin : packed n-bit array (or 1-bit for presence mode)
|
||||
→ refs.bin : u32 nucleotide offset into unitigs.bin per kmer
|
||||
```
|
||||
|
||||
The MPHF is built once — no rebuild. The n-bit width for `counts.bin` is chosen from the observed count distribution (n=5 covers ~97% of kmers at 15x; n=1 for presence mode). Counts exceeding 2ⁿ−2 are marked with the reserved value 2ⁿ−1 in `counts.bin` and stored in `overflow.bin` as sorted `(mphf_index: u32, count: u32)` pairs.
|
||||
|
||||
**Exact verification via unitig evidence:**
|
||||
|
||||
`unitigs.bin` serves as the evidence structure: for any query kmer, the stored unitig provides the ground truth to confirm or deny its presence. The MPHF maps every input to [0, N) including absent kmers — the unitig read-back is the only way to guarantee exactness.
|
||||
|
||||
```
|
||||
query kmer q
|
||||
→ canonical_minimizer(q) → hash → PART → part_XXXX/
|
||||
→ MPHF(q) → index i
|
||||
→ refs[i] = kmer_offset (u32 nucleotide offset into unitigs.bin)
|
||||
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
|
||||
→ match : return counts[i] ← exact hit
|
||||
→ no match: kmer absent ← MPHF collision on absent kmer
|
||||
```
|
||||
|
||||
One random disk access into `unitigs.bin` per query; the unitig is the minimal, non-redundant evidence (each kmer stored once). `superkmers.bin.gz` is no longer needed at this point and can be deleted.
|
||||
@@ -0,0 +1,61 @@
|
||||
# On-disk collection structure
|
||||
|
||||
Collections are too large to hold in RAM (hundreds of genomes, billions of kmers). The collection lives on disk as a directory of memory-mapped files:
|
||||
|
||||
```
|
||||
collection/
|
||||
metadata.toml — collection parameters (see below)
|
||||
part_XXXX/
|
||||
superkmers.bin.gz — dereplicated super-kmers for this partition (construction artifact)
|
||||
mphf.bin — minimal perfect hash function for this partition
|
||||
counts.bin — packed n-bit count array (or 1-bit presence array)
|
||||
refs.bin — back-references: one u32 nucleotide offset into unitigs.bin per kmer
|
||||
unitigs.bin — local de Bruijn unitigs (permanent evidence structure)
|
||||
overflow.bin — counts exceeding the packed range (optional)
|
||||
```
|
||||
|
||||
`superkmers.bin.gz` is produced during phase 1 and consumed through phases 2–5. It can be deleted after phase 5 — it is not needed for querying. The permanent query structure is `mphf.bin + counts.bin + refs.bin + unitigs.bin`.
|
||||
|
||||
## Collection parameters
|
||||
|
||||
Stored in `metadata.toml`:
|
||||
|
||||
| Parameter | Role |
|
||||
|-----------|------|
|
||||
| k | kmer length |
|
||||
| m | minimizer length (odd, < k) |
|
||||
| p | partition bits (0 ≤ p ≤ min(14, 2m−16)) |
|
||||
| mode | `presence` (1 bit/kmer) or `count` (n bits/kmer) |
|
||||
| n | bits per kmer in count mode (chosen at construction) |
|
||||
| min_count | singleton filtering threshold (0 = keep all) |
|
||||
| hash_fn | hash function identifier |
|
||||
| hash_seed | seed for the hash function |
|
||||
|
||||
## Count storage
|
||||
|
||||
**refs.bin capacity:** `unitigs.bin` is a flat 2-bit-packed nucleotide stream with no separators. Each entry in `refs.bin` is a u32 nucleotide offset pointing to the first base of the kmer. A u32 covers 4 billion nucleotide positions = 1 GB of sequence per partition. In the worst case (all unitigs of length 1 kmer, offsets spaced k apart), this supports 4 billion / k ≈ 130 million kmers per partition at k=31. In the typical case (long unitigs, consecutive kmers at offset +1), the limit approaches 4 billion kmers — well beyond any realistic partition size.
|
||||
|
||||
*Presence mode* (coverage ≤ 1x, or when only presence/absence matters):
|
||||
|
||||
- `counts.bin` is a packed 1-bit array — all bits set to 1 for indexed kmers
|
||||
- Singletons are the signal, not filtered
|
||||
|
||||
*Count mode* (coverage > 1x):
|
||||
|
||||
- `counts.bin` is a packed n-bit array; n chosen at construction from the observed distribution
|
||||
- Value 0: absent sentinel; values 1..2ⁿ−2: direct count; value 2ⁿ−1: overflow
|
||||
- Overflow counts stored in a separate `overflow.bin` as sorted `(index: u32, count: u32)` pairs
|
||||
- Empirically (k=31, 15x coverage): n=5 covers 97% of real kmers, n=6 covers 99%
|
||||
- min_count threshold filters low-frequency kmers (errors) before indexing; for ≤1x, min_count=0
|
||||
|
||||
## Query protocol
|
||||
|
||||
```
|
||||
query kmer q
|
||||
→ canonical_minimizer(q) → hash → PART → part_XXXX/
|
||||
→ MPHF(q) → index i
|
||||
→ refs[i] = kmer_offset (u32 nucleotide offset into unitigs.bin)
|
||||
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
|
||||
→ match : return counts[i]
|
||||
→ no match: kmer absent
|
||||
```
|
||||
@@ -0,0 +1,114 @@
|
||||
# SuperKmer — implementation
|
||||
|
||||
## Memory layout
|
||||
|
||||
A super-kmer is stored as a **32-bit header** followed by a **byte-aligned nucleotide sequence** (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):
|
||||
|
||||
| Field | Bits | Role |
|
||||
|-------|------|------|
|
||||
| COUNT | 24 | Occurrence count (≤ 16 M) |
|
||||
| SEQL | 8 | Sequence length in nucleotides (1–256) |
|
||||
|
||||
Bit layout (MSB to LSB): `[31:8] COUNT [7:0] SEQL`
|
||||
|
||||
SEQL is stored as a raw `u8`: values 1–255 represent lengths 1–255; **0 represents 256** (wrapping convention). The public accessor returns a `usize` and performs the conversion:
|
||||
|
||||
```rust
|
||||
fn seql(&self) -> usize { let s = self.0 & 0xFF; if s == 0 { 256 } else { s as usize } }
|
||||
fn count(&self) -> u32 { self.0 >> 8 }
|
||||
fn increment(&mut self) { self.0 += 1 << 8; }
|
||||
fn add(&mut self, n: u32) { self.0 += n << 8; }
|
||||
fn set_count(&mut self, n: u32) { self.0 = (self.0 & 0xFF) | (n << 8); }
|
||||
```
|
||||
|
||||
The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k−1 overlap, preserving all kmers without duplication.
|
||||
|
||||
The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.
|
||||
|
||||
## ASCII encoding and decoding
|
||||
|
||||
Two lookup tables handle ASCII ↔ 2-bit conversion:
|
||||
|
||||
- **`ENC: [u8; 32]`** — indexed by `b & 0x1F` (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.
|
||||
- **`DEC4: [u32; 256]`** — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian `u32`. 1 KB total, fits in L1 cache. One lookup per packed input byte yields 4 decoded characters.
|
||||
|
||||
Encoding 4 nucleotides into one byte:
|
||||
|
||||
```rust
|
||||
byte = ENC[c0 & 0x1F] << 6 | ENC[c1 & 0x1F] << 4 | ENC[c2 & 0x1F] << 2 | ENC[c3 & 0x1F]
|
||||
```
|
||||
|
||||
Decoding one byte into 4 ASCII characters:
|
||||
|
||||
```rust
|
||||
DEC4[byte].to_be_bytes() // [nuc0, nuc1, nuc2, nuc3] in ASCII
|
||||
```
|
||||
|
||||
## Reverse complement
|
||||
|
||||
The reverse complement is computed **in place** with zero allocation in two steps.
|
||||
|
||||
**Step 1 — byte swap with `REVCOMP4`.** A 256-byte lookup table `REVCOMP4` maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying `REVCOMP4` to each:
|
||||
|
||||
```rust
|
||||
const fn revcomp4(x: u8) -> u8 {
|
||||
let x = !x; // complement all bases
|
||||
let x = (x >> 4) | (x << 4); // swap nibbles
|
||||
let x = ((x >> 2) & 0x33) | ((x & 0x33) << 2); // swap 2-bit groups
|
||||
x
|
||||
}
|
||||
```
|
||||
|
||||
`REVCOMP4` is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.
|
||||
|
||||
**Step 2 — realignment.** After step 1, `padding = n × 8 − SEQL × 2` spurious bits (complements of the original padding A's), where `n = ⌈SEQL / 4⌉` is the number of bytes in the array, appear at the start of the array. They are rotated out to the end of the array using `BitSlice<u8, Msb0>::rotate_left(padding)` from the `bitvec` crate, which is SIMD-accelerated. The trailing `padding` bits are then zeroed:
|
||||
|
||||
```rust
|
||||
shift = n * 8 - SEQL * 2 // number of padding bits
|
||||
bits.rotate_left(shift)
|
||||
bits[len - shift..].fill(false)
|
||||
```
|
||||
|
||||
`Msb0` ordering makes the bit layout hardware-independent.
|
||||
|
||||
!!! abstract "Algorithm — Super-kmer canonisation"
|
||||
```text
|
||||
procedure SuperKmerCanonical(seq, SEQL):
|
||||
for i ← 0 to SEQL − 1:
|
||||
fwd ← nucleotide(seq, i)
|
||||
rev ← complement(nucleotide(seq, SEQL − 1 − i))
|
||||
if fwd < rev: return seq -- forward is canonical
|
||||
if fwd > rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
|
||||
return seq -- palindrome: either orientation valid
|
||||
```
|
||||
|
||||
## Kmer extraction
|
||||
|
||||
A k-mer is extracted from a super-kmer with `SuperKmer::kmer(i, k)`, which returns a `Result` wrapping a `Kmer` — a left-aligned `u64` newtype (see [Kmer implementation](kmer.md)):
|
||||
|
||||
```rust
|
||||
pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError>
|
||||
```
|
||||
|
||||
The bit slice `seq[i*2 .. (i+k)*2]` (Msb0 order) is loaded as a big-endian `u64` via `bitvec`'s `BitField::load_be`, then left-shifted by 64 − 2k bits to produce the left-aligned `Kmer` layout. One call — no loop, no allocation.
|
||||
|
||||
---
|
||||
|
||||
!!! abstract "Algorithm — Super-kmer reverse complement"
|
||||
```text
|
||||
procedure SuperKmerRevcomp(seq, SEQL):
|
||||
n ← ⌈SEQL / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − SEQL × 2 -- padding bits to flush
|
||||
|
||||
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
|
||||
lo ← 0 ; hi ← n − 1
|
||||
while lo < hi:
|
||||
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
|
||||
lo ← lo + 1 ; hi ← hi − 1
|
||||
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
|
||||
|
||||
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
|
||||
if shift > 0:
|
||||
bits.rotate_left(shift)
|
||||
bits[n×8 − shift .. n×8].fill(0)
|
||||
```
|
||||
@@ -0,0 +1,15 @@
|
||||
# obikmer
|
||||
|
||||
`obikmer` is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.
|
||||
|
||||
## Constraints
|
||||
|
||||
- Target scale: metagenomic data, tens of Gbases, billions of kmers
|
||||
- Maximum efficiency in computation, memory, and disk usage
|
||||
- Input formats: FASTA, FASTQ, gzip, streaming stdin
|
||||
|
||||
## Priority operations
|
||||
|
||||
- Kmer counting (frequencies)
|
||||
- Fast search / query
|
||||
- Set operations: union, intersection, difference
|
||||
@@ -0,0 +1,243 @@
|
||||
%% This BibTeX bibliography file was created using BibDesk.
|
||||
%% https://bibdesk.sourceforge.io/
|
||||
|
||||
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
|
||||
|
||||
|
||||
%% Saved with string encoding Unicode (UTF-8)
|
||||
|
||||
|
||||
|
||||
@article{Zheng2020-ji,
|
||||
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
|
||||
string, with the guarantee that similar set of k-mers will be
|
||||
chosen on similar strings. It is parameterized by the k-mer
|
||||
length k, a window length w and an order on the k-mers.
|
||||
Minimizers are used in a large number of softwares and pipelines
|
||||
to improve computation efficiency and decrease memory usage.
|
||||
Despite the method's popularity, many theoretical questions
|
||||
regarding its performance remain open. The core metric for
|
||||
measuring performance of a minimizer is the density, which
|
||||
measures the sparsity of sampled k-mers. The theoretical optimal
|
||||
density for a minimizer is 1/w, provably not achievable in
|
||||
general. For given k and w, little is known about asymptotically
|
||||
optimal minimizers, that is minimizers with density O(1/w).
|
||||
RESULTS: We derive a necessary and sufficient condition for
|
||||
existence of asymptotically optimal minimizers. We also provide a
|
||||
randomized algorithm, called the Miniception, to design
|
||||
minimizers with the best theoretical guarantee to date on density
|
||||
in practical scenarios. Constructing and using the Miniception is
|
||||
as easy as constructing and using a random minimizer, which
|
||||
allows the design of efficient minimizers that scale to the
|
||||
values of k and w used in current bioinformatics software
|
||||
programs. AVAILABILITY AND IMPLEMENTATION: Reference
|
||||
implementation of the Miniception and the codes for analysis can
|
||||
be found at https://github.com/kingsford-group/miniception.
|
||||
SUPPLEMENTARY INFORMATION: Supplementary data are available at
|
||||
Bioinformatics online.},
|
||||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||||
doi = {10.1093/bioinformatics/btaa472},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = jul,
|
||||
number = {Suppl\_1},
|
||||
pages = {i119--i127},
|
||||
pmc = {PMC8248892},
|
||||
pmid = 32657376,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Improved design and analysis of practical minimizers},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
|
||||
volume = 36,
|
||||
year = 2020,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
|
||||
|
||||
@article{Zheng2021-cc,
|
||||
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
|
||||
from genomic sequences that unconditionally preserve sufficiently
|
||||
long matches between sequences. Well-established methods to
|
||||
construct efficient minimizers focus on sampling fewer k-mers on
|
||||
a random sequence and use universal hitting sets (sets of k-mers
|
||||
that appear frequently enough) to upper bound the sketch size. In
|
||||
contrast, the problem of sequence-specific minimizers, which is
|
||||
to construct efficient minimizers to sample fewer k-mers on a
|
||||
specific sequence such as the reference genome, is less studied.
|
||||
Currently, the theoretical understanding of this problem is
|
||||
lacking, and existing methods do not specialize well to sketch
|
||||
specific sequences. RESULTS: We propose the concept of polar
|
||||
sets, complementary to the existing idea of universal hitting
|
||||
sets. Polar sets are k-mer sets that are spread out enough on the
|
||||
reference, and provably specialize well to specific sequences.
|
||||
Link energy measures how well spread out a polar set is, and with
|
||||
it, the sketch size can be bounded from above and below in a
|
||||
theoretically sound way. This allows for direct optimization of
|
||||
sketch size. We propose efficient heuristics to construct polar
|
||||
sets, and via experiments on the human reference genome, show
|
||||
their practical superiority in designing efficient
|
||||
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
|
||||
reference implementation and code for analyses under an
|
||||
open-source license are at
|
||||
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
|
||||
INFORMATION: Supplementary data are available at Bioinformatics
|
||||
online.},
|
||||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||||
doi = {10.1093/bioinformatics/btab313},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = jul,
|
||||
number = {Suppl\_1},
|
||||
pages = {i187--i195},
|
||||
pmc = {PMC8686682},
|
||||
pmid = 34252928,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Sequence-specific minimizers via polar sets},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
|
||||
volume = 37,
|
||||
year = 2021,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
|
||||
|
||||
@article{Pan2024-hb,
|
||||
abstract = {MOTIVATION: The minimizer concept is a data structure for
|
||||
sequence sketching. The standard canonical minimizer selects a
|
||||
subset of k-mers from the given DNA sequence by comparing the
|
||||
forward and reverse k-mers in a window simultaneously according
|
||||
to a predefined selection scheme. It is widely employed by
|
||||
sequence analysis such as read mapping and assembly. k-mer
|
||||
density, k-mer repetitiveness (e.g. k-mer bias), and
|
||||
computational efficiency are three critical measurements for
|
||||
minimizer selection schemes. However, there exist trade-offs
|
||||
between kinds of minimizer variants. Generic, effective, and
|
||||
efficient are always the requirements for high-performance
|
||||
minimizer algorithms. RESULTS: We propose a simple minimizer
|
||||
operator as a refinement of the standard canonical minimizer. It
|
||||
takes only a few operations to compute. However, it can improve
|
||||
the k-mer repetitiveness, especially for the lexicographic order.
|
||||
It applies to other selection schemes of total orders (e.g.
|
||||
random orders). Moreover, it is computationally efficient and the
|
||||
density is close to that of the standard minimizer. The refined
|
||||
minimizer may benefit high-performance applications like binning
|
||||
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
|
||||
code of the benchmark in this work is available at the github
|
||||
repository https://github.com/xp3i4/mini\_benchmark.},
|
||||
author = {Pan, Chenxu and Reinert, Knut},
|
||||
doi = {10.1093/bioinformatics/btae045},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = feb,
|
||||
number = 2,
|
||||
pmc = {PMC10868324},
|
||||
pmid = 38269626,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
|
||||
volume = 40,
|
||||
year = 2024,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
|
||||
|
||||
@article{Kille2023-px,
|
||||
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
|
||||
a convenient proxy for sequence identity. By avoiding expensive
|
||||
base-level alignments and comparing reduced sequence
|
||||
representations, tools such as MashMap can scale to massive
|
||||
numbers of pairwise comparisons while still providing useful
|
||||
similarity estimates. However, due to their reliance on minimizer
|
||||
winnowing, previous versions of MashMap were shown to be biased
|
||||
and inconsistent estimators of Jaccard similarity. This directly
|
||||
impacts downstream tools that rely on the accuracy of these
|
||||
estimates. RESULTS: To address this, we propose the minmer
|
||||
winnowing scheme, which generalizes the minimizer scheme by use
|
||||
of a rolling minhash with multiple sampled k-mers per window. We
|
||||
show both theoretically and empirically that minmers yield an
|
||||
unbiased estimator of local Jaccard similarity, and we implement
|
||||
this scheme in an updated version of MashMap. The minmer-based
|
||||
implementation is over 10 times faster than the minimizer-based
|
||||
version under the default ANI threshold, making it well-suited
|
||||
for large-scale comparative genomics applications. AVAILABILITY
|
||||
AND IMPLEMENTATION: MashMap3 is available at
|
||||
https://github.com/marbl/MashMap.},
|
||||
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
|
||||
doi = {10.1093/bioinformatics/btad512},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = sep,
|
||||
number = 9,
|
||||
pmc = {PMC10505501},
|
||||
pmid = 37603771,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
|
||||
volume = 39,
|
||||
year = 2023,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
|
||||
|
||||
@incollection{Golan2025-xf,
|
||||
address = {Cham},
|
||||
author = {Golan, Shay and Shur, Arseny M},
|
||||
booktitle = {Lecture Notes in Computer Science},
|
||||
doi = {10.1007/978-3-031-82670-2\_25},
|
||||
isbn = {9783031826696,9783031826702},
|
||||
issn = {0302-9743,1611-3349},
|
||||
language = {en},
|
||||
pages = {347--360},
|
||||
publisher = {Springer Nature Switzerland},
|
||||
series = {Lecture Notes in Computer Science},
|
||||
title = {Expected density of random minimizers},
|
||||
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||||
year = 2025,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||||
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
|
||||
|
||||
@article{Mohamadi2017-ok,
|
||||
abstract = {Motivation: Many bioinformatics algorithms are designed for the
|
||||
analysis of sequences of some uniform length, conventionally
|
||||
referred to as k -mers. These include de Bruijn graph assembly
|
||||
methods and sequence alignment tools. An efficient algorithm to
|
||||
enumerate the number of unique k -mers, or even better, to build
|
||||
a histogram of k -mer frequencies would be desirable for these
|
||||
tools and their downstream analysis pipelines. Among other
|
||||
applications, estimated frequencies can be used to predict genome
|
||||
sizes, measure sequencing error rates, and tune runtime
|
||||
parameters for analysis tools. However, calculating a k -mer
|
||||
histogram from large volumes of sequencing data is a challenging
|
||||
task. Results: Here, we present ntCard, a streaming algorithm for
|
||||
estimating the frequencies of k -mers in genomics datasets. At
|
||||
its core, ntCard uses the ntHash algorithm to efficiently compute
|
||||
hash values for streamed sequences. It then samples the
|
||||
calculated hash values to build a reduced representation
|
||||
multiplicity table describing the sample distribution. Finally,
|
||||
it uses a statistical model to reconstruct the population
|
||||
distribution from the sample distribution. We have compared the
|
||||
performance of ntCard and other cardinality estimation
|
||||
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
|
||||
in size, where the first two representing whole genome shotgun
|
||||
sequencing experiments on the human genome and the last one on
|
||||
the white spruce genome. Results show ntCard estimates k -mer
|
||||
coverage frequencies >15× faster than the state-of-the-art
|
||||
algorithms, using similar amount of memory, and with higher
|
||||
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
|
||||
potentially enabling technology for large-scale genomics
|
||||
applications. Availability and Implementation: ntCard is written
|
||||
in C ++ and is released under the GPL license. It is freely
|
||||
available at https://github.com/bcgsc/ntCard. Contact:
|
||||
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
|
||||
Supplementary data are available at Bioinformatics online.},
|
||||
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
|
||||
date-modified = {2026-04-18 08:19:36 +0200},
|
||||
doi = {10.1093/bioinformatics/btw832},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = may,
|
||||
number = 9,
|
||||
pages = {1324--1330},
|
||||
pmc = {PMC5408799},
|
||||
pmid = 28453674,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
|
||||
volume = 33,
|
||||
year = 2017,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}
|
||||
@@ -0,0 +1,38 @@
|
||||
# DNA encoding
|
||||
|
||||
## 2-bit nucleotide encoding
|
||||
|
||||
All nucleotides are encoded on 2 bits, MSB-first within each word. Nucleotides are numbered 0-based from the 5′ end across all sequence types:
|
||||
|
||||
| Base | Encoding |
|
||||
|------|----------|
|
||||
| A | `00` |
|
||||
| C | `01` |
|
||||
| G | `10` |
|
||||
| T | `11` |
|
||||
|
||||
The Watson-Crick complement of any base is its bitwise NOT on 2 bits: `complement(base) = ~base & 0b11`.
|
||||
|
||||
## Kmer encoding
|
||||
|
||||
A kmer fits in a single `u64`. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): `(kmer >> (62 - 2*i)) & 0b11`.
|
||||
|
||||
Reverse complement is computed via a **16-bit lookup table** (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.
|
||||
|
||||
!!! abstract "Algorithm — Kmer reverse complement"
|
||||
```text
|
||||
procedure KmerRevcomp(kmer, k):
|
||||
raw ← TABLE16[kmer & 0xFFFF] << 48
|
||||
| TABLE16[(kmer >> 16) & 0xFFFF] << 32
|
||||
| TABLE16[(kmer >> 32) & 0xFFFF] << 16
|
||||
| TABLE16[(kmer >> 48) & 0xFFFF]
|
||||
return raw << (64 - 2*k)
|
||||
```
|
||||
|
||||
The **canonical form** is the lexicographic minimum of the kmer and its reverse complement:
|
||||
|
||||
```
|
||||
canonical(kmer) = min(kmer, revcomp(kmer))
|
||||
```
|
||||
|
||||
This halves the kmer space and ensures strand-independent counting.
|
||||
@@ -0,0 +1,68 @@
|
||||
# Kmer entropy filter
|
||||
|
||||
Low-complexity kmers (polyA, polyT, tandem repeats) are detected and excluded during phase 1. The filter computes a **normalized Shannon entropy** over sub-words of multiple sizes, corrected for two sources of bias: the small number of observations within a single kmer, and the unequal sizes of circular equivalence classes.
|
||||
|
||||
## Sub-word frequencies
|
||||
|
||||
For a kmer of length k and a sub-word size ws (1 ≤ ws ≤ ws_max, typically ws_max = 6), extract the $n_{\text{words}} = k - ws + 1$ overlapping sub-words by sliding a window of length ws:
|
||||
|
||||
$$w_i = \text{kmer}[i \mathinner{..} i+ws-1], \quad i = 0, \ldots, n_{\text{words}}-1$$
|
||||
|
||||
Each sub-word is mapped to its **circular canonical form**: the lexicographic minimum among all cyclic rotations of the word **and all cyclic rotations of its reverse complement**. This extended equivalence relation ensures that entropy(K) = entropy(revcomp(K)) — the filter is strand-symmetric. Let $s_j$ be the size of equivalence class $j$ (number of distinct raw words mapping to canonical form $j$), and $f_j$ the count of canonical form $j$ among the $n_{\text{words}}$ sub-words ($\sum_j f_j = n_{\text{words}}$).
|
||||
|
||||
## Corrected Shannon entropy
|
||||
|
||||
The circular equivalence classes have unequal sizes: under a uniform distribution over all $4^{ws}$ raw words, class $j$ is visited with probability $s_j / 4^{ws}$, not $1/n_a$ (where $n_a$ is the number of equivalence classes). Computing entropy directly over canonical classes therefore underestimates the entropy of a random sequence.
|
||||
|
||||
The correction "unfolds" each canonical class back to its member raw words, redistributing each observation of class $j$ equally among its $s_j$ members:
|
||||
|
||||
$$H_{\text{corr}} = \log(n_{\text{words}}) - \frac{1}{n_{\text{words}}} \sum_j f_j \log f_j + \frac{1}{n_{\text{words}}} \sum_j f_j \log s_j$$
|
||||
|
||||
The last term is the correction for unequal class sizes. For a uniformly random sequence ($f_j \approx n_{\text{words}} \cdot s_j / 4^{ws}$), this gives $H_{\text{corr}} \approx \log(4^{ws}) = 2 \cdot ws \cdot \log 2$, the maximum entropy over raw words.
|
||||
|
||||
## Maximum entropy correction for small samples
|
||||
|
||||
With only $n_{\text{words}}$ observations over $4^{ws}$ possible raw words, the achievable maximum entropy is bounded by the most uniform integer distribution over $4^{ws}$ categories.
|
||||
|
||||
Let $c = \lfloor n_{\text{words}} / 4^{ws} \rfloor$ and $r = n_{\text{words}} \bmod 4^{ws}$. The most uniform integer distribution assigns frequency $c+1$ to $r$ categories and $c$ to the remaining $4^{ws} - r$, with the convention $0 \log 0 = 0$:
|
||||
|
||||
$$H_{\max} = -\left[(4^{ws} - r)\,\frac{c}{n_{\text{words}}}\log\frac{c}{n_{\text{words}}} + r\,\frac{c+1}{n_{\text{words}}}\log\frac{c+1}{n_{\text{words}}}\right]$$
|
||||
|
||||
When $n_{\text{words}} < 4^{ws}$: $c=0$, $r=n_{\text{words}}$, and the formula reduces to $H_{\max} = \log(n_{\text{words}})$ — a single unified expression covers both regimes. A truly random sequence achieves $H_{\text{corr}} \approx H_{\max}$.
|
||||
|
||||
## Normalized entropy
|
||||
|
||||
$$\hat{H}(ws) = \frac{H_{\text{corr}}}{H_{\max}} \in [0, 1]$$
|
||||
|
||||
## Final score
|
||||
|
||||
The filter computes $\hat{H}(ws)$ for each word size ws from 1 to ws_max and returns the **minimum**:
|
||||
|
||||
$$\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)$$
|
||||
|
||||
A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if $\text{entropy}(kmer) \leq \theta$, where $\theta$ is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.
|
||||
|
||||
## Interpretation as an effective number of classes
|
||||
|
||||
$H_{\text{corr}}$ is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: $N_{\text{eff}} = e^{H_{\text{corr}}}$ is the number of equiprobable classes that would yield the same entropy.
|
||||
|
||||
For the normalised score $\hat{H}$, dividing by $H_{\text{max}}$ changes the logarithm base. Writing $N_{\text{max}} = e^{H_{\text{max}}}$:
|
||||
|
||||
$$\hat{H} = \frac{\log N_{\text{eff}}}{\log N_{\text{max}}} = \log_{N_{\text{max}}} N_{\text{eff}} \quad \Longleftrightarrow \quad N_{\text{eff}} = N_{\text{max}}^{\,\hat{H}}$$
|
||||
|
||||
The property is preserved: $\hat{H}$ is the logarithm (in base $N_{\text{max}}$) of the effective number of equi-represented classes.
|
||||
|
||||
In the large-sample limit ($n_{\text{words}} \gg 4^{ws}$), $N_{\text{max}} \approx 4^{ws}$, giving:
|
||||
|
||||
$$N_{\text{eff}} \approx 4^{ws \cdot \hat{H}}$$
|
||||
|
||||
This has a clean interpretation: $ws \cdot \hat{H}$ is the **effective word length** (in bases) of a perfectly uniform distribution that would produce the same entropy. At $\hat{H} = 1$ the full space of $4^{ws}$ words is used; at $\hat{H} = 0.5$ with ws=2, only $4^1 = 4$ effective classes out of 16 are occupied.
|
||||
|
||||
In our actual regime, $n_{\text{words}}$ is small and $4^{ws}$ can exceed $n_{\text{words}}$, so $H_{\text{max}} < \log(4^{ws})$ due to the small-sample correction. The exact effective count is $N_{\text{max}}^{\hat{H}}$, not $4^{ws \cdot \hat{H}}$.
|
||||
|
||||
## Properties
|
||||
|
||||
The entropy score is a function of the kmer sequence alone — it does not depend on the surrounding context or on the position within any genome. Two consequences:
|
||||
|
||||
- **Orientation invariance**: $\text{entropy}(K) = \text{entropy}(\text{revcomp}(K))$, guaranteed by the strand-symmetric canonical form.
|
||||
- **Context independence**: the same kmer is always rejected or always kept, regardless of which genome it occurs in, where in that genome it appears, or which strand is considered. The filter defines a fixed partition of the kmer space into low-complexity and valid kmers.
|
||||
@@ -0,0 +1,28 @@
|
||||
# Partitioning and indexing architecture
|
||||
|
||||
The canonical minimizer of a super-kmer is hashed to produce a **p-bit routing value** (p is a collection-level parameter):
|
||||
|
||||
```
|
||||
canonical minimizer → hash(minimizer) → p-bit value → PART → partition directory
|
||||
```
|
||||
|
||||
PART is computed once at phase 1 to open the correct partition file, then discarded. It is recomputed on the fly at query time. It is never stored in the super-kmer header.
|
||||
|
||||
Each partition holds one MPHF instance (phase 6) that indexes kmers as plain u64 values — the minimizer plays no role inside the partition.
|
||||
|
||||
## Why hashing is necessary
|
||||
|
||||
The canonical minimizer is an m-mer (m ∈ {9, 11, 13, 15}), encoded in 2m bits (18 to 30 bits). Its distribution over the $4^m$ possible values is **not uniform**: because the minimizer is the lexicographic minimum of a window of m-mers, small values are systematically over-represented [@Zheng2020-ji; @Zheng2021-cc; @Pan2024-hb; @Kille2023-px; @Golan2025-xf]. Routing directly by the raw minimizer value would produce severely unbalanced partitions.
|
||||
|
||||
A hash function with good avalanche properties redistributes this skewed distribution uniformly over the $2^p$ partition slots. The key reason this works well is the **entropy gap**: p is chosen to be much smaller than 2m, so the hash compresses many distinct minimizer values into each partition slot. Even under strong bias in the minimizer distribution, as long as its effective entropy exceeds p bits — which holds comfortably since the set of distinct minimizers in any real dataset is far larger than $2^p$ — the load imbalance across partitions is negligible.
|
||||
|
||||
## Parameter choices
|
||||
|
||||
| m | 2m (bits) | Typical p | Partitions |
|
||||
|----|-----------|-----------|------------|
|
||||
| 9 | 18 | 6–8 | 64–256 |
|
||||
| 11 | 22 | 8–10 | 256–1 024 |
|
||||
| 13 | 26 | 10–12 | 1 024–4 096|
|
||||
| 15 | 30 | 10–14 | 1 024–16 384|
|
||||
|
||||
The hard constraint is p ≤ 2m: one cannot extract more bits of uniform randomness from a source than it contains. In practice p is chosen well below 2m, leaving a large entropy margin that absorbs the minimizer bias. For k=31, m=13, p=10: 1 024 partitions with comfortable balance.
|
||||
@@ -0,0 +1,32 @@
|
||||
# Kmers and super-kmers
|
||||
|
||||
## Kmers
|
||||
|
||||
A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k:
|
||||
|
||||
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.
|
||||
- **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
|
||||
|
||||
## Super-kmers
|
||||
|
||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same **canonical minimizer**. The **canonical minimizer** of a kmer is the smallest value of `min(m-mer, revcomp(m-mer))` over all m-mers within the kmer (m < k, m odd).
|
||||
|
||||
### Canonical super-kmers
|
||||
|
||||
A **canonical super-kmer** is the lexicographic minimum of a super-kmer and its reverse complement:
|
||||
|
||||
```
|
||||
canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
|
||||
```
|
||||
|
||||
When a read and its reverse-complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.
|
||||
|
||||
### Expected length of a super-kmer
|
||||
|
||||
For a random minimizer of length m over k-mers of length k, the density of minimizer positions is approximately 2/(k−m+2) [@Zheng2020-ji; @Golan2025-xf], so the expected number of consecutive k-mers per super-kmer is (k−m+2)/2. A run of n k-mers spans n + k − 1 nucleotides, giving:
|
||||
|
||||
$$L_{\text{nt}} = \frac{k-m+2}{2} + k - 1$$
|
||||
|
||||
For k=31, m=13: expected ≈ 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.[^superkmer_length]
|
||||
|
||||
[^superkmer_length]: The expected length formula and the density approximation 2/(k−m+2) should be verified against the values reported in [@Zheng2020-ji] and [@Golan2025-xf].
|
||||
@@ -0,0 +1,377 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" initialize-with-hyphen="false" page-range-format="minimal">
|
||||
<!-- Style metadata: title, identifier, self and documentation links, author and contributors, categories (numeric citation format; generic-base and medicine fields), summary, last-updated timestamp and license. -->
<info>
|
||||
<title>Vancouver</title>
|
||||
<id>http://www.zotero.org/styles/vancouver</id>
|
||||
<link href="http://www.zotero.org/styles/vancouver" rel="self"/>
|
||||
<link href="http://www.nlm.nih.gov/bsd/uniform_requirements.html" rel="documentation"/>
|
||||
<author>
|
||||
<name>Michael Berkowitz</name>
|
||||
<email>mberkowi@gmu.edu</email>
|
||||
</author>
|
||||
<contributor>
|
||||
<name>Sean Takats</name>
|
||||
<email>stakats@gmu.edu</email>
|
||||
</contributor>
|
||||
<contributor>
|
||||
<name>Sebastian Karcher</name>
|
||||
</contributor>
|
||||
<category citation-format="numeric"/>
|
||||
<category field="generic-base"/>
|
||||
<category field="medicine"/>
|
||||
<summary>Vancouver style as outlined by International Committee of Medical Journal Editors Uniform Requirements for Manuscripts Submitted to Biomedical Journals: Sample References</summary>
|
||||
<updated>2025-05-17T20:55:38-04:00</updated>
|
||||
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
|
||||
</info>
|
||||
<!-- English locale overrides. Text-form dates render as year, short month (periods stripped), day, separated by spaces. Also defines the terms the macros of this style call: the long collection-editor label, "presented at", "available at" (rendered as "available from") and the short "section" term. -->
<locale xml:lang="en">
|
||||
<date form="text" delimiter=" ">
|
||||
<date-part name="year"/>
|
||||
<date-part name="month" form="short" strip-periods="true"/>
|
||||
<date-part name="day"/>
|
||||
</date>
|
||||
<terms>
|
||||
<term name="collection-editor" form="long">
|
||||
<single>editor</single>
|
||||
<multiple>editors</multiple>
|
||||
</term>
|
||||
<term name="presented at">presented at</term>
|
||||
<term name="available at">available from</term>
|
||||
<term name="section" form="short">sect.</term>
|
||||
</terms>
|
||||
</locale>
|
||||
<!-- French locale override: text-form dates render as day, short month (periods stripped), year, separated by spaces. No terms are redefined here. -->
<locale xml:lang="fr">
|
||||
<date form="text" delimiter=" ">
|
||||
<date-part name="day"/>
|
||||
<date-part name="month" form="short" strip-periods="true"/>
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</locale>
|
||||
<!-- Author list: every name in sort order (name-as-sort-order="all") with a space between name parts, given names initialized with nothing appended (initialize-with=""), names joined by ", ", followed by a long role label prefixed with ", ". When there is no author, the substitute falls back to the webpage-title macro, then to the editor names. -->
<macro name="author">
|
||||
<names variable="author">
|
||||
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
|
||||
<label form="long" prefix=", "/>
|
||||
<substitute>
|
||||
<text macro="webpage-title"/>
|
||||
<names variable="editor"/>
|
||||
</substitute>
|
||||
</names>
|
||||
</macro>
|
||||
<!-- Editor list: same name formatting attributes as the author macro, followed by a long role label prefixed with ", " and a trailing period on the whole names element. -->
<macro name="editor">
|
||||
<names variable="editor" suffix=".">
|
||||
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
|
||||
<label form="long" prefix=", "/>
|
||||
</names>
|
||||
</macro>
|
||||
<!-- Emits the "in" term with its first letter capitalized for chapters, conference papers and dictionary/encyclopedia entries; renders nothing for any other item type. -->
<macro name="chapter-marker">
|
||||
<choose>
|
||||
<if type="chapter paper-conference entry-dictionary entry-encyclopedia" match="any">
|
||||
<text term="in" text-case="capitalize-first"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Website name for "part of a website" citations: the container-title followed by the bracketed, capitalized "internet" term. Renders only when the item is a webpage AND has a container-title (match="all"); it is called from the author macro's substitute. -->
<macro name="webpage-title">
|
||||
<!--If a webpage has a container, we're assuming the citation is "part of a website" as per ch. 25 Citing Medicine https://www.ncbi.nlm.nih.gov/books/NBK7274/?report=reader -->
|
||||
<choose>
|
||||
<if type="webpage" variable="container-title" match="all">
|
||||
<group delimiter=" ">
|
||||
<text variable="container-title"/>
|
||||
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Publisher place and publisher name joined by ": " with a trailing ";". Renders only for items that are not journal, magazine or newspaper articles. The place is wrapped in square brackets for theses, omitted for speeches (empty else-if branch), and printed as-is for every other type. -->
<macro name="publisher">
|
||||
<choose>
|
||||
<!--discard publisher info for articles-->
|
||||
<if type="article-journal article-magazine article-newspaper" match="none">
|
||||
<group delimiter=": " suffix=";">
|
||||
<choose>
|
||||
<if type="thesis">
|
||||
<text variable="publisher-place" prefix="[" suffix="]"/>
|
||||
</if>
|
||||
<!-- Intentionally empty: speeches print no publisher-place here. -->
<else-if type="speech"/>
|
||||
<else>
|
||||
<text variable="publisher-place"/>
|
||||
</else>
|
||||
</choose>
|
||||
<text variable="publisher"/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- When the item has a URL, prints the capitalized "available at" term and the URL, separated by ": ". Renders nothing without a URL. -->
<macro name="access">
|
||||
<choose>
|
||||
<if variable="URL">
|
||||
<group delimiter=": ">
|
||||
<text term="available at" text-case="capitalize-first"/>
|
||||
<text variable="URL"/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- When the item has a URL, prints a square-bracketed group holding the lowercase "cited" term and the accessed date in the locale's text form, separated by a space. Renders nothing without a URL. -->
<macro name="accessed-date">
|
||||
<choose>
|
||||
<if variable="URL">
|
||||
<group prefix="[" suffix="]" delimiter=" ">
|
||||
<text term="cited" text-case="lowercase"/>
|
||||
<date variable="accessed" form="text"/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Container line, selected by item type:
     1. articles, chapters, conference papers, reviews and dictionary/encyclopedia entries: the container title with periods stripped (short form for journal articles and reviews), plus the bracketed "internet" term when a URL exists, closed by a period, then the edition macro;
     2. bills and legislation: container title, short "section" term with the section, and the number;
     3. speeches: genre and the "presented at" term, then the event, closed by ";";
     4. anything else: a short volume label with the volume (only when there is no collection-title) and the container title, closed by a period. -->
<macro name="container-title">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="any">
|
||||
<group suffix="." delimiter=" ">
|
||||
<choose>
|
||||
<if type="article-journal review review-book" match="any">
|
||||
<text variable="container-title" form="short" strip-periods="true"/>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="container-title" strip-periods="true"/>
|
||||
</else>
|
||||
</choose>
|
||||
<choose>
|
||||
<if variable="URL">
|
||||
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
|
||||
</if>
|
||||
</choose>
|
||||
</group>
|
||||
<text macro="edition" prefix=" "/>
|
||||
</if>
|
||||
<!--add event-name and event-place once they become available-->
|
||||
<else-if type="bill legislation" match="any">
|
||||
<group delimiter=", ">
|
||||
<group delimiter=". ">
|
||||
<text variable="container-title"/>
|
||||
<group delimiter=" ">
|
||||
<text term="section" form="short" text-case="capitalize-first"/>
|
||||
<text variable="section"/>
|
||||
</group>
|
||||
</group>
|
||||
<text variable="number"/>
|
||||
</group>
|
||||
</else-if>
|
||||
<else-if type="speech">
|
||||
<group delimiter=": " suffix=";">
|
||||
<group delimiter=" ">
|
||||
<text variable="genre" text-case="capitalize-first"/>
|
||||
<text term="presented at"/>
|
||||
</group>
|
||||
<text variable="event"/>
|
||||
</group>
|
||||
</else-if>
|
||||
<else>
|
||||
<group delimiter=", " suffix=".">
|
||||
<choose>
|
||||
<!-- The volume is printed here only when no collection-title exists; the collection-details macro prints the volume alongside the collection title. -->
<if variable="collection-title" match="none">
|
||||
<group delimiter=" ">
|
||||
<label variable="volume" form="short" text-case="capitalize-first"/>
|
||||
<text variable="volume"/>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
<text variable="container-title"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Item title. Prints nothing for a webpage that has a container-title (the webpage-part macro prints that title instead). Otherwise prints the title; for item types outside the article/chapter/paper/review/entry list it also appends the bracketed "internet" term when a URL exists, followed by the edition macro. Theses additionally get the genre in square brackets. -->
<macro name="title">
|
||||
<choose>
|
||||
<!-- Intentionally empty branch: suppresses the title for "part of a website" items. -->
<if type="webpage" variable="container-title" match="all"/>
|
||||
<else>
|
||||
<text variable="title"/>
|
||||
<choose>
|
||||
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="none">
|
||||
<choose>
|
||||
<if variable="URL">
|
||||
<text term="internet" prefix=" [" suffix="]" text-case="capitalize-first"/>
|
||||
</if>
|
||||
</choose>
|
||||
<text macro="edition" prefix=". "/>
|
||||
</if>
|
||||
</choose>
|
||||
</else>
|
||||
</choose>
|
||||
<choose>
|
||||
<if type="thesis">
|
||||
<text variable="genre" prefix=" [" suffix="]"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Edition statement: a numeric edition prints as an ordinal number followed by the short "edition" term; a non-numeric edition prints verbatim with a trailing period. -->
<macro name="edition">
|
||||
<choose>
|
||||
<if is-numeric="edition">
|
||||
<group delimiter=" ">
|
||||
<number variable="edition" form="ordinal"/>
|
||||
<text term="edition" form="short"/>
|
||||
</group>
|
||||
</if>
|
||||
<else>
|
||||
<text variable="edition" suffix="."/>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Issued date, selected by item type:
     1. journal/magazine/newspaper articles and reviews: full text-form issued date plus the accessed-date macro, closed by ";";
     2. bills and legislation: "short-month day" and the year, separated by ", " (no accessed date);
     3. reports: year and short month, then the accessed-date macro;
     4. patents: the number and the year separated by ", ", then the accessed-date macro, closed by a period;
     5. speeches: year, short month, day and the accessed-date macro, then the event-place after "; ";
     6. anything else: the year only, then the accessed-date macro, closed by a period. -->
<macro name="date">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
|
||||
<group suffix=";" delimiter=" ">
|
||||
<date variable="issued" form="text"/>
|
||||
<text macro="accessed-date"/>
|
||||
</group>
|
||||
</if>
|
||||
<else-if type="bill legislation" match="any">
|
||||
<group delimiter=", ">
|
||||
<!-- The issued date is rendered twice so that month/day and year become separate group members joined by ", ". -->
<date variable="issued" delimiter=" ">
|
||||
<date-part name="month" form="short" strip-periods="true"/>
|
||||
<date-part name="day"/>
|
||||
</date>
|
||||
<date variable="issued">
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</group>
|
||||
</else-if>
|
||||
<else-if type="report">
|
||||
<date variable="issued" delimiter=" ">
|
||||
<date-part name="year"/>
|
||||
<date-part name="month" form="short" strip-periods="true"/>
|
||||
</date>
|
||||
<text macro="accessed-date" prefix=" "/>
|
||||
</else-if>
|
||||
<else-if type="patent">
|
||||
<group suffix=".">
|
||||
<group delimiter=", ">
|
||||
<text variable="number"/>
|
||||
<date variable="issued">
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
</group>
|
||||
<text macro="accessed-date" prefix=" "/>
|
||||
</group>
|
||||
</else-if>
|
||||
<else-if type="speech">
|
||||
<group delimiter="; ">
|
||||
<group delimiter=" ">
|
||||
<date variable="issued" delimiter=" ">
|
||||
<date-part name="year"/>
|
||||
<date-part name="month" form="short" strip-periods="true"/>
|
||||
<date-part name="day"/>
|
||||
</date>
|
||||
<text macro="accessed-date"/>
|
||||
</group>
|
||||
<text variable="event-place"/>
|
||||
</group>
|
||||
</else-if>
|
||||
<else>
|
||||
<group suffix=".">
|
||||
<date variable="issued">
|
||||
<date-part name="year"/>
|
||||
</date>
|
||||
<text macro="accessed-date" prefix=" "/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Page information, selected by item type: articles and reviews print ":" followed by the page range; books print the number of pages, followed by a short label only when that value is numeric; every other type prints a short page label and the page value. Labels use plural="never", so the singular label form is always used. -->
<macro name="pages">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
|
||||
<text variable="page" prefix=":"/>
|
||||
</if>
|
||||
<else-if type="book" match="any">
|
||||
<text variable="number-of-pages" prefix=" "/>
|
||||
<choose>
|
||||
<if is-numeric="number-of-pages">
|
||||
<label variable="number-of-pages" form="short" prefix=" " plural="never"/>
|
||||
</if>
|
||||
</choose>
|
||||
</else-if>
|
||||
<else>
|
||||
<group prefix=" " delimiter=" ">
|
||||
<label variable="page" form="short" plural="never"/>
|
||||
<text variable="page"/>
|
||||
</group>
|
||||
</else>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Volume immediately followed by the issue in parentheses, for journal and magazine articles and reviews only. Newspaper articles are not in the type list, so they print nothing here. -->
<macro name="journal-location">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine review review-book" match="any">
|
||||
<text variable="volume"/>
|
||||
<text variable="issue" prefix="(" suffix=")"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Prints the item title for a webpage that has a container-title, which is the same condition under which the title macro prints nothing. Renders nothing for any other item. -->
<macro name="webpage-part">
|
||||
<choose>
|
||||
<if type="webpage" variable="container-title" match="all">
|
||||
<text variable="title"/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- Series information in parentheses, for items that are not articles or reviews and that have a collection-title: the collection editors (formatted like authors, with a long role label and a trailing period), then the collection title and a short volume label with the volume, separated by "; ". -->
<macro name="collection-details">
|
||||
<choose>
|
||||
<if type="article-journal article-magazine article-newspaper review review-book" match="none">
|
||||
<choose>
|
||||
<if variable="collection-title">
|
||||
<group delimiter=" " prefix="(" suffix=")">
|
||||
<names variable="collection-editor" suffix=".">
|
||||
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
|
||||
<label form="long" prefix=", "/>
|
||||
</names>
|
||||
<group delimiter="; ">
|
||||
<text variable="collection-title"/>
|
||||
<group delimiter=" ">
|
||||
<label variable="volume" form="short"/>
|
||||
<text variable="volume"/>
|
||||
</group>
|
||||
</group>
|
||||
</group>
|
||||
</if>
|
||||
</choose>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- For reports only: prints the number variable prefixed with the literal text "Report No.: ". The prefix is a hard-coded English string, not a locale term. -->
<macro name="report-details">
|
||||
<choose>
|
||||
<if type="report">
|
||||
<text variable="number" prefix="Report No.: "/>
|
||||
</if>
|
||||
</choose>
|
||||
</macro>
|
||||
<!-- In-text citation: cites are sorted by citation number and rendered as the bare number, joined by "," (no space) and wrapped in parentheses. collapse="citation-number" asks the processor to collapse runs of consecutive numbers into ranges (see the CSL specification for the exact rules). -->
<citation collapse="citation-number">
|
||||
<sort>
|
||||
<key variable="citation-number"/>
|
||||
</sort>
|
||||
<layout prefix="(" suffix=")" delimiter=",">
|
||||
<text variable="citation-number"/>
|
||||
</layout>
|
||||
</citation>
|
||||
<!-- Bibliography entry layout, in order: the citation number with a period; author and title (joined and closed by ". "); then a space-delimited group holding the chapter marker with editor and container title, the publisher, the date/journal-location/pages run and the webpage part; then the collection details, report details and access macros. et-al-min="7" with et-al-use-first="6" abbreviates name lists of seven or more names to the first six; second-field-align="flush" aligns the entry text after the number. -->
<bibliography et-al-min="7" et-al-use-first="6" second-field-align="flush">
|
||||
<layout>
|
||||
<text variable="citation-number" suffix="."/>
|
||||
<group delimiter=". " suffix=". ">
|
||||
<text macro="author"/>
|
||||
<text macro="title"/>
|
||||
</group>
|
||||
<group delimiter=" " suffix=". ">
|
||||
<group delimiter=": ">
|
||||
<text macro="chapter-marker"/>
|
||||
<group delimiter=" ">
|
||||
<text macro="editor"/>
|
||||
<text macro="container-title"/>
|
||||
</group>
|
||||
</group>
|
||||
<text macro="publisher"/>
|
||||
<!-- No delimiter on this group: date, volume(issue) and pages are concatenated directly, relying on the suffixes and prefixes defined inside those macros. -->
<group>
|
||||
<text macro="date"/>
|
||||
<text macro="journal-location"/>
|
||||
<text macro="pages"/>
|
||||
</group>
|
||||
<text macro="webpage-part"/>
|
||||
</group>
|
||||
<text macro="collection-details" suffix=". "/>
|
||||
<text macro="report-details" suffix=". "/>
|
||||
<text macro="access"/>
|
||||
</layout>
|
||||
</bibliography>
|
||||
</style>
|
||||
Reference in New Issue
Block a user