Push nvyqwlpspwvl #11
@@ -27,6 +27,18 @@
|
|||||||
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
|
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
|
||||||
- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths)
|
- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths)
|
||||||
|
|
||||||
|
## Parameter constraints (enforced at CLI)
|
||||||
|
|
||||||
|
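The k and m constraints below are checked by `CommonArgs::validate()` at the start of `superkmer` and `index`; the z constraint is checked by `index` itself when `--approx` is given. An invalid value makes the command exit immediately with an error.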
|
||||||
|
|
||||||
|
| Parameter | Constraint | Reason |
|
||||||
|
|-----------|-----------|--------|
|
||||||
|
| k (`--kmer-size`) | odd | even k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant |
|
||||||
|
| k (`--kmer-size`) | k ∈ [11, 31] | k > 32 overflows u64 at 2 bits/base, and 31 is the largest odd k that fits; k < 11 gives insufficient specificity |
|
||||||
|
| m (`--minimizer-size`) | odd | same palindrome argument as k |
|
||||||
|
| m (`--minimizer-size`) | 3 ≤ m ≤ k−1 | minimizer must be strictly shorter than the kmer |
|
||||||
|
| z (`-z`, Findere, `index --approx` only) | z ≤ k−1 | effective indexed kmer size is k−z+1; z ≥ k would make it ≤ 1 |
|
||||||
|
|
||||||
## Genome label constraints
|
## Genome label constraints
|
||||||
|
|
||||||
Genome labels are arbitrary Unicode strings with the following restrictions:
|
Genome labels are arbitrary Unicode strings with the following restrictions:
|
||||||
|
|||||||
+3
-1
@@ -4,9 +4,11 @@
|
|||||||
|
|
||||||
A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k:
|
A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k:
|
||||||
|
|
||||||
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.
|
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word (u64 at 2 bits/base requires k ≤ 32; k < 11 yields insufficient specificity).
|
||||||
- **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
|
- **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
|
||||||
|
|
||||||
|
Both constraints are **enforced at CLI entry** by `CommonArgs::validate()` in `superkmer` and `index`. Passing an invalid k exits immediately with an error message.
|
||||||
|
|
||||||
## Super-kmers
|
## Super-kmers
|
||||||
|
|
||||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same **canonical minimizer**. The **canonical minimizer** of a kmer is the m-mer (m < k) whose canonical hash `hash_kmer(min(m-mer, revcomp(m-mer)))` is smallest over all m-mers in the kmer window. The hash function is a `mix64`-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.
|
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same **canonical minimizer**. The **canonical minimizer** of a kmer is the m-mer (m < k) whose canonical hash `hash_kmer(min(m-mer, revcomp(m-mer)))` is smallest over all m-mers in the kmer window. The hash function is a `mix64`-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.
|
||||||
|
|||||||
@@ -63,6 +63,29 @@ pub fn block_size_to_bits(n: usize) -> u8 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl CommonArgs {
|
impl CommonArgs {
|
||||||
|
/// Validate the k-mer size (`kmer_size`) and minimizer size (`minimizer_size`).
///
/// The checks run in this order and stop at the first one that fails:
/// 1. `k` must lie in `[11, 31]`;
/// 2. `k` must be odd;
/// 3. `m` must lie in `[3, k - 1]`;
/// 4. `m` must be odd.
///
/// This method does not return an error. On the first violated constraint it
/// prints a message to stderr and terminates the process with exit status 1.
|
||||||
|
pub fn validate(&self) {
|
||||||
|
let k = self.kmer_size;
|
||||||
|
let m = self.minimizer_size;
|
||||||
|
|
||||||
|
if k < 11 || k > 31 {
|
||||||
|
eprintln!("error: --kmer-size must be in [11, 31] (got {k})");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
if k % 2 == 0 {
|
||||||
|
eprintln!("error: --kmer-size must be odd (got {k}); even k allows palindromic k-mers");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
if m < 3 || m >= k {
|
||||||
|
eprintln!("error: --minimizer-size must be in [3, k−1] = [3, {}] (got {m})", k - 1);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
if m % 2 == 0 {
|
||||||
|
eprintln!("error: --minimizer-size must be odd (got {m})");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn seqfile_paths(&self) -> obiread::PathIter {
|
pub fn seqfile_paths(&self) -> obiread::PathIter {
|
||||||
let paths: Vec<PathBuf> = if self.inputs.is_empty() {
|
let paths: Vec<PathBuf> = if self.inputs.is_empty() {
|
||||||
vec![PathBuf::from("-")]
|
vec![PathBuf::from("-")]
|
||||||
|
|||||||
@@ -152,12 +152,23 @@ pub(crate) fn resolve_approx_params(
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(args: IndexArgs) {
|
pub fn run(args: IndexArgs) {
|
||||||
|
args.common.validate();
|
||||||
|
|
||||||
let output = args.output.clone();
|
let output = args.output.clone();
|
||||||
let mut rep = Reporter::new();
|
let mut rep = Reporter::new();
|
||||||
|
|
||||||
// ── Resolve evidence kind ────────────────────────────────────────────────
|
// ── Resolve evidence kind ────────────────────────────────────────────────
|
||||||
let evidence = if args.approx {
|
let evidence = if args.approx {
|
||||||
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||||
|
let k = args.common.kmer_size;
|
||||||
|
if z as usize >= k {
|
||||||
|
eprintln!(
|
||||||
|
"error: Findere z={z} must be < kmer-size={k} \
|
||||||
|
(effective kmer size k−z+1 = {} ≤ 0)",
|
||||||
|
k as isize - z as isize + 1
|
||||||
|
);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
|
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
|
||||||
IndexMode::Approx { b, z }
|
IndexMode::Approx { b, z }
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ fn write_batch(
|
|||||||
// ── Entry point ───────────────────────────────────────────────────────────────
|
// ── Entry point ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn run(args: SuperkmerArgs) {
|
pub fn run(args: SuperkmerArgs) {
|
||||||
|
args.common.validate();
|
||||||
|
|
||||||
let k = args.common.kmer_size;
|
let k = args.common.kmer_size;
|
||||||
let m = args.common.minimizer_size;
|
let m = args.common.minimizer_size;
|
||||||
let theta = args.common.theta;
|
let theta = args.common.theta;
|
||||||
|
|||||||
Reference in New Issue
Block a user