Push nvyqwlpspwvl #11
@@ -27,6 +27,18 @@
|
||||
- Canonical form: `min(kmer, revcomp(kmer))` reduces strand-symmetric space by half
|
||||
- Input formats: FASTA, FASTQ, gzip, streaming stdin; `index` reads from stdin automatically when no input files are provided (`-` can also be passed explicitly among other paths)
|
||||
|
||||
## Parameter constraints (enforced at CLI)
|
||||
|
||||
The k and m constraints below are checked by `CommonArgs::validate()` at the start of `superkmer` and `index`; the z constraint is checked by `index` itself when `--approx` is set. Invalid values exit immediately with an error.
|
||||
|
||||
| Parameter | Constraint | Reason |
|
||||
|-----------|-----------|--------|
|
||||
| k (`--kmer-size`) | odd | even k allows palindromic k-mers: kmer == revcomp(kmer), breaking the canonical form invariant |
|
||||
| k (`--kmer-size`) | k ∈ [11, 31] | a u64 at 2 bits/base holds at most k = 32, and 31 is the largest odd value that fits; k < 11 gives insufficient specificity |
|
||||
| m (`--minimizer-size`) | odd | same palindrome argument as k |
|
||||
| m (`--minimizer-size`) | 3 ≤ m ≤ k−1 | minimizer must be strictly shorter than the kmer |
|
||||
| z (`-z`, Findere, `index --approx` only) | z ≤ k−1 | effective indexed kmer size is k−z+1; z ≥ k would make it ≤ 1 (z = k gives 1, z > k gives ≤ 0) |
|
||||
|
||||
## Genome label constraints
|
||||
|
||||
Genome labels are arbitrary Unicode strings with the following restrictions:
|
||||
|
||||
+3
-1
@@ -4,9 +4,11 @@
|
||||
|
||||
A **kmer** is a DNA subsequence of fixed length k. Two constraints govern the choice of k:
|
||||
|
||||
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.
|
||||
- **k ∈ [11, 31]**: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word (u64 at 2 bits/base requires k ≤ 32; k < 11 yields insufficient specificity).
|
||||
- **k is odd**: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form `min(kmer, revcomp(kmer))` is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.
|
||||
|
||||
Both constraints are **enforced at CLI entry** by `CommonArgs::validate()` in `superkmer` and `index`. Passing an invalid k exits immediately with an error message.
|
||||
|
||||
## Super-kmers
|
||||
|
||||
A **super-kmer** is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides, sharing the same **canonical minimizer**. The **canonical minimizer** of a kmer is the m-mer (m < k) whose canonical hash `hash_kmer(min(m-mer, revcomp(m-mer)))` is smallest over all m-mers in the kmer window. The hash function is a `mix64`-based bijection; selection is purely hash-ordered with no degeneracy filter. A super-kmer is capped at 256 nucleotides; a longer run is split at that boundary.
|
||||
|
||||
@@ -63,6 +63,29 @@ pub fn block_size_to_bits(n: usize) -> u8 {
|
||||
}
|
||||
|
||||
impl CommonArgs {
|
||||
/// Validate k and m constraints. Exits on error.
///
/// Reads `kmer_size` (k) and `minimizer_size` (m) from `self` and checks, in
/// order: k in [11, 31], k odd, m in [3, k-1], m odd. On the first violated
/// constraint an error line is written to stderr and the process terminates
/// with exit status 1. The function returns only when all four checks pass.
|
||||
pub fn validate(&self) {
|
||||
let k = self.kmer_size;
|
||||
let m = self.minimizer_size;
|
||||
|
||||
// Range check runs first, so every later check sees k >= 11
// (in particular the `k - 1` in the minimizer-size message below).
if k < 11 || k > 31 {
|
||||
eprintln!("error: --kmer-size must be in [11, 31] (got {k})");
|
||||
std::process::exit(1);
|
||||
}
|
||||
// Even k is rejected; the message gives the reason (palindromic k-mers).
if k % 2 == 0 {
|
||||
eprintln!("error: --kmer-size must be odd (got {k}); even k allows palindromic k-mers");
|
||||
std::process::exit(1);
|
||||
}
|
||||
// m must be at least 3 and strictly smaller than k.
if m < 3 || m >= k {
|
||||
eprintln!("error: --minimizer-size must be in [3, k−1] = [3, {}] (got {m})", k - 1);
|
||||
std::process::exit(1);
|
||||
}
|
||||
// Parity check on m, mirroring the one on k.
if m % 2 == 0 {
|
||||
eprintln!("error: --minimizer-size must be odd (got {m})");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn seqfile_paths(&self) -> obiread::PathIter {
|
||||
let paths: Vec<PathBuf> = if self.inputs.is_empty() {
|
||||
vec![PathBuf::from("-")]
|
||||
|
||||
@@ -152,12 +152,23 @@ pub(crate) fn resolve_approx_params(
|
||||
}
|
||||
|
||||
pub fn run(args: IndexArgs) {
|
||||
args.common.validate();
|
||||
|
||||
let output = args.output.clone();
|
||||
let mut rep = Reporter::new();
|
||||
|
||||
// ── Resolve evidence kind ────────────────────────────────────────────────
|
||||
let evidence = if args.approx {
|
||||
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||
let k = args.common.kmer_size;
|
||||
if z as usize >= k {
|
||||
eprintln!(
|
||||
"error: Findere z={z} must be < kmer-size={k} \
|
||||
(effective kmer size k−z+1 = {} ≤ 0)",
|
||||
k as isize - z as isize + 1
|
||||
);
|
||||
std::process::exit(1);
|
||||
}
|
||||
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
|
||||
IndexMode::Approx { b, z }
|
||||
} else {
|
||||
|
||||
@@ -33,6 +33,8 @@ fn write_batch(
|
||||
// ── Entry point ───────────────────────────────────────────────────────────────
|
||||
|
||||
pub fn run(args: SuperkmerArgs) {
|
||||
args.common.validate();
|
||||
|
||||
let k = args.common.kmer_size;
|
||||
let m = args.common.minimizer_size;
|
||||
let theta = args.common.theta;
|
||||
|
||||
Reference in New Issue
Block a user