Push lrwmyplxxzkn #19

Merged
coissac merged 4 commits from push-lrwmyplxxzkn into main 2026-06-09 08:28:21 +00:00
7 changed files with 105 additions and 29 deletions
Showing only changes of commit d626d42ec7 - Show all commits
+35 -2
View File
@@ -96,8 +96,16 @@ For each genome:
| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`rebuild` only) | | `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`rebuild` only) |
| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) | | `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) |
Defaults: mins = 0 (no lower bound), max counts = group size, max fracs = 1.0 **Conditional defaults** — the default for `--min-frac` and `--max-outgroup-count` depends on whether the corresponding group was explicitly declared:
(no upper bound). A filter with all defaults is a no-op.
| Situation | `--min-frac` default | `--max-outgroup-count` default |
|-----------|----------------------|-------------------------------|
| Neither `--ingroup` nor `--outgroup` | 0.0 (no-op) | no constraint (no-op) |
| `--ingroup` only | **1.0** — all ingroup genomes must carry the k-mer | no constraint |
| `--outgroup` only | 0.0 | **0** — no outgroup genome may carry the k-mer |
| Both declared | **1.0** | **0** |
Explicit flags always override these defaults. All other bounds (`--min-count`, `--max-count`, `--max-frac`, `--min-outgroup-*`) default to 0 / group size / 1.0 regardless of whether groups are declared.
Fractions are computed over the size of the classified group, not over total Fractions are computed over the size of the classified group, not over total
genome count. An empty group (no genome classified as ingroup/outgroup) never genome count. An empty group (no genome classified as ingroup/outgroup) never
@@ -169,6 +177,31 @@ obikmer unitig myindex \
--max-outgroup-count 0 --max-outgroup-count 0
``` ```
## Command-specific options
### `dump --head N`
Stops output after the first N k-mers that pass all active filters.
Iteration terminates immediately — subsequent partitions and layers are not scanned.
Useful for quick inspection of large indexes without loading the entire dataset.
```sh
obikmer dump myindex --head 100
obikmer dump myindex --head 20 --ingroup "species=Betula_nana" --min-count 1
```
### `distance --presence-threshold N`
When computing Jaccard distance on a **count index**, a k-mer is considered present in a genome if its count is ≥ N (default 1).
This option is independent of the `--presence-threshold` used in `rebuild`/`query` filtering.
```sh
# Jaccard treating kmers with count ≥ 2 as present
obikmer distance myindex --metric jaccard --presence-threshold 2
```
This parameter has no effect on presence/absence indexes (where values are already 0/1) or on metrics other than Jaccard.
## Implementation ## Implementation
- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter` - **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter`
+2 -2
View File
@@ -11,9 +11,9 @@
| `merge` | Merge multiple built indexes into one | | `merge` | Merge multiple built indexes into one |
| `rebuild` | Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata | | `rebuild` | Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata |
| `query` | Query an index with sequences and annotate matches | | `query` | Query an index with sequences and annotate matches |
| `dump` | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild` | | `dump` | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild`; `--head N` limits output to the first N k-mers |
| `annotate` | Add or update genome metadata from a CSV file; or dump metadata as CSV | | `annotate` | Add or update genome metadata from a CSV file; or dump metadata as CSV |
| `distance` | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees | | `distance` | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees; `--presence-threshold N` sets the minimum count to consider a k-mer present when computing Jaccard on count indexes (default 1) |
| `unitig` | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as `rebuild` | | `unitig` | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as `rebuild` |
| `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing | | `estimate` | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing |
| `reindex` | Convert an index's evidence in-place: exact ↔ approx | | `reindex` | Convert an index's evidence in-place: exact ↔ approx |
+12 -4
View File
@@ -20,6 +20,7 @@ impl KmerIndex {
out: &mut W, out: &mut W,
force_presence: bool, force_presence: bool,
debug: bool, debug: bool,
head: Option<usize>,
filters: &[Box<dyn KmerFilter>], filters: &[Box<dyn KmerFilter>],
) -> OKIResult<()> { ) -> OKIResult<()> {
let genomes = &self.meta.genomes; let genomes = &self.meta.genomes;
@@ -39,8 +40,10 @@ impl KmerIndex {
// ── Rows ────────────────────────────────────────────────────────────── // ── Rows ──────────────────────────────────────────────────────────────
let n = self.n_partitions(); let n = self.n_partitions();
let mut remaining = head.unwrap_or(usize::MAX);
for i in 0..n { for i in 0..n {
if debug { if remaining == 0 { break; }
let cont = if debug {
self.partition self.partition
.iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| { .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
let seq = String::from_utf8(kmer.to_ascii()) let seq = String::from_utf8(kmer.to_ascii())
@@ -48,8 +51,10 @@ impl KmerIndex {
let _ = write!(out, "{part},{layer},{seq}"); let _ = write!(out, "{part},{layer},{seq}");
for &v in row.iter() { let _ = write!(out, ",{v}"); } for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out); let _ = writeln!(out);
remaining -= 1;
remaining > 0
}) })
.map_err(OKIError::Partition)?; .map_err(OKIError::Partition)?
} else { } else {
self.partition self.partition
.iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| { .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
@@ -58,9 +63,12 @@ impl KmerIndex {
let _ = write!(out, "{seq}"); let _ = write!(out, "{seq}");
for &v in row.iter() { let _ = write!(out, ",{v}"); } for &v in row.iter() { let _ = write!(out, ",{v}"); }
let _ = writeln!(out); let _ = writeln!(out);
remaining -= 1;
remaining > 0
}) })
.map_err(OKIError::Partition)?; .map_err(OKIError::Partition)?
} };
if !cont { break; }
} }
out.flush()?; out.flush()?;
+5 -1
View File
@@ -20,6 +20,10 @@ pub struct DumpArgs {
#[arg(long, default_value_t = false)] #[arg(long, default_value_t = false)]
pub debug: bool, pub debug: bool,
/// Only output the first N kmers
#[arg(long)]
pub head: Option<usize>,
#[command(flatten)] #[command(flatten)]
pub filter: FilterArgs, pub filter: FilterArgs,
} }
@@ -41,7 +45,7 @@ pub fn run(args: DumpArgs) {
let stdout = io::stdout(); let stdout = io::stdout();
let mut out = BufWriter::new(stdout.lock()); let mut out = BufWriter::new(stdout.lock());
idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| { idx.dump(&mut out, args.force_presence, args.debug, args.head, &filters).unwrap_or_else(|e| {
eprintln!("dump error: {e}"); eprintln!("dump error: {e}");
std::process::exit(1); std::process::exit(1);
}); });
+7 -2
View File
@@ -162,6 +162,7 @@ pub struct FilterArgs {
pub max_count: Option<usize>, pub max_count: Option<usize>,
/// Minimum fraction of ingroup genomes containing the k-mer [0.01.0] /// Minimum fraction of ingroup genomes containing the k-mer [0.01.0]
/// (default 1.0 when --ingroup is set, 0.0 otherwise)
#[arg(long)] #[arg(long)]
pub min_frac: Option<f64>, pub min_frac: Option<f64>,
@@ -174,6 +175,7 @@ pub struct FilterArgs {
pub min_outgroup_count: Option<usize>, pub min_outgroup_count: Option<usize>,
/// Maximum number of outgroup genomes containing the k-mer /// Maximum number of outgroup genomes containing the k-mer
/// (default 0 when --outgroup is set, no constraint otherwise)
#[arg(long)] #[arg(long)]
pub max_outgroup_count: Option<usize>, pub max_outgroup_count: Option<usize>,
@@ -258,16 +260,19 @@ pub fn build_group_filter(
let in_size = ingroup_idx.len(); let in_size = ingroup_idx.len();
let out_size = outgroup_idx.len(); let out_size = outgroup_idx.len();
let default_min_frac = if !ingroup_preds.is_empty() { 1.0 } else { 0.0 };
let default_max_outgroup_count = if !outgroup_preds.is_empty() { 0 } else { out_size };
GroupQuorumFilter { GroupQuorumFilter {
ingroup_idx, ingroup_idx,
outgroup_idx, outgroup_idx,
threshold: p.threshold, threshold: p.threshold,
min_count: p.min_count.unwrap_or(0), min_count: p.min_count.unwrap_or(0),
max_count: p.max_count.unwrap_or(in_size), max_count: p.max_count.unwrap_or(in_size),
min_frac: p.min_frac.unwrap_or(0.0), min_frac: p.min_frac.unwrap_or(default_min_frac),
max_frac: p.max_frac.unwrap_or(1.0), max_frac: p.max_frac.unwrap_or(1.0),
min_outgroup_count: p.min_outgroup_count.unwrap_or(0), min_outgroup_count: p.min_outgroup_count.unwrap_or(0),
max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size), max_outgroup_count: p.max_outgroup_count.unwrap_or(default_max_outgroup_count),
min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0), min_outgroup_frac: p.min_outgroup_frac.unwrap_or(0.0),
max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0), max_outgroup_frac: p.max_outgroup_frac.unwrap_or(1.0),
} }
+1
View File
@@ -48,6 +48,7 @@ pub fn run(args: UnitigArgs) {
partition partition
.iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| { .iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| {
local_g.push(kmer); local_g.push(kmer);
true
}) })
.unwrap_or_else(|e| { .unwrap_or_else(|e| {
eprintln!("error reading partition {i}: {e}"); eprintln!("error reading partition {i}: {e}");
+43 -18
View File
@@ -26,17 +26,19 @@ impl KmerPartition {
/// If no data matrix exists for a layer (pure set-membership, single genome), /// If no data matrix exists for a layer (pure set-membership, single genome),
/// a row of `n_genomes` ones is emitted for every kmer in that layer — unless /// a row of `n_genomes` ones is emitted for every kmer in that layer — unless
/// the filter rejects it, in which case the whole layer is skipped. /// the filter rejects it, in which case the whole layer is skipped.
/// Like [`iter_partition_kmers`] but the callback returns `false` to stop early.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers( pub fn iter_partition_kmers(
&self, &self,
part: usize, part: usize,
use_counts: bool, use_counts: bool,
n_genomes: usize, n_genomes: usize,
filters: &[Box<dyn KmerFilter>], filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<()> { ) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR); let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() { if !index_dir.exists() {
return Ok(()); return Ok(true);
} }
let index_mode = PartitionMeta::load(&index_dir) let index_mode = PartitionMeta::load(&index_dir)
@@ -54,56 +56,68 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts"); let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence"); let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() { let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) { if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot); let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) { if passes_all(filters, &row, n_genomes) {
cb(kmer, row); cont = cb(kmer, row);
if !cont { break; }
} }
} }
} }
cont
} else if !use_counts && presence_dir.exists() { } else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?; let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) { if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) { if passes_all(filters, &row, n_genomes) {
cb(kmer, row); cont = cb(kmer, row);
if !cont { break; }
} }
} }
} }
cont
} else { } else {
// No data matrix: implicit presence — all values are 1. // No data matrix: implicit presence — all values are 1.
// The filter result is identical for every kmer, so evaluate once. // The filter result is identical for every kmer, so evaluate once.
let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) { if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() { if mphf.find(kmer).is_some() {
cb(kmer, all_present.clone()); cont = cb(kmer, all_present.clone());
if !cont { break; }
} }
} }
} }
} cont
};
if !cont { return Ok(false); }
} }
Ok(()) Ok(true)
} }
/// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)` /// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)`
/// indices, enabling debug output that identifies where each kmer was stored. /// indices, enabling debug output that identifies where each kmer was stored.
/// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
pub fn iter_partition_kmers_located( pub fn iter_partition_kmers_located(
&self, &self,
part: usize, part: usize,
use_counts: bool, use_counts: bool,
n_genomes: usize, n_genomes: usize,
filters: &[Box<dyn KmerFilter>], filters: &[Box<dyn KmerFilter>],
mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>), mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool,
) -> SKResult<()> { ) -> SKResult<bool> {
let index_dir = self.part_dir(part).join(INDEX_SUBDIR); let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
if !index_dir.exists() { if !index_dir.exists() {
return Ok(()); return Ok(true);
} }
let index_mode = PartitionMeta::load(&index_dir) let index_mode = PartitionMeta::load(&index_dir)
@@ -120,39 +134,50 @@ impl KmerPartition {
let counts_dir = layer_dir.join("counts"); let counts_dir = layer_dir.join("counts");
let presence_dir = layer_dir.join("presence"); let presence_dir = layer_dir.join("presence");
if use_counts && counts_dir.exists() { let cont = if use_counts && counts_dir.exists() {
let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) { if let Some(slot) = mphf.find(kmer) {
let row = mat.row(slot); let row = mat.row(slot);
if passes_all(filters, &row, n_genomes) { if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row); cont = cb(part, layer, kmer, row);
if !cont { break; }
} }
} }
} }
cont
} else if !use_counts && presence_dir.exists() { } else if !use_counts && presence_dir.exists() {
let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?; let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
let mut cont = true;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if let Some(slot) = mphf.find(kmer) { if let Some(slot) = mphf.find(kmer) {
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
if passes_all(filters, &row, n_genomes) { if passes_all(filters, &row, n_genomes) {
cb(part, layer, kmer, row); cont = cb(part, layer, kmer, row);
if !cont { break; }
} }
} }
} }
cont
} else { } else {
let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
let mut cont = true;
if passes_all(filters, &all_present, n_genomes) { if passes_all(filters, &all_present, n_genomes) {
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
if mphf.find(kmer).is_some() { if mphf.find(kmer).is_some() {
cb(part, layer, kmer, all_present.clone()); cont = cb(part, layer, kmer, all_present.clone());
if !cont { break; }
} }
} }
} }
} cont
};
if !cont { return Ok(false); }
layer += 1; layer += 1;
} }
Ok(()) Ok(true)
} }
} }