diff --git a/src/obikindex/src/dump.rs b/src/obikindex/src/dump.rs new file mode 100644 index 0000000..31e5ed3 --- /dev/null +++ b/src/obikindex/src/dump.rs @@ -0,0 +1,50 @@ +use std::io::Write; + +use crate::error::{OKIError, OKIResult}; +use crate::index::KmerIndex; + +impl KmerIndex { + /// Write a CSV table of all indexed kmers to `out`. + /// + /// Columns: `kmer`, then one column per genome (in index order). + /// Values are counts (u32) when `use_counts = true`, otherwise 0/1. + /// + /// `force_presence` overrides `with_counts`: even if the index stores counts, + /// the output uses 0/1 presence columns. + /// + /// The caller must have set the global kmer length (`obikseq::set_k`) before + /// calling this method. + pub fn dump(&self, out: &mut W, force_presence: bool) -> OKIResult<()> { + + let genomes = &self.meta.genomes; + let use_counts = self.meta.config.with_counts && !force_presence; + let n_genomes = genomes.len().max(1); + + // ── Header ──────────────────────────────────────────────────────────── + write!(out, "kmer")?; + for g in genomes { + write!(out, ",{g}")?; + } + writeln!(out)?; + + // ── Rows ────────────────────────────────────────────────────────────── + let n = self.n_partitions(); + for i in 0..n { + self.partition + .iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| { + let seq = String::from_utf8(kmer.to_ascii()) + .unwrap_or_else(|_| "?".repeat(self.kmer_size())); + // write is infallible inside a closure — propagate via a flag if needed + let _ = write!(out, "{seq}"); + for &v in row.iter() { + let _ = write!(out, ",{v}"); + } + let _ = writeln!(out); + }) + .map_err(OKIError::Partition)?; + } + + out.flush()?; + Ok(()) + } +} diff --git a/src/obikindex/src/lib.rs b/src/obikindex/src/lib.rs index 4dcf4ff..c4a66f5 100644 --- a/src/obikindex/src/lib.rs +++ b/src/obikindex/src/lib.rs @@ -1,6 +1,7 @@ pub mod error; pub mod meta; pub mod state; +mod dump; mod index; mod merge; diff --git a/src/obikmer/src/cmd/dump.rs b/src/obikmer/src/cmd/dump.rs new file mode 100644 index 0000000..8775699 --- /dev/null +++ b/src/obikmer/src/cmd/dump.rs @@ -0,0 +1,39 @@ +use std::io::{self, BufWriter}; +use std::path::PathBuf; + +use clap::Args; +use obikindex::KmerIndex; +use obikseq::set_k; +use tracing::info; + +#[derive(Args)] +pub struct DumpArgs { + /// Index directory to dump + pub index: PathBuf, + + /// Output presence/absence (0/1) even if the index stores counts + #[arg(long, default_value_t = false)] + pub force_presence: bool, +} + +pub fn run(args: DumpArgs) { + let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| { + eprintln!("error opening index: {e}"); + std::process::exit(1); + }); + + set_k(idx.kmer_size()); + info!( + "dumping {} partitions, {} genome(s)", + idx.n_partitions(), + idx.meta().genomes.len() + ); + + let stdout = io::stdout(); + let mut out = BufWriter::new(stdout.lock()); + + idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| { + eprintln!("dump error: {e}"); + std::process::exit(1); + }); +} diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index aaccdbf..85d8603 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,3 +1,4 @@ +pub mod dump; pub mod index; pub mod merge; pub mod superkmer; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index 52c5953..1a1f8bc 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -20,6 +20,8 @@ enum Commands { Index(cmd::index::IndexArgs), /// Merge multiple built indexes into one Merge(cmd::merge::MergeArgs), + /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence) + Dump(cmd::dump::DumpArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), } @@ -46,6 +48,7 @@ fn main() { Commands::Superkmer(args) => cmd::superkmer::run(args), Commands::Index(args) => cmd::index::run(args), Commands::Merge(args) => cmd::merge::run(args), + Commands::Dump(args) => cmd::dump::run(args), Commands::Unitig(args) => cmd::unitig::run(args), } diff --git a/src/obikpartitionner/src/dump_layer.rs b/src/obikpartitionner/src/dump_layer.rs new file mode 100644 index 0000000..71bcc93 --- /dev/null +++ b/src/obikpartitionner/src/dump_layer.rs @@ -0,0 +1,77 @@ +use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix}; +use obikseq::CanonicalKmer; +use obiskio::{SKError, SKResult, UnitigFileReader}; +use obilayeredmap::OLMError; +use obilayeredmap::meta::PartitionMeta; +use obilayeredmap::MphfLayer; + +use crate::partition::KmerPartition; + +const INDEX_SUBDIR: &str = "index"; + +fn olm_to_sk(e: OLMError) -> SKError { + match e { + OLMError::Io(e) => SKError::Io(e), + other => SKError::InvalidData { context: "dump", detail: other.to_string() }, + } +} + +impl KmerPartition { + /// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each. + /// + /// `use_counts = true` → reads count columns (u32 values per genome). + /// `use_counts = false` → reads presence columns, converted to 0/1 u32. + /// + /// If no data matrix exists for a layer (pure set-membership, single genome), + /// a row of `n_genomes` ones is emitted for every kmer in that layer. + pub fn iter_partition_kmers( + &self, + part: usize, + use_counts: bool, + n_genomes: usize, + mut cb: impl FnMut(CanonicalKmer, Box<[u32]>), + ) -> SKResult<()> { + let index_dir = self.part_dir(part).join(INDEX_SUBDIR); + if !index_dir.exists() { + return Ok(()); + } + + let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?; + + for l in 0..meta.n_layers { + let layer_dir = index_dir.join(format!("layer_{l}")); + let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?; + let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?; + + let counts_dir = layer_dir.join("counts"); + let presence_dir = layer_dir.join("presence"); + + if use_counts && counts_dir.exists() { + let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + cb(kmer, mat.row(slot)); + } + } + } else if !use_counts && presence_dir.exists() { + let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if let Some(slot) = mphf.find(kmer) { + let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect(); + cb(kmer, row); + } + } + } else { + // No data matrix: pure set-membership layer, all kmers belong to every genome. + let all_present: Box<[u32]> = vec![1u32; n_genomes].into(); + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + if mphf.find(kmer).is_some() { + cb(kmer, all_present.clone()); + } + } + } + } + + Ok(()) + } +} diff --git a/src/obikpartitionner/src/lib.rs b/src/obikpartitionner/src/lib.rs index 672c118..3b098a2 100644 --- a/src/obikpartitionner/src/lib.rs +++ b/src/obikpartitionner/src/lib.rs @@ -1,3 +1,4 @@ +mod dump_layer; mod index_layer; mod kmer_sort; mod merge_layer;