feat: add CLI command to export indexed k-mers to CSV
This change introduces a new `dump` subcommand that exports all indexed k-mers to a CSV stream. The implementation spans multiple crates, adding core export logic to `obikindex` and partition iteration to `obikpartitionner`. The command supports a `--force-presence` flag to output binary presence/absence data instead of stored counts, and includes necessary module registrations and structural updates across the codebase.
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||
use obilayeredmap::OLMError;
|
||||
use obilayeredmap::meta::PartitionMeta;
|
||||
use obilayeredmap::MphfLayer;
|
||||
|
||||
use crate::partition::KmerPartition;
|
||||
|
||||
const INDEX_SUBDIR: &str = "index";
|
||||
|
||||
fn olm_to_sk(e: OLMError) -> SKError {
|
||||
match e {
|
||||
OLMError::Io(e) => SKError::Io(e),
|
||||
other => SKError::InvalidData { context: "dump", detail: other.to_string() },
|
||||
}
|
||||
}
|
||||
|
||||
impl KmerPartition {
|
||||
/// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each.
|
||||
///
|
||||
/// `use_counts = true` → reads count columns (u32 values per genome).
|
||||
/// `use_counts = false` → reads presence columns, converted to 0/1 u32.
|
||||
///
|
||||
/// If no data matrix exists for a layer (pure set-membership, single genome),
|
||||
/// a row of `n_genomes` ones is emitted for every kmer in that layer.
|
||||
pub fn iter_partition_kmers(
|
||||
&self,
|
||||
part: usize,
|
||||
use_counts: bool,
|
||||
n_genomes: usize,
|
||||
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
||||
) -> SKResult<()> {
|
||||
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||
if !index_dir.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
||||
|
||||
for l in 0..meta.n_layers {
|
||||
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||
let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
|
||||
let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?;
|
||||
|
||||
let counts_dir = layer_dir.join("counts");
|
||||
let presence_dir = layer_dir.join("presence");
|
||||
|
||||
if use_counts && counts_dir.exists() {
|
||||
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
cb(kmer, mat.row(slot));
|
||||
}
|
||||
}
|
||||
} else if !use_counts && presence_dir.exists() {
|
||||
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if let Some(slot) = mphf.find(kmer) {
|
||||
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||
cb(kmer, row);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No data matrix: pure set-membership layer, all kmers belong to every genome.
|
||||
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
|
||||
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||
if mphf.find(kmer).is_some() {
|
||||
cb(kmer, all_present.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
mod dump_layer;
|
||||
mod index_layer;
|
||||
mod kmer_sort;
|
||||
mod merge_layer;
|
||||
|
||||
Reference in New Issue
Block a user