Push qkpyqurltlpk #1
@@ -0,0 +1,50 @@
|
|||||||
|
use std::io::Write;
|
||||||
|
|
||||||
|
use crate::error::{OKIError, OKIResult};
|
||||||
|
use crate::index::KmerIndex;
|
||||||
|
|
||||||
|
impl KmerIndex {
|
||||||
|
/// Write a CSV table of all indexed kmers to `out`.
|
||||||
|
///
|
||||||
|
/// Columns: `kmer`, then one column per genome (in index order).
|
||||||
|
/// Values are counts (u32) when `use_counts = true`, otherwise 0/1.
|
||||||
|
///
|
||||||
|
/// `force_presence` overrides `with_counts`: even if the index stores counts,
|
||||||
|
/// the output uses 0/1 presence columns.
|
||||||
|
///
|
||||||
|
/// The caller must have set the global kmer length (`obikseq::set_k`) before
|
||||||
|
/// calling this method.
|
||||||
|
pub fn dump<W: Write>(&self, out: &mut W, force_presence: bool) -> OKIResult<()> {
|
||||||
|
|
||||||
|
let genomes = &self.meta.genomes;
|
||||||
|
let use_counts = self.meta.config.with_counts && !force_presence;
|
||||||
|
let n_genomes = genomes.len().max(1);
|
||||||
|
|
||||||
|
// ── Header ────────────────────────────────────────────────────────────
|
||||||
|
write!(out, "kmer")?;
|
||||||
|
for g in genomes {
|
||||||
|
write!(out, ",{g}")?;
|
||||||
|
}
|
||||||
|
writeln!(out)?;
|
||||||
|
|
||||||
|
// ── Rows ──────────────────────────────────────────────────────────────
|
||||||
|
let n = self.n_partitions();
|
||||||
|
for i in 0..n {
|
||||||
|
self.partition
|
||||||
|
.iter_partition_kmers(i, use_counts, n_genomes, |kmer, row| {
|
||||||
|
let seq = String::from_utf8(kmer.to_ascii())
|
||||||
|
.unwrap_or_else(|_| "?".repeat(self.kmer_size()));
|
||||||
|
// write is infallible inside a closure — propagate via a flag if needed
|
||||||
|
let _ = write!(out, "{seq}");
|
||||||
|
for &v in row.iter() {
|
||||||
|
let _ = write!(out, ",{v}");
|
||||||
|
}
|
||||||
|
let _ = writeln!(out);
|
||||||
|
})
|
||||||
|
.map_err(OKIError::Partition)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
out.flush()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod meta;
|
pub mod meta;
|
||||||
pub mod state;
|
pub mod state;
|
||||||
|
mod dump;
|
||||||
mod index;
|
mod index;
|
||||||
mod merge;
|
mod merge;
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,39 @@
|
|||||||
|
use std::io::{self, BufWriter};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use obikindex::KmerIndex;
|
||||||
|
use obikseq::set_k;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct DumpArgs {
|
||||||
|
/// Index directory to dump
|
||||||
|
pub index: PathBuf,
|
||||||
|
|
||||||
|
/// Output presence/absence (0/1) even if the index stores counts
|
||||||
|
#[arg(long, default_value_t = false)]
|
||||||
|
pub force_presence: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: DumpArgs) {
|
||||||
|
let idx = KmerIndex::open(&args.index).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
set_k(idx.kmer_size());
|
||||||
|
info!(
|
||||||
|
"dumping {} partitions, {} genome(s)",
|
||||||
|
idx.n_partitions(),
|
||||||
|
idx.meta().genomes.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
let stdout = io::stdout();
|
||||||
|
let mut out = BufWriter::new(stdout.lock());
|
||||||
|
|
||||||
|
idx.dump(&mut out, args.force_presence).unwrap_or_else(|e| {
|
||||||
|
eprintln!("dump error: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod dump;
|
||||||
pub mod index;
|
pub mod index;
|
||||||
pub mod merge;
|
pub mod merge;
|
||||||
pub mod superkmer;
|
pub mod superkmer;
|
||||||
|
|||||||
@@ -20,6 +20,8 @@ enum Commands {
|
|||||||
Index(cmd::index::IndexArgs),
|
Index(cmd::index::IndexArgs),
|
||||||
/// Merge multiple built indexes into one
|
/// Merge multiple built indexes into one
|
||||||
Merge(cmd::merge::MergeArgs),
|
Merge(cmd::merge::MergeArgs),
|
||||||
|
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
|
||||||
|
Dump(cmd::dump::DumpArgs),
|
||||||
/// Dump unitigs from a built index to stdout (debug)
|
/// Dump unitigs from a built index to stdout (debug)
|
||||||
Unitig(cmd::unitig::UnitigArgs),
|
Unitig(cmd::unitig::UnitigArgs),
|
||||||
}
|
}
|
||||||
@@ -46,6 +48,7 @@ fn main() {
|
|||||||
Commands::Superkmer(args) => cmd::superkmer::run(args),
|
Commands::Superkmer(args) => cmd::superkmer::run(args),
|
||||||
Commands::Index(args) => cmd::index::run(args),
|
Commands::Index(args) => cmd::index::run(args),
|
||||||
Commands::Merge(args) => cmd::merge::run(args),
|
Commands::Merge(args) => cmd::merge::run(args),
|
||||||
|
Commands::Dump(args) => cmd::dump::run(args),
|
||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,77 @@
|
|||||||
|
use obicompactvec::{PersistentBitMatrix, PersistentCompactIntMatrix};
|
||||||
|
use obikseq::CanonicalKmer;
|
||||||
|
use obiskio::{SKError, SKResult, UnitigFileReader};
|
||||||
|
use obilayeredmap::OLMError;
|
||||||
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
|
use obilayeredmap::MphfLayer;
|
||||||
|
|
||||||
|
use crate::partition::KmerPartition;
|
||||||
|
|
||||||
|
const INDEX_SUBDIR: &str = "index";
|
||||||
|
|
||||||
|
fn olm_to_sk(e: OLMError) -> SKError {
|
||||||
|
match e {
|
||||||
|
OLMError::Io(e) => SKError::Io(e),
|
||||||
|
other => SKError::InvalidData { context: "dump", detail: other.to_string() },
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KmerPartition {
|
||||||
|
/// Iterate all indexed kmers in partition `part`, calling `cb(kmer, row)` for each.
|
||||||
|
///
|
||||||
|
/// `use_counts = true` → reads count columns (u32 values per genome).
|
||||||
|
/// `use_counts = false` → reads presence columns, converted to 0/1 u32.
|
||||||
|
///
|
||||||
|
/// If no data matrix exists for a layer (pure set-membership, single genome),
|
||||||
|
/// a row of `n_genomes` ones is emitted for every kmer in that layer.
|
||||||
|
pub fn iter_partition_kmers(
|
||||||
|
&self,
|
||||||
|
part: usize,
|
||||||
|
use_counts: bool,
|
||||||
|
n_genomes: usize,
|
||||||
|
mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
|
||||||
|
) -> SKResult<()> {
|
||||||
|
let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
|
||||||
|
if !index_dir.exists() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
||||||
|
|
||||||
|
for l in 0..meta.n_layers {
|
||||||
|
let layer_dir = index_dir.join(format!("layer_{l}"));
|
||||||
|
let mphf = MphfLayer::open(&layer_dir).map_err(olm_to_sk)?;
|
||||||
|
let reader = UnitigFileReader::open(&layer_dir.join("unitigs.bin"))?;
|
||||||
|
|
||||||
|
let counts_dir = layer_dir.join("counts");
|
||||||
|
let presence_dir = layer_dir.join("presence");
|
||||||
|
|
||||||
|
if use_counts && counts_dir.exists() {
|
||||||
|
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
|
||||||
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
|
if let Some(slot) = mphf.find(kmer) {
|
||||||
|
cb(kmer, mat.row(slot));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if !use_counts && presence_dir.exists() {
|
||||||
|
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
|
||||||
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
|
if let Some(slot) = mphf.find(kmer) {
|
||||||
|
let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
|
||||||
|
cb(kmer, row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// No data matrix: pure set-membership layer, all kmers belong to every genome.
|
||||||
|
let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
|
||||||
|
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
|
||||||
|
if mphf.find(kmer).is_some() {
|
||||||
|
cb(kmer, all_present.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
mod dump_layer;
|
||||||
mod index_layer;
|
mod index_layer;
|
||||||
mod kmer_sort;
|
mod kmer_sort;
|
||||||
mod merge_layer;
|
mod merge_layer;
|
||||||
|
|||||||
Reference in New Issue
Block a user