feat: introduce genome metadata tracking and CSV export #2
Generated
+28
@@ -601,6 +601,27 @@ dependencies = [
|
|||||||
"typenum",
|
"typenum",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv"
|
||||||
|
version = "1.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
|
||||||
|
dependencies = [
|
||||||
|
"csv-core",
|
||||||
|
"itoa",
|
||||||
|
"ryu",
|
||||||
|
"serde_core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv-core"
|
||||||
|
version = "0.1.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cvt"
|
name = "cvt"
|
||||||
version = "0.1.2"
|
version = "0.1.2"
|
||||||
@@ -1500,6 +1521,7 @@ name = "obikmer"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
|
"csv",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
"kodama",
|
"kodama",
|
||||||
"obifastwrite",
|
"obifastwrite",
|
||||||
@@ -2192,6 +2214,12 @@ version = "1.0.22"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ryu"
|
||||||
|
version = "1.0.23"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "same-file"
|
name = "same-file"
|
||||||
version = "1.0.6"
|
version = "1.0.6"
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
write!(out, "kmer")?;
|
write!(out, "kmer")?;
|
||||||
for g in genomes {
|
for g in genomes {
|
||||||
write!(out, ",{g}")?;
|
write!(out, ",{}", g.label)?;
|
||||||
}
|
}
|
||||||
writeln!(out)?;
|
writeln!(out)?;
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ use rayon::prelude::*;
|
|||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::meta::{IndexConfig, IndexMeta};
|
use crate::meta::{GenomeInfo, IndexConfig, IndexMeta};
|
||||||
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
use crate::state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||||
|
|
||||||
pub struct KmerIndex {
|
pub struct KmerIndex {
|
||||||
@@ -23,13 +23,12 @@ pub struct KmerIndex {
|
|||||||
impl KmerIndex {
|
impl KmerIndex {
|
||||||
/// Create a new index at `path`.
|
/// Create a new index at `path`.
|
||||||
///
|
///
|
||||||
/// If `genome_label` is `Some`, it is stored immediately.
|
/// If `genome_info` is `Some`, it is stored immediately.
|
||||||
/// If `None`, the label will be derived from the first scatter input path
|
/// If `None`, the genome entry will be added when `mark_scattered` is called.
|
||||||
/// when `mark_scattered` is called.
|
|
||||||
pub fn create<P: AsRef<Path>>(
|
pub fn create<P: AsRef<Path>>(
|
||||||
path: P,
|
path: P,
|
||||||
config: IndexConfig,
|
config: IndexConfig,
|
||||||
genome_label: Option<String>,
|
genome_info: Option<GenomeInfo>,
|
||||||
force: bool,
|
force: bool,
|
||||||
) -> OKIResult<Self> {
|
) -> OKIResult<Self> {
|
||||||
let root_path = path.as_ref().to_owned();
|
let root_path = path.as_ref().to_owned();
|
||||||
@@ -41,8 +40,8 @@ impl KmerIndex {
|
|||||||
force,
|
force,
|
||||||
)?;
|
)?;
|
||||||
let mut meta = IndexMeta::new(config);
|
let mut meta = IndexMeta::new(config);
|
||||||
if let Some(label) = genome_label {
|
if let Some(info) = genome_info {
|
||||||
meta.genomes.push(label);
|
meta.genomes.push(info);
|
||||||
}
|
}
|
||||||
meta.write(&root_path)?;
|
meta.write(&root_path)?;
|
||||||
Ok(Self { root_path, meta, partition })
|
Ok(Self { root_path, meta, partition })
|
||||||
@@ -71,6 +70,7 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn meta(&self) -> &IndexMeta { &self.meta }
|
pub fn meta(&self) -> &IndexMeta { &self.meta }
|
||||||
|
pub fn meta_mut(&mut self) -> &mut IndexMeta { &mut self.meta }
|
||||||
pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size }
|
pub fn kmer_size(&self) -> usize { self.meta.config.kmer_size }
|
||||||
pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size }
|
pub fn minimizer_size(&self) -> usize { self.meta.config.minimizer_size }
|
||||||
pub fn n_partitions(&self) -> usize { self.partition.n_partitions() }
|
pub fn n_partitions(&self) -> usize { self.partition.n_partitions() }
|
||||||
@@ -88,7 +88,7 @@ impl KmerIndex {
|
|||||||
pub fn mark_scattered(&mut self) -> OKIResult<()> {
|
pub fn mark_scattered(&mut self) -> OKIResult<()> {
|
||||||
if self.meta.genomes.is_empty() {
|
if self.meta.genomes.is_empty() {
|
||||||
let label = label_from_path(&self.root_path);
|
let label = label_from_path(&self.root_path);
|
||||||
self.meta.genomes.push(label);
|
self.meta.genomes.push(GenomeInfo::new(label));
|
||||||
self.meta.write(&self.root_path)?;
|
self.meta.write(&self.root_path)?;
|
||||||
}
|
}
|
||||||
touch(&self.root_path.join(SENTINEL_SCATTERED))?;
|
touch(&self.root_path.join(SENTINEL_SCATTERED))?;
|
||||||
@@ -114,7 +114,7 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
|
fn write_spectrum(&self, sp: &KmerSpectrum) -> OKIResult<()> {
|
||||||
let label = self.meta.genomes.first().map(String::as_str).unwrap_or("unknown");
|
let label = self.meta.genomes.first().map(|g| g.label.as_str()).unwrap_or("unknown");
|
||||||
let spectrums_dir = self.root_path.join("spectrums");
|
let spectrums_dir = self.root_path.join("spectrums");
|
||||||
fs::create_dir_all(&spectrums_dir)?;
|
fs::create_dir_all(&spectrums_dir)?;
|
||||||
let path = spectrums_dir.join(format!("{label}.json"));
|
let path = spectrums_dir.join(format!("{label}.json"));
|
||||||
|
|||||||
@@ -11,5 +11,5 @@ pub use error::{OKIError, OKIResult};
|
|||||||
pub use distance::{DistanceMetric, DistanceOutput};
|
pub use distance::{DistanceMetric, DistanceOutput};
|
||||||
pub use index::KmerIndex;
|
pub use index::KmerIndex;
|
||||||
pub use merge::MergeMode;
|
pub use merge::MergeMode;
|
||||||
pub use meta::{IndexConfig, IndexMeta, META_FILENAME};
|
pub use meta::{GenomeInfo, IndexConfig, IndexMeta, META_FILENAME};
|
||||||
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
pub use state::{IndexState, SENTINEL_COUNTED, SENTINEL_INDEXED, SENTINEL_SCATTERED};
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ use tracing::info;
|
|||||||
|
|
||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::index::KmerIndex;
|
use crate::index::KmerIndex;
|
||||||
use crate::meta::IndexMeta;
|
use crate::meta::{GenomeInfo, IndexMeta};
|
||||||
use crate::state::IndexState;
|
use crate::state::IndexState;
|
||||||
|
|
||||||
pub use obikpartitionner::MergeMode;
|
pub use obikpartitionner::MergeMode;
|
||||||
@@ -111,7 +111,8 @@ impl KmerIndex {
|
|||||||
fs::remove_dir_all(&spectrums_dir)?;
|
fs::remove_dir_all(&spectrums_dir)?;
|
||||||
}
|
}
|
||||||
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
||||||
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
|
let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
|
||||||
|
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
|
||||||
}
|
}
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
@@ -169,14 +170,15 @@ impl KmerIndex {
|
|||||||
fn compute_labels(
|
fn compute_labels(
|
||||||
sources: &[&KmerIndex],
|
sources: &[&KmerIndex],
|
||||||
rename_duplicates: bool,
|
rename_duplicates: bool,
|
||||||
) -> OKIResult<(Vec<Vec<String>>, Vec<String>)> {
|
) -> OKIResult<(Vec<Vec<String>>, Vec<GenomeInfo>)> {
|
||||||
let mut seen: HashMap<String, usize> = HashMap::new();
|
let mut seen: HashMap<String, usize> = HashMap::new();
|
||||||
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
|
let mut source_labels: Vec<Vec<String>> = Vec::with_capacity(sources.len());
|
||||||
let mut all_genomes: Vec<String> = Vec::new();
|
let mut all_genomes: Vec<GenomeInfo> = Vec::new();
|
||||||
|
|
||||||
for src in sources {
|
for src in sources {
|
||||||
let mut labels = Vec::with_capacity(src.meta.genomes.len());
|
let mut labels = Vec::with_capacity(src.meta.genomes.len());
|
||||||
for label in &src.meta.genomes {
|
for genome in &src.meta.genomes {
|
||||||
|
let label = &genome.label;
|
||||||
let count = seen.entry(label.clone()).or_insert(0);
|
let count = seen.entry(label.clone()).or_insert(0);
|
||||||
let new_label = if *count == 0 {
|
let new_label = if *count == 0 {
|
||||||
label.clone()
|
label.clone()
|
||||||
@@ -187,7 +189,7 @@ fn compute_labels(
|
|||||||
};
|
};
|
||||||
*count += 1;
|
*count += 1;
|
||||||
labels.push(new_label.clone());
|
labels.push(new_label.clone());
|
||||||
all_genomes.push(new_label);
|
all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
|
||||||
}
|
}
|
||||||
source_labels.push(labels);
|
source_labels.push(labels);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
@@ -7,6 +8,20 @@ use serde::{Deserialize, Serialize};
|
|||||||
pub const META_FILENAME: &str = "index.meta";
|
pub const META_FILENAME: &str = "index.meta";
|
||||||
const META_VERSION: u32 = 1;
|
const META_VERSION: u32 = 1;
|
||||||
|
|
||||||
|
/// Per-genome label + categorical metadata.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct GenomeInfo {
|
||||||
|
pub label: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub meta: HashMap<String, String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GenomeInfo {
|
||||||
|
pub fn new(label: impl Into<String>) -> Self {
|
||||||
|
Self { label: label.into(), meta: HashMap::new() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct IndexConfig {
|
pub struct IndexConfig {
|
||||||
pub kmer_size: usize,
|
pub kmer_size: usize,
|
||||||
@@ -19,9 +34,8 @@ pub struct IndexConfig {
|
|||||||
pub struct IndexMeta {
|
pub struct IndexMeta {
|
||||||
pub version: u32,
|
pub version: u32,
|
||||||
pub config: IndexConfig,
|
pub config: IndexConfig,
|
||||||
/// Ordered list of genome labels indexed here.
|
/// Ordered list of genomes indexed here (label + optional categorical metadata).
|
||||||
/// Element 0 is the initial genome; subsequent entries come from merges.
|
pub genomes: Vec<GenomeInfo>,
|
||||||
pub genomes: Vec<String>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IndexMeta {
|
impl IndexMeta {
|
||||||
@@ -42,4 +56,9 @@ impl IndexMeta {
|
|||||||
pub fn exists(root: &Path) -> bool {
|
pub fn exists(root: &Path) -> bool {
|
||||||
root.join(META_FILENAME).exists()
|
root.join(META_FILENAME).exists()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Iterate over genome labels only.
|
||||||
|
pub fn genome_labels(&self) -> impl Iterator<Item = &str> {
|
||||||
|
self.genomes.iter().map(|g| g.label.as_str())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ obiskio = { path = "../obiskio" }
|
|||||||
obikindex = { path = "../obikindex" }
|
obikindex = { path = "../obikindex" }
|
||||||
clap = { version = "4", features = ["derive"] }
|
clap = { version = "4", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
|
csv = "1"
|
||||||
kodama = "0.2"
|
kodama = "0.2"
|
||||||
speedytree = "0.1"
|
speedytree = "0.1"
|
||||||
rayon = "1"
|
rayon = "1"
|
||||||
|
|||||||
@@ -0,0 +1,174 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
use std::io::{self, BufWriter, Write};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use clap::Args;
|
||||||
|
use obikindex::KmerIndex;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct AnnotateArgs {
|
||||||
|
/// Index directory to annotate (modified in-place)
|
||||||
|
pub index: PathBuf,
|
||||||
|
|
||||||
|
/// CSV file with genome metadata (must contain an id column)
|
||||||
|
#[arg(long)]
|
||||||
|
pub csv: Option<PathBuf>,
|
||||||
|
|
||||||
|
/// CSV field separator
|
||||||
|
#[arg(long, default_value = ",")]
|
||||||
|
pub sep: char,
|
||||||
|
|
||||||
|
/// Name of the column that contains genome labels
|
||||||
|
#[arg(long, default_value = "id")]
|
||||||
|
pub id_col: String,
|
||||||
|
|
||||||
|
/// Value that means "delete / absent" (removes existing key if present)
|
||||||
|
#[arg(long, default_value = "NA")]
|
||||||
|
pub na_value: String,
|
||||||
|
|
||||||
|
/// Do not overwrite existing metadata keys
|
||||||
|
#[arg(long)]
|
||||||
|
pub no_overwrite: bool,
|
||||||
|
|
||||||
|
/// Dump all genome metadata as CSV (stdout) instead of reading a CSV
|
||||||
|
#[arg(long)]
|
||||||
|
pub dump: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: AnnotateArgs) {
|
||||||
|
if args.dump {
|
||||||
|
run_dump(&args);
|
||||||
|
} else {
|
||||||
|
run_annotate(&args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_dump(args: &AnnotateArgs) {
|
||||||
|
let idx = open_index(&args.index);
|
||||||
|
|
||||||
|
let genomes = &idx.meta().genomes;
|
||||||
|
|
||||||
|
// Collect all keys in stable order (sorted for determinism)
|
||||||
|
let mut key_set: HashSet<String> = HashSet::new();
|
||||||
|
for g in genomes {
|
||||||
|
for k in g.meta.keys() {
|
||||||
|
key_set.insert(k.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut keys: Vec<String> = key_set.into_iter().collect();
|
||||||
|
keys.sort();
|
||||||
|
|
||||||
|
let stdout = io::stdout();
|
||||||
|
let mut out = BufWriter::new(stdout.lock());
|
||||||
|
|
||||||
|
// Header
|
||||||
|
write!(out, "id").unwrap();
|
||||||
|
for k in &keys {
|
||||||
|
write!(out, "{}{k}", args.sep).unwrap();
|
||||||
|
}
|
||||||
|
writeln!(out).unwrap();
|
||||||
|
|
||||||
|
// Rows
|
||||||
|
for g in genomes {
|
||||||
|
write!(out, "{}", g.label).unwrap();
|
||||||
|
for k in &keys {
|
||||||
|
let v = g.meta.get(k).map(|s| s.as_str()).unwrap_or("NA");
|
||||||
|
write!(out, "{}{v}", args.sep).unwrap();
|
||||||
|
}
|
||||||
|
writeln!(out).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_annotate(args: &AnnotateArgs) {
|
||||||
|
let csv_path = match &args.csv {
|
||||||
|
Some(p) => p.clone(),
|
||||||
|
None => {
|
||||||
|
eprintln!("error: --csv is required unless --dump is used");
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut idx = open_index(&args.index);
|
||||||
|
|
||||||
|
// Build a label → genome index position map
|
||||||
|
let label_to_pos: std::collections::HashMap<String, usize> = idx
|
||||||
|
.meta()
|
||||||
|
.genomes
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, g)| (g.label.clone(), i))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let sep = args.sep as u8;
|
||||||
|
let mut rdr = csv::ReaderBuilder::new()
|
||||||
|
.delimiter(sep)
|
||||||
|
.from_path(&csv_path)
|
||||||
|
.unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening {}: {e}", csv_path.display());
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
let headers = rdr.headers().unwrap_or_else(|e| {
|
||||||
|
eprintln!("error reading CSV headers: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
}).clone();
|
||||||
|
|
||||||
|
let id_col_idx = headers.iter().position(|h| h == args.id_col).unwrap_or_else(|| {
|
||||||
|
eprintln!("error: id column '{}' not found in CSV", args.id_col);
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
let meta_cols: Vec<(usize, String)> = headers
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(i, _)| *i != id_col_idx)
|
||||||
|
.map(|(i, h)| (i, h.to_string()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut updated = 0usize;
|
||||||
|
let mut skipped = 0usize;
|
||||||
|
|
||||||
|
for result in rdr.records() {
|
||||||
|
let record = result.unwrap_or_else(|e| {
|
||||||
|
eprintln!("error reading CSV record: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
let label = record.get(id_col_idx).unwrap_or("").to_string();
|
||||||
|
let pos = match label_to_pos.get(&label) {
|
||||||
|
Some(&p) => p,
|
||||||
|
None => {
|
||||||
|
skipped += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let genome = &mut idx.meta_mut().genomes[pos];
|
||||||
|
for (col_idx, key) in &meta_cols {
|
||||||
|
let val = record.get(*col_idx).unwrap_or("");
|
||||||
|
if val == args.na_value {
|
||||||
|
genome.meta.remove(key);
|
||||||
|
} else if args.no_overwrite && genome.meta.contains_key(key) {
|
||||||
|
// skip
|
||||||
|
} else {
|
||||||
|
genome.meta.insert(key.clone(), val.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
updated += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
idx.meta_mut().write(&args.index).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error writing index metadata: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
info!("annotated {updated} genome(s), skipped {skipped} CSV row(s) with unknown label");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn open_index(path: &PathBuf) -> KmerIndex {
|
||||||
|
KmerIndex::open(path).unwrap_or_else(|e| {
|
||||||
|
eprintln!("error opening index: {e}");
|
||||||
|
std::process::exit(1);
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -75,8 +75,8 @@ pub fn run(args: DistanceArgs) {
|
|||||||
set_k(idx.kmer_size());
|
set_k(idx.kmer_size());
|
||||||
set_m(idx.minimizer_size());
|
set_m(idx.minimizer_size());
|
||||||
|
|
||||||
let genomes = idx.meta().genomes.clone();
|
let labels: Vec<String> = idx.meta().genomes.iter().map(|g| g.label.clone()).collect();
|
||||||
let n = genomes.len();
|
let n = labels.len();
|
||||||
info!(
|
info!(
|
||||||
"computing {:?} distances for {} genome(s)",
|
"computing {:?} distances for {} genome(s)",
|
||||||
args.metric, n
|
args.metric, n
|
||||||
@@ -93,9 +93,9 @@ pub fn run(args: DistanceArgs) {
|
|||||||
// ── Distance matrix → CSV ─────────────────────────────────────────────────
|
// ── Distance matrix → CSV ─────────────────────────────────────────────────
|
||||||
let write_dist_csv = |w: &mut dyn Write| {
|
let write_dist_csv = |w: &mut dyn Write| {
|
||||||
write!(w, "genome").unwrap();
|
write!(w, "genome").unwrap();
|
||||||
for g in &genomes { write!(w, ",{g}").unwrap(); }
|
for g in &labels { write!(w, ",{g}").unwrap(); }
|
||||||
writeln!(w).unwrap();
|
writeln!(w).unwrap();
|
||||||
for (i, g) in genomes.iter().enumerate() {
|
for (i, g) in labels.iter().enumerate() {
|
||||||
write!(w, "{g}").unwrap();
|
write!(w, "{g}").unwrap();
|
||||||
for j in 0..n {
|
for j in 0..n {
|
||||||
write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap();
|
write!(w, ",{:.6}", result.matrix[[i, j]]).unwrap();
|
||||||
@@ -132,9 +132,9 @@ pub fn run(args: DistanceArgs) {
|
|||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
}));
|
}));
|
||||||
write!(f, "genome").unwrap();
|
write!(f, "genome").unwrap();
|
||||||
for g in &genomes { write!(f, ",{g}").unwrap(); }
|
for g in &labels { write!(f, ",{g}").unwrap(); }
|
||||||
writeln!(f).unwrap();
|
writeln!(f).unwrap();
|
||||||
for (i, g) in genomes.iter().enumerate() {
|
for (i, g) in labels.iter().enumerate() {
|
||||||
write!(f, "{g}").unwrap();
|
write!(f, "{g}").unwrap();
|
||||||
for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); }
|
for j in 0..n { write!(f, ",{}", shared[[i, j]]).unwrap(); }
|
||||||
writeln!(f).unwrap();
|
writeln!(f).unwrap();
|
||||||
@@ -148,7 +148,7 @@ pub fn run(args: DistanceArgs) {
|
|||||||
let rows: Vec<Vec<f64>> = (0..n)
|
let rows: Vec<Vec<f64>> = (0..n)
|
||||||
.map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect())
|
.map(|i| (0..n).map(|j| result.matrix[[i, j]]).collect())
|
||||||
.collect();
|
.collect();
|
||||||
let dm = DistanceMatrix::build(rows, genomes.clone()).unwrap_or_else(|e| {
|
let dm = DistanceMatrix::build(rows, labels.clone()).unwrap_or_else(|e| {
|
||||||
eprintln!("error building distance matrix for NJ: {e}");
|
eprintln!("error building distance matrix for NJ: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
@@ -176,7 +176,7 @@ pub fn run(args: DistanceArgs) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
let dendro = linkage(&mut condensed, n, Method::Average);
|
let dendro = linkage(&mut condensed, n, Method::Average);
|
||||||
let newick = upgma_to_newick(&dendro, &genomes);
|
let newick = upgma_to_newick(&dendro, &labels);
|
||||||
let path = args.output.as_ref()
|
let path = args.output.as_ref()
|
||||||
.map(|p| format!("{}_upgma.nwk", p.display()))
|
.map(|p| format!("{}_upgma.nwk", p.display()))
|
||||||
.unwrap_or_else(|| "upgma.nwk".into());
|
.unwrap_or_else(|| "upgma.nwk".into());
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::{IndexConfig, IndexState, KmerIndex};
|
use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
|
||||||
|
|
||||||
|
fn parse_key_value(s: &str) -> Result<(String, String), String> {
|
||||||
|
let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?;
|
||||||
|
Ok((s[..pos].to_string(), s[pos + 1..].to_string()))
|
||||||
|
}
|
||||||
use obikseq::{set_k, set_m};
|
use obikseq::{set_k, set_m};
|
||||||
use obisys::Reporter;
|
use obisys::Reporter;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
@@ -23,6 +28,10 @@ pub struct IndexArgs {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
pub label: Option<String>,
|
pub label: Option<String>,
|
||||||
|
|
||||||
|
/// Genome categorical metadata as key=value pairs (repeatable)
|
||||||
|
#[arg(long = "meta", value_parser = parse_key_value)]
|
||||||
|
pub meta: Vec<(String, String)>,
|
||||||
|
|
||||||
/// Minimum kmer abundance (inclusive)
|
/// Minimum kmer abundance (inclusive)
|
||||||
#[arg(long, default_value_t = 1)]
|
#[arg(long, default_value_t = 1)]
|
||||||
pub min_abundance: u32,
|
pub min_abundance: u32,
|
||||||
@@ -73,7 +82,14 @@ pub fn run(args: IndexArgs) {
|
|||||||
n_bits,
|
n_bits,
|
||||||
with_counts: args.with_counts,
|
with_counts: args.with_counts,
|
||||||
};
|
};
|
||||||
KmerIndex::create(&output, config, args.label.clone(), false).unwrap_or_else(|e| {
|
let genome_info = args.label.as_ref().map(|label| {
|
||||||
|
let mut info = GenomeInfo::new(label.clone());
|
||||||
|
for (k, v) in &args.meta {
|
||||||
|
info.meta.insert(k.clone(), v.clone());
|
||||||
|
}
|
||||||
|
info
|
||||||
|
});
|
||||||
|
KmerIndex::create(&output, config, genome_info, false).unwrap_or_else(|e| {
|
||||||
eprintln!("error creating index: {e}");
|
eprintln!("error creating index: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod annotate;
|
||||||
pub mod distance;
|
pub mod distance;
|
||||||
pub mod dump;
|
pub mod dump;
|
||||||
pub mod index;
|
pub mod index;
|
||||||
|
|||||||
@@ -266,8 +266,8 @@ fn emit_batch(
|
|||||||
ann.insert("kmer_missing".into(), acc.kmer_missing.into());
|
ann.insert("kmer_missing".into(), acc.kmer_missing.into());
|
||||||
}
|
}
|
||||||
let mut match_map = serde_json::Map::new();
|
let mut match_map = serde_json::Map::new();
|
||||||
for (g, label) in meta.genomes.iter().enumerate() {
|
for (g, genome) in meta.genomes.iter().enumerate() {
|
||||||
match_map.insert(label.clone(), acc.genome_totals[g].into());
|
match_map.insert(genome.label.clone(), acc.genome_totals[g].into());
|
||||||
}
|
}
|
||||||
ann.insert("kmer_strict_matches".into(), match_map.into());
|
ann.insert("kmer_strict_matches".into(), match_map.into());
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,8 @@ enum Commands {
|
|||||||
Query(cmd::query::QueryArgs),
|
Query(cmd::query::QueryArgs),
|
||||||
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
|
/// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
|
||||||
Dump(cmd::dump::DumpArgs),
|
Dump(cmd::dump::DumpArgs),
|
||||||
|
/// Add or update genome metadata from a CSV file; or dump metadata as CSV
|
||||||
|
Annotate(cmd::annotate::AnnotateArgs),
|
||||||
/// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
|
/// Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees
|
||||||
Distance(cmd::distance::DistanceArgs),
|
Distance(cmd::distance::DistanceArgs),
|
||||||
/// Dump unitigs from a built index to stdout (debug)
|
/// Dump unitigs from a built index to stdout (debug)
|
||||||
@@ -57,6 +59,7 @@ fn main() {
|
|||||||
Commands::Dump(args) => cmd::dump::run(args),
|
Commands::Dump(args) => cmd::dump::run(args),
|
||||||
Commands::Rebuild(args) => cmd::rebuild::run(args),
|
Commands::Rebuild(args) => cmd::rebuild::run(args),
|
||||||
Commands::Query(args) => cmd::query::run(args),
|
Commands::Query(args) => cmd::query::run(args),
|
||||||
|
Commands::Annotate(args) => cmd::annotate::run(args),
|
||||||
Commands::Distance(args) => cmd::distance::run(args),
|
Commands::Distance(args) => cmd::distance::run(args),
|
||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user