Push qkpyqurltlpk #1
@@ -9,20 +9,6 @@
|
|||||||
|
|
||||||
## commandes à ajouter
|
## commandes à ajouter
|
||||||
|
|
||||||
- merge : pour construire un index à partir d'index existants
|
|
||||||
- deux modes : count et presence/absence. count exige que tous les index mergés soient déjà en mode count. mode presence/absence par défaut. Si passage de mode count à mode presence/absence, par défaut presence = count >= 1. Possibilité de spécifier un seuil personnalisé.
|
|
||||||
- le merge doit se faire en parallèle sur chaque partition
|
|
||||||
- en entrée : une liste de chemins vers les index à fusionner
|
|
||||||
- en sortie : un nouvel index fusionné (option -o <output_index>)
|
|
||||||
- j'imagine comme algo:
|
|
||||||
- on copie le premier index dans le nouvel index
|
|
||||||
- on ajoute à chaque partition une matrice de count ou de presence s'il n'y en avait pas déjà.
|
|
||||||
- si besoin, on crée la colonne 0 de la matrice de count ou de presence pour le genome courant
|
|
||||||
- on parcourt les partitions et les index à fusionner en parallèle
|
|
||||||
- pour chaque partition, on ajoute les kmer présents dans les index à fusionner au nouvel index
|
|
||||||
- si le kmer est déjà présent dans le nouvel index on ajoute le compte ou la présence du kmer dans la matrice de count ou de presence
|
|
||||||
- sinon, on ajoute le kmer dans une nouvelle layer
|
|
||||||
|
|
||||||
- filter : produit un nouvel index filtré à partir d'un index existant en vérifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
|
- filter : produit un nouvel index filtré à partir d'un index existant en vérifiant que les kmer présents dans le nouvel index respectent les critères de filtrage spécifiés
|
||||||
- quorum de présence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être définie par un seuil personnalisé minimum et maximum
|
- quorum de présence en fraction-(min/max) du nombre de génomes, en nombre-(min/max) de génomes, si mode count la présence peut être définie par un seuil personnalisé minimum et maximum
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,9 @@ use std::collections::HashMap;
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use indicatif::{ProgressBar, ProgressStyle};
|
||||||
use obisys::{Reporter, Stage};
|
use obisys::{Reporter, Stage};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
@@ -31,6 +33,7 @@ impl KmerIndex {
|
|||||||
mode: MergeMode,
|
mode: MergeMode,
|
||||||
force: bool,
|
force: bool,
|
||||||
rename_duplicates: bool,
|
rename_duplicates: bool,
|
||||||
|
rep: &mut Reporter,
|
||||||
) -> OKIResult<Self> {
|
) -> OKIResult<Self> {
|
||||||
let output = output.as_ref();
|
let output = output.as_ref();
|
||||||
|
|
||||||
@@ -74,7 +77,14 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ── Bootstrap: copy first source to output ────────────────────────────
|
// ── Bootstrap: copy first source to output ────────────────────────────
|
||||||
info!("copying {} → {}", sources[0].root_path.display(), output.display());
|
info!(
|
||||||
|
"bootstrap: copying {} → {} ({} genome(s))",
|
||||||
|
sources[0].root_path.display(),
|
||||||
|
output.display(),
|
||||||
|
sources[0].meta.genomes.len(),
|
||||||
|
);
|
||||||
|
let t = Stage::start("bootstrap");
|
||||||
|
let pb = spinner("bootstrap — copying index …");
|
||||||
copy_dir_all(&sources[0].root_path, output)?;
|
copy_dir_all(&sources[0].root_path, output)?;
|
||||||
|
|
||||||
// Rewrite index.meta with final genome labels and the effective mode.
|
// Rewrite index.meta with final genome labels and the effective mode.
|
||||||
@@ -88,9 +98,14 @@ impl KmerIndex {
|
|||||||
if mode == MergeMode::Presence {
|
if mode == MergeMode::Presence {
|
||||||
remove_dirs_named(output, "counts")?;
|
remove_dirs_named(output, "counts")?;
|
||||||
}
|
}
|
||||||
|
pb.finish_and_clear();
|
||||||
|
rep.push(t.stop());
|
||||||
|
|
||||||
// Rebuild spectrums/ from all sources using the (possibly renamed) labels.
|
// Rebuild spectrums/ from all sources using the (possibly renamed) labels.
|
||||||
// Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
|
// Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
|
||||||
|
info!("rebuilding spectrums for {} source(s)", sources.len());
|
||||||
|
let t = Stage::start("spectrums");
|
||||||
|
let pb = spinner("spectrums — copying …");
|
||||||
let spectrums_dir = output.join("spectrums");
|
let spectrums_dir = output.join("spectrums");
|
||||||
if spectrums_dir.exists() {
|
if spectrums_dir.exists() {
|
||||||
fs::remove_dir_all(&spectrums_dir)?;
|
fs::remove_dir_all(&spectrums_dir)?;
|
||||||
@@ -98,6 +113,8 @@ impl KmerIndex {
|
|||||||
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
||||||
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
|
copy_spectrums(&src.root_path, output, &src.meta.genomes, new_labels)?;
|
||||||
}
|
}
|
||||||
|
pb.finish_and_clear();
|
||||||
|
rep.push(t.stop());
|
||||||
|
|
||||||
// Open the destination index.
|
// Open the destination index.
|
||||||
let dst = KmerIndex::open(output)?;
|
let dst = KmerIndex::open(output)?;
|
||||||
@@ -107,8 +124,13 @@ impl KmerIndex {
|
|||||||
// ── Merge each subsequent source partition-by-partition ───────────────
|
// ── Merge each subsequent source partition-by-partition ───────────────
|
||||||
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
|
let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
|
||||||
if !remaining_sources.is_empty() {
|
if !remaining_sources.is_empty() {
|
||||||
let mut rep = Reporter::new();
|
let n_src_genomes: usize = remaining_sources.iter().map(|s| s.meta.genomes.len()).sum();
|
||||||
|
info!(
|
||||||
|
"merging {} partition(s) × {} additional source genome(s) into {} destination genome(s)",
|
||||||
|
n_partitions, n_src_genomes, n_dst_genomes,
|
||||||
|
);
|
||||||
let t = Stage::start("merge_partitions");
|
let t = Stage::start("merge_partitions");
|
||||||
|
let pb = partition_bar(n_partitions as u64);
|
||||||
|
|
||||||
let dst_partition = &dst.partition;
|
let dst_partition = &dst.partition;
|
||||||
|
|
||||||
@@ -117,10 +139,13 @@ impl KmerIndex {
|
|||||||
.filter_map(|i| {
|
.filter_map(|i| {
|
||||||
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
|
||||||
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
|
||||||
dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err()
|
let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes).err();
|
||||||
|
pb.inc(1);
|
||||||
|
result
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
pb.finish_and_clear();
|
||||||
if let Some(e) = errors.into_iter().next() {
|
if let Some(e) = errors.into_iter().next() {
|
||||||
return Err(OKIError::Partition(e));
|
return Err(OKIError::Partition(e));
|
||||||
}
|
}
|
||||||
@@ -206,6 +231,31 @@ fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds an indeterminate spinner progress bar labelled with `msg`.
///
/// The bar renders the spinner glyph, the message and the elapsed time, and
/// is returned already ticking; the caller is responsible for finishing it.
///
/// # Panics
///
/// Panics through `unwrap` if the template string is rejected by
/// `ProgressStyle::with_template`.
fn spinner(msg: &'static str) -> ProgressBar {
|
||||||
|
let pb = ProgressBar::new_spinner();
|
||||||
|
pb.set_style(
|
||||||
|
ProgressStyle::with_template("{spinner} {msg} {elapsed}")
|
||||||
|
.unwrap()
|
||||||
|
// Braille frames used for the `{spinner}` placeholder.
.tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
|
||||||
|
);
|
||||||
|
pb.set_message(msg);
|
||||||
|
// Request a steady tick with a 100 ms interval instead of ticking manually.
pb.enable_steady_tick(Duration::from_millis(100));
|
||||||
|
pb
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds a bounded progress bar of length `n` for the partition merge.
///
/// The template shows a spinner, a 40-column cyan/blue bar, the
/// `position/length` counter labelled "partitions", and the elapsed time.
/// The bar is returned already ticking; the caller advances and finishes it.
///
/// # Panics
///
/// Panics through `unwrap` if the template string is rejected by
/// `ProgressStyle::with_template`.
fn partition_bar(n: u64) -> ProgressBar {
|
||||||
|
let pb = ProgressBar::new(n);
|
||||||
|
pb.set_style(
|
||||||
|
ProgressStyle::with_template(
|
||||||
|
"{spinner} merge — {bar:40.cyan/blue} {pos}/{len} partitions {elapsed}",
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
// Braille frames used for the `{spinner}` placeholder.
.tick_strings(&["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]),
|
||||||
|
);
|
||||||
|
// Request a steady tick with a 100 ms interval, independent of `inc` calls.
pb.enable_steady_tick(Duration::from_millis(100));
|
||||||
|
pb
|
||||||
|
}
|
||||||
|
|
||||||
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
fn copy_dir_all(src: &Path, dst: &Path) -> io::Result<()> {
|
||||||
fs::create_dir_all(dst)?;
|
fs::create_dir_all(dst)?;
|
||||||
for entry in fs::read_dir(src)? {
|
for entry in fs::read_dir(src)? {
|
||||||
|
|||||||
@@ -55,9 +55,15 @@ pub fn run(args: MergeArgs) {
|
|||||||
|
|
||||||
set_k(sources[0].kmer_size());
|
set_k(sources[0].kmer_size());
|
||||||
set_m(sources[0].minimizer_size());
|
set_m(sources[0].minimizer_size());
|
||||||
info!("merging {} indexes → {}", sources.len(), args.output.display());
|
|
||||||
let rep = Reporter::new();
|
let n_genomes: usize = sources.iter().map(|s| s.meta().genomes.len()).sum();
|
||||||
KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates).unwrap_or_else(|e| {
|
info!(
|
||||||
|
"merging {} index(es), {} genome(s) total → {}",
|
||||||
|
sources.len(), n_genomes, args.output.display()
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut rep = Reporter::new();
|
||||||
|
KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates, &mut rep).unwrap_or_else(|e| {
|
||||||
eprintln!("error merging: {e}");
|
eprintln!("error merging: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ impl MphfLayer {
|
|||||||
#[inline]
|
#[inline]
|
||||||
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||||
let slot = self.mphf.index(&kmer.raw());
|
let slot = self.mphf.index(&kmer.raw());
|
||||||
|
// PtrHash guarantees slot < n only for its key set; arbitrary queries may exceed bounds.
|
||||||
|
if slot >= self.n { return None; }
|
||||||
let (chunk_id, rank) = self.evidence.decode(slot);
|
let (chunk_id, rank) = self.evidence.decode(slot);
|
||||||
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||||
Some(slot)
|
Some(slot)
|
||||||
|
|||||||
Reference in New Issue
Block a user