Push ooxwzorvsqvy #26
@@ -1,15 +1,15 @@
|
|||||||
//use ahash::RandomState;
|
//use ahash::RandomState;
|
||||||
|
use crossbeam_channel;
|
||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use obikseq::k;
|
use obikseq::k;
|
||||||
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
||||||
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use crossbeam_channel;
|
|
||||||
use std::sync::atomic::{AtomicU8, Ordering};
|
use std::sync::atomic::{AtomicU8, Ordering};
|
||||||
use xxhash_rust::xxh3::Xxh3Builder;
|
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
use tracing::{debug, info};
|
use tracing::{debug, info};
|
||||||
|
use xxhash_rust::xxh3::Xxh3Builder;
|
||||||
|
|
||||||
// ── Types ─────────────────────────────────────────────────────────────────────
|
// ── Types ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -99,7 +99,6 @@ impl Node {
|
|||||||
(self.0 >> 5) & 0b11
|
(self.0 >> 5) & 0b11
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Marks the node as visited.
|
/// Marks the node as visited.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn set_visited(&mut self) {
|
pub fn set_visited(&mut self) {
|
||||||
@@ -180,7 +179,11 @@ impl WalkState {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn reachable(&self, graph: &GraphDeBruijn) -> bool {
|
pub fn reachable(&self, graph: &GraphDeBruijn) -> bool {
|
||||||
WalkState { kmer: self.kmer, node: self.node, direct: !self.direct }
|
WalkState {
|
||||||
|
kmer: self.kmer,
|
||||||
|
node: self.node,
|
||||||
|
direct: !self.direct,
|
||||||
|
}
|
||||||
.leavable(graph)
|
.leavable(graph)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -197,8 +200,19 @@ impl WalkState {
|
|||||||
if next_node.is_visited() {
|
if next_node.is_visited() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let reachable = if dnext { next_node.can_extend_left() } else { next_node.can_extend_right() };
|
let reachable = if dnext {
|
||||||
reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, nuc))
|
next_node.can_extend_left()
|
||||||
|
} else {
|
||||||
|
next_node.can_extend_right()
|
||||||
|
};
|
||||||
|
reachable.then_some((
|
||||||
|
WalkState {
|
||||||
|
kmer: cnext,
|
||||||
|
node: next_node,
|
||||||
|
direct: dnext,
|
||||||
|
},
|
||||||
|
nuc,
|
||||||
|
))
|
||||||
} else {
|
} else {
|
||||||
if !self.node.can_extend_left() {
|
if !self.node.can_extend_left() {
|
||||||
return None;
|
return None;
|
||||||
@@ -211,8 +225,19 @@ impl WalkState {
|
|||||||
if next_node.is_visited() {
|
if next_node.is_visited() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
let reachable = if dnext { next_node.can_extend_right() } else { next_node.can_extend_left() };
|
let reachable = if dnext {
|
||||||
reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, 3 - nuc))
|
next_node.can_extend_right()
|
||||||
|
} else {
|
||||||
|
next_node.can_extend_left()
|
||||||
|
};
|
||||||
|
reachable.then_some((
|
||||||
|
WalkState {
|
||||||
|
kmer: cnext,
|
||||||
|
node: next_node,
|
||||||
|
direct: dnext,
|
||||||
|
},
|
||||||
|
3 - nuc,
|
||||||
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -275,7 +300,11 @@ impl GraphDeBruijn {
|
|||||||
node.set_left(lc, ln);
|
node.set_left(lc, ln);
|
||||||
atomic.store(node.0, Ordering::Relaxed);
|
atomic.store(node.0, Ordering::Relaxed);
|
||||||
});
|
});
|
||||||
debug!("[compute_degrees] pass 1 (degrees): {:?} — {} nodes", t1.elapsed(), self.nodes.len());
|
debug!(
|
||||||
|
"[compute_degrees] pass 1 (degrees): {:?} — {} nodes",
|
||||||
|
t1.elapsed(),
|
||||||
|
self.nodes.len()
|
||||||
|
);
|
||||||
|
|
||||||
// Pass 2: mark start nodes
|
// Pass 2: mark start nodes
|
||||||
|
|
||||||
@@ -290,7 +319,11 @@ impl GraphDeBruijn {
|
|||||||
atomic.store(node.0, Ordering::Relaxed);
|
atomic.store(node.0, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
debug!("[compute_degrees] pass 2 (starts): {:?} — {} nodes", t2.elapsed(), self.nodes.len());
|
debug!(
|
||||||
|
"[compute_degrees] pass 2 (starts): {:?} — {} nodes",
|
||||||
|
t2.elapsed(),
|
||||||
|
self.nodes.len()
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
|
pub fn is_visited(&self, kmer: &CanonicalKmer) -> Option<bool> {
|
||||||
@@ -328,14 +361,28 @@ impl GraphDeBruijn {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn unitig_nucleotides(&self, kmer: CanonicalKmer, k: usize) -> Option<UnitigNucIter<'_>> {
|
fn unitig_nucleotides(&self, kmer: CanonicalKmer, k: usize) -> Option<UnitigNucIter<'_>> {
|
||||||
let old = self.nodes.get(&kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
let old = self
|
||||||
if old & IS_VISITED_MASK != 0 { return None; }
|
.nodes
|
||||||
|
.get(&kmer)?
|
||||||
|
.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
||||||
|
if old & IS_VISITED_MASK != 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
let start = WalkState::new(kmer, Node(old), true);
|
let start = WalkState::new(kmer, Node(old), true);
|
||||||
let next_step = start.walk(self).and_then(|(next_state, nuc)| {
|
let next_step = start.walk(self).and_then(|(next_state, nuc)| {
|
||||||
let ext_old = self.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
let ext_old = self
|
||||||
|
.nodes
|
||||||
|
.get(&next_state.kmer)?
|
||||||
|
.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
||||||
(ext_old & IS_VISITED_MASK == 0).then_some((next_state, nuc))
|
(ext_old & IS_VISITED_MASK == 0).then_some((next_state, nuc))
|
||||||
});
|
});
|
||||||
Some(UnitigNucIter { graph: self, start: kmer, pos: 0, k, next_step })
|
Some(UnitigNucIter {
|
||||||
|
graph: self,
|
||||||
|
start: kmer,
|
||||||
|
pos: 0,
|
||||||
|
k,
|
||||||
|
next_step,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn for_each_unitig(&self, f: impl Fn(UnitigNucIter<'_>) + Sync) {
|
pub fn for_each_unitig(&self, f: impl Fn(UnitigNucIter<'_>) + Sync) {
|
||||||
@@ -352,7 +399,9 @@ impl GraphDeBruijn {
|
|||||||
self.nodes
|
self.nodes
|
||||||
.par_iter()
|
.par_iter()
|
||||||
.filter_map(|(&kmer, atomic)| {
|
.filter_map(|(&kmer, atomic)| {
|
||||||
Node(atomic.load(Ordering::Acquire)).is_start().then_some(kmer)
|
Node(atomic.load(Ordering::Acquire))
|
||||||
|
.is_start()
|
||||||
|
.then_some(kmer)
|
||||||
})
|
})
|
||||||
.for_each(|kmer| {
|
.for_each(|kmer| {
|
||||||
if let Some(iter) = self.unitig_nucleotides(kmer, k) {
|
if let Some(iter) = self.unitig_nucleotides(kmer, k) {
|
||||||
@@ -403,7 +452,7 @@ impl GraphDeBruijn {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
info!(
|
debug!(
|
||||||
chains = n_chains.load(Ordering::Relaxed),
|
chains = n_chains.load(Ordering::Relaxed),
|
||||||
phase2 = n2.load(Ordering::Relaxed),
|
phase2 = n2.load(Ordering::Relaxed),
|
||||||
total = n_chains.load(Ordering::Relaxed) + n2.load(Ordering::Relaxed),
|
total = n_chains.load(Ordering::Relaxed) + n2.load(Ordering::Relaxed),
|
||||||
@@ -508,7 +557,11 @@ impl Iterator for UnitigNucIter<'_> {
|
|||||||
Some(nuc)
|
Some(nuc)
|
||||||
} else if let Some((state, nuc)) = self.next_step.take() {
|
} else if let Some((state, nuc)) = self.next_step.take() {
|
||||||
self.next_step = state.walk(self.graph).and_then(|(next_state, next_nuc)| {
|
self.next_step = state.walk(self.graph).and_then(|(next_state, next_nuc)| {
|
||||||
let old = self.graph.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
let old = self
|
||||||
|
.graph
|
||||||
|
.nodes
|
||||||
|
.get(&next_state.kmer)?
|
||||||
|
.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
|
||||||
(old & IS_VISITED_MASK == 0).then_some((next_state, next_nuc))
|
(old & IS_VISITED_MASK == 0).then_some((next_state, next_nuc))
|
||||||
});
|
});
|
||||||
Some(nuc)
|
Some(nuc)
|
||||||
@@ -539,10 +592,16 @@ fn count_neighbors(
|
|||||||
}
|
}
|
||||||
nuc = i as u8;
|
nuc = i as u8;
|
||||||
count += 1;
|
count += 1;
|
||||||
if count >= 2 { return (2, None); }
|
if count >= 2 {
|
||||||
|
return (2, None);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if count == 1 { (1, Some(nuc)) } else { (0, None) }
|
}
|
||||||
|
if count == 1 {
|
||||||
|
(1, Some(nuc))
|
||||||
|
} else {
|
||||||
|
(0, None)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
+120
-38
@@ -65,39 +65,65 @@ impl KmerIndex {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ── Log source characteristics and choose base ────────────────────────
|
// ── Log source characteristics and choose base ────────────────────────
|
||||||
let mode_str = if mode == MergeMode::Presence { "presence" } else { "count" };
|
let mode_str = if mode == MergeMode::Presence {
|
||||||
|
"presence"
|
||||||
|
} else {
|
||||||
|
"count"
|
||||||
|
};
|
||||||
info!(
|
info!(
|
||||||
"merge: {} source(s), smer-size={}, mode={}",
|
"merge: {} source(s), smer-size={}, mode={}",
|
||||||
sources.len(), sources[0].kmer_size(), mode_str,
|
sources.len(),
|
||||||
|
sources[0].kmer_size(),
|
||||||
|
mode_str,
|
||||||
);
|
);
|
||||||
for (i, src) in sources.iter().enumerate() {
|
for (i, src) in sources.iter().enumerate() {
|
||||||
let genome_str = if src.meta.genomes.len() == 1 { "mono-genome".to_string() }
|
let genome_str = if src.meta.genomes.len() == 1 {
|
||||||
else { format!("{} genomes", src.meta.genomes.len()) };
|
"mono-genome".to_string()
|
||||||
let trivial_str = if is_trivial(src, mode) { " [trivial: no data approximation]" } else { "" };
|
} else {
|
||||||
|
format!("{} genomes", src.meta.genomes.len())
|
||||||
|
};
|
||||||
|
let trivial_str = if is_trivial(src, mode) {
|
||||||
|
" [trivial: no data approximation]"
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
};
|
||||||
info!(
|
info!(
|
||||||
" [{}] {} — {}, {}, {}{}",
|
" [{}] {} — {}, {}, {}{}",
|
||||||
i, src.root_path.display(),
|
i,
|
||||||
|
src.root_path.display(),
|
||||||
format_evidence(&src.meta.config.evidence),
|
format_evidence(&src.meta.config.evidence),
|
||||||
genome_str, mode_str, trivial_str,
|
genome_str,
|
||||||
|
mode_str,
|
||||||
|
trivial_str,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let base_idx = choose_base(sources, mode);
|
let base_idx = choose_base(sources, mode);
|
||||||
let needs_approx = sources.iter().any(|src| {
|
let needs_approx = sources.iter().any(|src| {
|
||||||
!is_trivial(src, mode)
|
!is_trivial(src, mode)
|
||||||
&& matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
&& matches!(
|
||||||
|
src.meta.config.evidence,
|
||||||
|
IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
|
||||||
|
)
|
||||||
});
|
});
|
||||||
info!(
|
info!(
|
||||||
"output evidence: {} ({}base: [{}] {})",
|
"output evidence: {} ({}base: [{}] {})",
|
||||||
format_evidence(&sources[base_idx].meta.config.evidence),
|
format_evidence(&sources[base_idx].meta.config.evidence),
|
||||||
if needs_approx { "forced approx — " } else { "" },
|
if needs_approx {
|
||||||
base_idx, sources[base_idx].root_path.display(),
|
"forced approx — "
|
||||||
|
} else {
|
||||||
|
""
|
||||||
|
},
|
||||||
|
base_idx,
|
||||||
|
sources[base_idx].root_path.display(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
|
let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
|
||||||
ordered.push(sources[base_idx]);
|
ordered.push(sources[base_idx]);
|
||||||
for (i, &src) in sources.iter().enumerate() {
|
for (i, &src) in sources.iter().enumerate() {
|
||||||
if i != base_idx { ordered.push(src); }
|
if i != base_idx {
|
||||||
|
ordered.push(src);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
let sources: &[&KmerIndex] = &ordered;
|
let sources: &[&KmerIndex] = &ordered;
|
||||||
let evidence = sources[0].meta.config.evidence.clone();
|
let evidence = sources[0].meta.config.evidence.clone();
|
||||||
@@ -151,7 +177,8 @@ impl KmerIndex {
|
|||||||
fs::remove_dir_all(&spectrums_dir)?;
|
fs::remove_dir_all(&spectrums_dir)?;
|
||||||
}
|
}
|
||||||
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
for (src, new_labels) in sources.iter().zip(&source_labels) {
|
||||||
let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
|
let old_labels: Vec<String> =
|
||||||
|
src.meta.genomes.iter().map(|g| g.label.clone()).collect();
|
||||||
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
|
copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
|
||||||
}
|
}
|
||||||
pb.finish_and_clear();
|
pb.finish_and_clear();
|
||||||
@@ -184,9 +211,12 @@ impl KmerIndex {
|
|||||||
|
|
||||||
// Per-partition unitig byte sizes across remaining sources (stat() only)
|
// Per-partition unitig byte sizes across remaining sources (stat() only)
|
||||||
let partition_sizes: Vec<u64> = (0..n_partitions)
|
let partition_sizes: Vec<u64> = (0..n_partitions)
|
||||||
.map(|i| remaining_sources.iter()
|
.map(|i| {
|
||||||
|
remaining_sources
|
||||||
|
.iter()
|
||||||
.map(|s| partition_unitig_bytes(s, i))
|
.map(|s| partition_unitig_bytes(s, i))
|
||||||
.sum())
|
.sum()
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// LFD sort: largest partition first
|
// LFD sort: largest partition first
|
||||||
@@ -201,7 +231,8 @@ impl KmerIndex {
|
|||||||
// IDs; each reports (id, g_len, duration) on a result channel.
|
// IDs; each reports (id, g_len, duration) on a result channel.
|
||||||
const SPAWN_THRESHOLD: f64 = 0.95; // spawn when >5% capacity idle
|
const SPAWN_THRESHOLD: f64 = 0.95; // spawn when >5% capacity idle
|
||||||
let n_cores = std::thread::available_parallelism()
|
let n_cores = std::thread::available_parallelism()
|
||||||
.map(|n| n.get()).unwrap_or(1);
|
.map(|n| n.get())
|
||||||
|
.unwrap_or(1);
|
||||||
let max_workers = (n_cores / 2).max(1);
|
let max_workers = (n_cores / 2).max(1);
|
||||||
let _ = budget_fraction; // kept in signature for CLI compatibility
|
let _ = budget_fraction; // kept in signature for CLI compatibility
|
||||||
|
|
||||||
@@ -220,6 +251,9 @@ impl KmerIndex {
|
|||||||
let mut part_stats: Vec<PartStat> = Vec::with_capacity(n_partitions);
|
let mut part_stats: Vec<PartStat> = Vec::with_capacity(n_partitions);
|
||||||
let mut n_workers = 0usize;
|
let mut n_workers = 0usize;
|
||||||
let mut cpu_sample = CpuSample::now();
|
let mut cpu_sample = CpuSample::now();
|
||||||
|
// Efficiency measured just before each spawn, used to assess
|
||||||
|
// whether the previous worker delivered its expected marginal gain.
|
||||||
|
let mut efficiency_at_last_spawn = 0.0f64;
|
||||||
|
|
||||||
// Shadow as references so closures can capture them by copy.
|
// Shadow as references so closures can capture them by copy.
|
||||||
let srcs = &srcs;
|
let srcs = &srcs;
|
||||||
@@ -237,7 +271,12 @@ impl KmerIndex {
|
|||||||
for i in &prx {
|
for i in &prx {
|
||||||
let t = Instant::now();
|
let t = Instant::now();
|
||||||
let r = dst_partition.merge_partition(
|
let r = dst_partition.merge_partition(
|
||||||
i, srcs, mode, n_dst_genomes, block_bits, evidence,
|
i,
|
||||||
|
srcs,
|
||||||
|
mode,
|
||||||
|
n_dst_genomes,
|
||||||
|
block_bits,
|
||||||
|
evidence,
|
||||||
);
|
);
|
||||||
rtx.send((i, r, t.elapsed())).ok();
|
rtx.send((i, r, t.elapsed())).ok();
|
||||||
}
|
}
|
||||||
@@ -252,26 +291,51 @@ impl KmerIndex {
|
|||||||
|
|
||||||
let mut completed = 0usize;
|
let mut completed = 0usize;
|
||||||
while completed < n_partitions {
|
while completed < n_partitions {
|
||||||
let (i, r, dur) = result_rx.recv()
|
let (i, r, dur) = result_rx.recv().map_err(|_| {
|
||||||
.map_err(|_| OKIError::Io(io::Error::new(
|
OKIError::Io(io::Error::new(
|
||||||
io::ErrorKind::UnexpectedEof, "worker channel closed")))?;
|
io::ErrorKind::UnexpectedEof,
|
||||||
|
"worker channel closed",
|
||||||
|
))
|
||||||
|
})?;
|
||||||
let g_len = r.map_err(OKIError::Partition)?;
|
let g_len = r.map_err(OKIError::Partition)?;
|
||||||
pb.inc(1);
|
pb.inc(1);
|
||||||
debug!("partition {i}: done in {:.1}s — {} new kmers",
|
debug!(
|
||||||
dur.as_secs_f64(), g_len);
|
"partition {i}: done in {:.1}s — {} new kmers",
|
||||||
|
dur.as_secs_f64(),
|
||||||
|
g_len
|
||||||
|
);
|
||||||
part_stats.push(PartStat {
|
part_stats.push(PartStat {
|
||||||
id: i, unitig_bytes: partition_sizes[i], g_len,
|
id: i,
|
||||||
|
unitig_bytes: partition_sizes[i],
|
||||||
|
g_len,
|
||||||
});
|
});
|
||||||
completed += 1;
|
completed += 1;
|
||||||
|
|
||||||
if n_workers < max_workers && completed < n_partitions {
|
if n_workers < max_workers && completed < n_partitions {
|
||||||
let eff = cpu_sample.cpu_efficiency(n_cores);
|
let eff = cpu_sample.cpu_efficiency(n_cores);
|
||||||
if eff < SPAWN_THRESHOLD {
|
// For the first spawn use SPAWN_THRESHOLD.
|
||||||
|
// For subsequent spawns: the previous worker should
|
||||||
|
// have raised efficiency by at least a quarter of the expected
|
||||||
|
// marginal gain (1/n_workers). If not, adding another
|
||||||
|
// worker won't help.
|
||||||
|
let should_spawn = if n_workers == 1 {
|
||||||
|
eff < SPAWN_THRESHOLD
|
||||||
|
} else {
|
||||||
|
let gain = eff - efficiency_at_last_spawn;
|
||||||
|
let expected = 1.0 / n_workers as f64;
|
||||||
|
gain >= expected * 0.25
|
||||||
|
};
|
||||||
|
if should_spawn {
|
||||||
|
debug!(
|
||||||
|
"activated worker {} — efficiency {:.0}%, gain vs prev {:.0}%",
|
||||||
|
n_workers + 1,
|
||||||
|
eff * 100.0,
|
||||||
|
(eff - efficiency_at_last_spawn) * 100.0,
|
||||||
|
);
|
||||||
|
efficiency_at_last_spawn = eff;
|
||||||
activate_tx.send(()).ok();
|
activate_tx.send(()).ok();
|
||||||
n_workers += 1;
|
n_workers += 1;
|
||||||
cpu_sample = CpuSample::now();
|
cpu_sample = CpuSample::now();
|
||||||
debug!("activated worker {n_workers} — efficiency {:.0}%",
|
|
||||||
eff * 100.0);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -319,9 +383,7 @@ fn print_merge_partition_report(stats: &[PartStat], n_workers: usize, max_worker
|
|||||||
" {} partition(s) processed, {} total new kmers",
|
" {} partition(s) processed, {} total new kmers",
|
||||||
non_empty, total_new,
|
non_empty, total_new,
|
||||||
);
|
);
|
||||||
info!(
|
info!(" workers spawned: {n_workers} / {max_workers} (max)",);
|
||||||
" workers spawned: {n_workers} / {max_workers} (max)",
|
|
||||||
);
|
|
||||||
|
|
||||||
// Top 8 partitions by new-kmer count
|
// Top 8 partitions by new-kmer count
|
||||||
let mut by_new: Vec<&PartStat> = stats.iter().filter(|s| s.g_len > 0).collect();
|
let mut by_new: Vec<&PartStat> = stats.iter().filter(|s| s.g_len > 0).collect();
|
||||||
@@ -343,10 +405,15 @@ fn print_merge_partition_report(stats: &[PartStat], n_workers: usize, max_worker
|
|||||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn fmt_bytes(b: u64) -> String {
|
fn fmt_bytes(b: u64) -> String {
|
||||||
if b >= 1 << 30 { format!("{:.1} GB", b as f64 / (1u64 << 30) as f64) }
|
if b >= 1 << 30 {
|
||||||
else if b >= 1 << 20 { format!("{:.1} MB", b as f64 / (1u64 << 20) as f64) }
|
format!("{:.1} GB", b as f64 / (1u64 << 30) as f64)
|
||||||
else if b >= 1 << 10 { format!("{:.1} KB", b as f64 / (1u64 << 10) as f64) }
|
} else if b >= 1 << 20 {
|
||||||
else { format!("{b} B") }
|
format!("{:.1} MB", b as f64 / (1u64 << 20) as f64)
|
||||||
|
} else if b >= 1 << 10 {
|
||||||
|
format!("{:.1} KB", b as f64 / (1u64 << 10) as f64)
|
||||||
|
} else {
|
||||||
|
format!("{b} B")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sum of all unitigs.bin sizes across all layers of partition `i` in `src`.
|
/// Sum of all unitigs.bin sizes across all layers of partition `i` in `src`.
|
||||||
@@ -354,8 +421,12 @@ fn partition_unitig_bytes(src: &KmerIndex, i: usize) -> u64 {
|
|||||||
let mut total = 0u64;
|
let mut total = 0u64;
|
||||||
for l in 0.. {
|
for l in 0.. {
|
||||||
let p = src.layer_unitigs_path(i, l);
|
let p = src.layer_unitigs_path(i, l);
|
||||||
if !p.exists() { break; }
|
if !p.exists() {
|
||||||
if let Ok(m) = std::fs::metadata(&p) { total += m.len(); }
|
break;
|
||||||
|
}
|
||||||
|
if let Ok(m) = std::fs::metadata(&p) {
|
||||||
|
total += m.len();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
total
|
total
|
||||||
}
|
}
|
||||||
@@ -382,7 +453,10 @@ fn compute_labels(
|
|||||||
};
|
};
|
||||||
*count += 1;
|
*count += 1;
|
||||||
labels.push(new_label.clone());
|
labels.push(new_label.clone());
|
||||||
all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
|
all_genomes.push(GenomeInfo {
|
||||||
|
label: new_label,
|
||||||
|
meta: genome.meta.clone(),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
source_labels.push(labels);
|
source_labels.push(labels);
|
||||||
}
|
}
|
||||||
@@ -443,13 +517,21 @@ fn index_unitig_size(src: &KmerIndex) -> u64 {
|
|||||||
fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
|
fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
|
||||||
let needs_approx = sources.iter().any(|src| {
|
let needs_approx = sources.iter().any(|src| {
|
||||||
!is_trivial(src, mode)
|
!is_trivial(src, mode)
|
||||||
&& matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
&& matches!(
|
||||||
|
src.meta.config.evidence,
|
||||||
|
IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
|
||||||
|
)
|
||||||
});
|
});
|
||||||
|
|
||||||
sources.iter().enumerate()
|
sources
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
.filter(|(_, src)| {
|
.filter(|(_, src)| {
|
||||||
!needs_approx
|
!needs_approx
|
||||||
|| matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
|
|| matches!(
|
||||||
|
src.meta.config.evidence,
|
||||||
|
IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
|
||||||
|
)
|
||||||
})
|
})
|
||||||
.max_by_key(|(_, src)| index_unitig_size(src))
|
.max_by_key(|(_, src)| index_unitig_size(src))
|
||||||
.map(|(i, _)| i)
|
.map(|(i, _)| i)
|
||||||
|
|||||||
Reference in New Issue
Block a user