Push ywwwypqxrtmy #14
Generated
+1
@@ -1659,6 +1659,7 @@ name = "obisys"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
|
"sysinfo",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|||||||
@@ -32,6 +32,14 @@ impl PersistentBitMatrix {
|
|||||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fill `buf[i]` with `col_i[slot]` as 0/1 u32, without allocating.
|
||||||
|
/// `buf` must have length ≥ `self.n_cols()`.
|
||||||
|
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
|
for (c, col) in self.cols.iter().enumerate() {
|
||||||
|
buf[c] = col.get(slot) as u32;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the number of set bits in each column as `Array1<u64>`.
|
/// Returns the number of set bits in each column as `Array1<u64>`.
|
||||||
pub fn count_ones(&self) -> Array1<u64> {
|
pub fn count_ones(&self) -> Array1<u64> {
|
||||||
let counts: Vec<u64> = (0..self.n_cols())
|
let counts: Vec<u64> = (0..self.n_cols())
|
||||||
|
|||||||
@@ -33,6 +33,14 @@ impl PersistentCompactIntMatrix {
|
|||||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fill `buf[i]` with `col_i[slot]`, without allocating.
|
||||||
|
/// `buf` must have length ≥ `self.n_cols()`.
|
||||||
|
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
|
for (c, col) in self.cols.iter().enumerate() {
|
||||||
|
buf[c] = col.get(slot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Distance matrices ─────────────────────────────────────────────────────
|
// ── Distance matrices ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn bray_dist_matrix(&self) -> Array2<f64> {
|
pub fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||||
|
|||||||
+237
-127
@@ -5,13 +5,13 @@ use std::sync::Arc;
|
|||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::KmerIndex;
|
use obikindex::KmerIndex;
|
||||||
use obikpartitionner::PreloadedIndex;
|
|
||||||
use obilayeredmap::IndexMode;
|
|
||||||
use obiread::chunk::read_sequence_chunks;
|
|
||||||
use obiread::record::{SeqRecord, parse_chunk};
|
|
||||||
use obikrope::Rope;
|
use obikrope::Rope;
|
||||||
use obikseq::{RoutableSuperKmer, set_k, set_m};
|
use obikseq::{RoutableSuperKmer, set_k, set_m};
|
||||||
|
use obilayeredmap::IndexMode;
|
||||||
|
use obiread::chunk::read_sequence_chunks_sized;
|
||||||
|
use obiread::record::{SeqRecord, parse_chunk};
|
||||||
use obiskbuilder::SuperKmerIter;
|
use obiskbuilder::SuperKmerIter;
|
||||||
|
use obisys::available_memory_bytes;
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
// ── Pipeline data ─────────────────────────────────────────────────────────────
|
// ── Pipeline data ─────────────────────────────────────────────────────────────
|
||||||
@@ -70,6 +70,10 @@ pub struct QueryArgs {
|
|||||||
.unwrap_or(1)
|
.unwrap_or(1)
|
||||||
)]
|
)]
|
||||||
pub threads: usize,
|
pub threads: usize,
|
||||||
|
|
||||||
|
/// I/O chunk size in MiB (default: auto-sized from available RAM and thread count)
|
||||||
|
#[arg(long)]
|
||||||
|
pub chunk_size: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SKDesc — one occurrence of a superkmer in the batch ───────────────────────
|
// ── SKDesc — one occurrence of a superkmer in the batch ───────────────────────
|
||||||
@@ -98,25 +102,24 @@ pub struct QueryBatch {
|
|||||||
|
|
||||||
impl QueryBatch {
|
impl QueryBatch {
|
||||||
/// Build a batch from a vec of parsed sequence records.
|
/// Build a batch from a vec of parsed sequence records.
|
||||||
pub fn from_records(
|
pub fn from_records(records: Vec<SeqRecord>, k: usize, level_max: usize, theta: f64) -> Self {
|
||||||
records: Vec<SeqRecord>,
|
|
||||||
k: usize,
|
|
||||||
level_max: usize,
|
|
||||||
theta: f64,
|
|
||||||
) -> Self {
|
|
||||||
let mut ids = Vec::with_capacity(records.len());
|
let mut ids = Vec::with_capacity(records.len());
|
||||||
let mut seqs = Vec::with_capacity(records.len());
|
let mut seqs = Vec::with_capacity(records.len());
|
||||||
let mut n_kmers = Vec::with_capacity(records.len());
|
let mut n_kmers = Vec::with_capacity(records.len());
|
||||||
let mut map: HashMap<RoutableSuperKmer, Vec<SKDesc>> = HashMap::new();
|
// Upper-bound estimate: at most one superkmer per k bases.
|
||||||
|
// Avoids repeated rehash on large chunks.
|
||||||
|
let cap = records.iter().map(|r| r.normalized.len()).sum::<usize>() / k.max(1);
|
||||||
|
let mut map: HashMap<RoutableSuperKmer, Vec<SKDesc>> = HashMap::with_capacity(cap);
|
||||||
|
|
||||||
for (seq_idx, record) in records.into_iter().enumerate() {
|
for (seq_idx, record) in records.into_iter().enumerate() {
|
||||||
let mut kmer_offset = 0u32;
|
let mut kmer_offset = 0u32;
|
||||||
|
|
||||||
for rsk in SuperKmerIter::new(&record.normalized, k, level_max, theta) {
|
for rsk in SuperKmerIter::new(&record.normalized, k, level_max, theta) {
|
||||||
let n = (rsk.seql() - k + 1) as u32;
|
let n = (rsk.seql() - k + 1) as u32;
|
||||||
map.entry(rsk)
|
map.entry(rsk).or_default().push(SKDesc {
|
||||||
.or_default()
|
seq_idx: seq_idx as u32,
|
||||||
.push(SKDesc { seq_idx: seq_idx as u32, kmer_offset });
|
kmer_offset,
|
||||||
|
});
|
||||||
kmer_offset += n;
|
kmer_offset += n;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -125,7 +128,12 @@ impl QueryBatch {
|
|||||||
n_kmers.push(kmer_offset);
|
n_kmers.push(kmer_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
Self { ids, seqs, n_kmers, map }
|
Self {
|
||||||
|
ids,
|
||||||
|
seqs,
|
||||||
|
n_kmers,
|
||||||
|
map,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Split the superkmer map by partition index.
|
/// Split the superkmer map by partition index.
|
||||||
@@ -140,6 +148,59 @@ impl QueryBatch {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── KmerResults — allocation-free ragged result matrix ───────────────────────
|
||||||
|
|
||||||
|
/// Flat storage for per-kmer query results across all sequences in a chunk.
|
||||||
|
///
|
||||||
|
/// Replaces `Vec<Vec<Option<Box<[u32]>>>>` — a single allocation for the whole
|
||||||
|
/// chunk instead of one `Box<[u32]>` per found k-mer.
|
||||||
|
struct KmerResults {
|
||||||
|
data: Vec<u32>, // total_kmers × n_genomes, row-major
|
||||||
|
in_index: Vec<bool>, // total_kmers — true if the kmer was found in the index
|
||||||
|
offsets: Vec<usize>, // offsets[i]..offsets[i+1] = kmer range for sequence i
|
||||||
|
n_genomes: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KmerResults {
|
||||||
|
fn new(n_kmers_per_seq: &[u32], n_genomes: usize) -> Self {
|
||||||
|
let mut offsets = Vec::with_capacity(n_kmers_per_seq.len() + 1);
|
||||||
|
let mut total = 0usize;
|
||||||
|
offsets.push(0);
|
||||||
|
for &n in n_kmers_per_seq {
|
||||||
|
total += n as usize;
|
||||||
|
offsets.push(total);
|
||||||
|
}
|
||||||
|
Self {
|
||||||
|
data: vec![0u32; total * n_genomes],
|
||||||
|
in_index: vec![false; total],
|
||||||
|
offsets,
|
||||||
|
n_genomes,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn n_kmers_for(&self, seq: usize) -> usize {
|
||||||
|
self.offsets[seq + 1] - self.offsets[seq]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set(&mut self, seq: usize, kmer: usize, row: &[u32]) {
|
||||||
|
let abs = self.offsets[seq] + kmer;
|
||||||
|
self.in_index[abs] = true;
|
||||||
|
let base = abs * self.n_genomes;
|
||||||
|
self.data[base..base + self.n_genomes].copy_from_slice(row);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_in_index(&self, seq: usize, kmer: usize) -> bool {
|
||||||
|
self.in_index[self.offsets[seq] + kmer]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Value for genome `g` at (seq, kmer); meaningful only when `is_in_index`.
|
||||||
|
#[inline]
|
||||||
|
fn val(&self, seq: usize, kmer: usize, g: usize) -> u32 {
|
||||||
|
self.data[(self.offsets[seq] + kmer) * self.n_genomes + g]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Per-sequence accumulator ──────────────────────────────────────────────────
|
// ── Per-sequence accumulator ──────────────────────────────────────────────────
|
||||||
|
|
||||||
struct SeqAcc {
|
struct SeqAcc {
|
||||||
@@ -158,61 +219,10 @@ impl SeqAcc {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Findere z-window filter ───────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/// Apply the Findere z-window filter to per-kmer query results for one superkmer.
|
|
||||||
/// Aggregate s-mer query results into k-mer answers using a Findere z-window.
|
|
||||||
///
|
|
||||||
/// Input: N s-mer results (indexed kmer size s = k − z + 1).
|
|
||||||
/// Output: N − z + 1 k-mer results (user kmer size k).
|
|
||||||
///
|
|
||||||
/// For each genome g independently: k-mer at position i is confirmed iff all z values
|
|
||||||
/// results[i..i+z][g] are nonzero (None counts as zero for all genomes).
|
|
||||||
/// Output values are taken from results[i]; genomes not confirmed are zeroed.
|
|
||||||
fn apply_findere(
|
|
||||||
results: &[Option<Box<[u32]>>],
|
|
||||||
z: usize,
|
|
||||||
n_genomes: usize,
|
|
||||||
) -> Vec<Option<Box<[u32]>>> {
|
|
||||||
let n = results.len();
|
|
||||||
if z <= 1 {
|
|
||||||
return results.iter().map(|r| r.as_ref().map(|row| row.clone())).collect();
|
|
||||||
}
|
|
||||||
if n < z {
|
|
||||||
return Vec::new();
|
|
||||||
}
|
|
||||||
|
|
||||||
let out_n = n - z + 1;
|
|
||||||
let mut confirmed = vec![vec![false; n_genomes]; out_n];
|
|
||||||
|
|
||||||
for g in 0..n_genomes {
|
|
||||||
let hit = |i: usize| results[i].as_ref().map_or(false, |r| r[g] > 0);
|
|
||||||
|
|
||||||
let mut count: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
|
|
||||||
if count == z as u32 { confirmed[0][g] = true; }
|
|
||||||
|
|
||||||
for i in 1..out_n {
|
|
||||||
if hit(i - 1) { count -= 1; }
|
|
||||||
if hit(i + z - 1) { count += 1; }
|
|
||||||
if count == z as u32 { confirmed[i][g] = true; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
(0..out_n).map(|i| {
|
|
||||||
let first = results[i].as_ref()?;
|
|
||||||
let mut row: Box<[u32]> = first.clone();
|
|
||||||
for g in 0..n_genomes {
|
|
||||||
if !confirmed[i][g] { row[g] = 0; }
|
|
||||||
}
|
|
||||||
if row.iter().any(|&v| v > 0) { Some(row) } else { None }
|
|
||||||
}).collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── process_chunk ─────────────────────────────────────────────────────────────
|
// ── process_chunk ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
fn process_chunk(
|
fn process_chunk(
|
||||||
idx: &KmerIndex,
|
idx: &KmerIndex,
|
||||||
preloaded: &PreloadedIndex,
|
|
||||||
rope: Rope,
|
rope: Rope,
|
||||||
k: usize,
|
k: usize,
|
||||||
n_genomes: usize,
|
n_genomes: usize,
|
||||||
@@ -232,11 +242,8 @@ fn process_chunk(
|
|||||||
let batch = QueryBatch::from_records(records, k, 6, 0.7);
|
let batch = QueryBatch::from_records(records, k, 6, 0.7);
|
||||||
let n_seqs = batch.ids.len();
|
let n_seqs = batch.ids.len();
|
||||||
|
|
||||||
// Per-sequence s-mer result vectors in global sequence position order.
|
// Flat result matrix — one allocation for the whole chunk.
|
||||||
// All partitions fill into this structure before Findere is applied.
|
let mut results = KmerResults::new(&batch.n_kmers, n_genomes);
|
||||||
let mut seq_results: Vec<Vec<Option<Box<[u32]>>>> = batch.n_kmers.iter()
|
|
||||||
.map(|&n| vec![None; n as usize])
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let by_part = batch.split_by_partition(n_partitions);
|
let by_part = batch.split_by_partition(n_partitions);
|
||||||
|
|
||||||
@@ -245,35 +252,54 @@ fn process_chunk(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let kmer_results = preloaded
|
idx.partition()
|
||||||
.query_partition(part_idx, part_sks, k, n_genomes)
|
.query_partition_with(
|
||||||
|
part_idx,
|
||||||
|
part_sks,
|
||||||
|
k,
|
||||||
|
n_genomes,
|
||||||
|
with_counts,
|
||||||
|
|sk_idx, kmer_idx, row| {
|
||||||
|
let rsk = part_sks[sk_idx];
|
||||||
|
let descs = batch.map.get(rsk).expect("rsk must be in map");
|
||||||
|
for desc in descs {
|
||||||
|
results.set(
|
||||||
|
desc.seq_idx as usize,
|
||||||
|
desc.kmer_offset as usize + kmer_idx,
|
||||||
|
row,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
.unwrap_or_else(|e| {
|
.unwrap_or_else(|e| {
|
||||||
eprintln!("query error on partition {part_idx}: {e}");
|
eprintln!("query error on partition {part_idx}: {e}");
|
||||||
std::process::exit(1);
|
std::process::exit(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
for (rsk, sk_kmer_results) in part_sks.iter().zip(kmer_results.iter()) {
|
|
||||||
let descs = batch.map.get(*rsk).expect("rsk must be in map");
|
|
||||||
for desc in descs {
|
|
||||||
let offset = desc.kmer_offset as usize;
|
|
||||||
let dst = &mut seq_results[desc.seq_idx as usize];
|
|
||||||
for (j, hit) in sk_kmer_results.iter().enumerate() {
|
|
||||||
dst[offset + j] = hit.as_ref().map(|r| r.clone());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply Findere on each complete sequence vector, then accumulate.
|
// Findere z-window filter + accumulation — no intermediate allocations.
|
||||||
let n_kmers_out: Vec<usize> = batch.n_kmers.iter()
|
// One `confirmed` buffer reused across all sequences.
|
||||||
.map(|&n| { let n = n as usize; if n >= effective_z { n - effective_z + 1 } else { 0 } })
|
let max_n_kmers = batch.n_kmers.iter().map(|&n| n as usize).max().unwrap_or(0);
|
||||||
|
let mut confirmed = vec![false; max_n_kmers * n_genomes];
|
||||||
|
|
||||||
|
let mut accs: Vec<SeqAcc> = (0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
|
||||||
|
|
||||||
|
let n_kmers_out: Vec<usize> = batch
|
||||||
|
.n_kmers
|
||||||
|
.iter()
|
||||||
|
.map(|&n| {
|
||||||
|
let n = n as usize;
|
||||||
|
if n >= effective_z {
|
||||||
|
n - effective_z + 1
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let mut accs: Vec<SeqAcc> =
|
|
||||||
(0..n_seqs).map(|_| SeqAcc::new(n_genomes)).collect();
|
|
||||||
|
|
||||||
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
|
let mut cov: Vec<Vec<Vec<u32>>> = if detail {
|
||||||
n_kmers_out.iter()
|
n_kmers_out
|
||||||
|
.iter()
|
||||||
.map(|&n| vec![vec![0u32; n]; n_genomes])
|
.map(|&n| vec![vec![0u32; n]; n_genomes])
|
||||||
.collect()
|
.collect()
|
||||||
} else {
|
} else {
|
||||||
@@ -282,39 +308,114 @@ fn process_chunk(
|
|||||||
|
|
||||||
let presence = force_presence || !with_counts;
|
let presence = force_presence || !with_counts;
|
||||||
let threshold = presence_threshold;
|
let threshold = presence_threshold;
|
||||||
|
let z = effective_z;
|
||||||
|
|
||||||
for seq_idx in 0..n_seqs {
|
for seq_idx in 0..n_seqs {
|
||||||
let filtered = apply_findere(&seq_results[seq_idx], effective_z, n_genomes);
|
let n = results.n_kmers_for(seq_idx);
|
||||||
let acc = &mut accs[seq_idx];
|
let out_n = n_kmers_out[seq_idx];
|
||||||
|
if out_n == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
for (pos, hit) in filtered.iter().enumerate() {
|
if z <= 1 {
|
||||||
match hit {
|
// No Findere — every indexed s-mer is confirmed.
|
||||||
None => {
|
let acc = &mut accs[seq_idx];
|
||||||
if seq_results[seq_idx][pos].is_none() {
|
for pos in 0..n {
|
||||||
|
if !results.is_in_index(seq_idx, pos) {
|
||||||
acc.kmer_missing += 1;
|
acc.kmer_missing += 1;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
Some(row) => {
|
|
||||||
acc.kmer_count += 1;
|
acc.kmer_count += 1;
|
||||||
for (g, &v) in row.iter().enumerate() {
|
for g in 0..n_genomes {
|
||||||
if v == 0 { continue; }
|
let v = results.val(seq_idx, pos, g);
|
||||||
let contribution = if presence {
|
if v == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let c = if presence {
|
||||||
u32::from(v >= threshold)
|
u32::from(v >= threshold)
|
||||||
} else {
|
} else {
|
||||||
v
|
v
|
||||||
};
|
};
|
||||||
acc.genome_totals[g] += contribution;
|
acc.genome_totals[g] += c;
|
||||||
if detail {
|
if detail {
|
||||||
cov[seq_idx][g][pos] += contribution;
|
cov[seq_idx][g][pos] += c;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Build confirmed[pos * n_genomes + g] via sliding window.
|
||||||
|
let conf = &mut confirmed[..out_n * n_genomes];
|
||||||
|
conf.fill(false);
|
||||||
|
|
||||||
|
for g in 0..n_genomes {
|
||||||
|
let hit =
|
||||||
|
|i: usize| results.is_in_index(seq_idx, i) && results.val(seq_idx, i, g) > 0;
|
||||||
|
|
||||||
|
let mut cnt: u32 = (0..z).filter(|&j| hit(j)).count() as u32;
|
||||||
|
if cnt == z as u32 {
|
||||||
|
conf[g] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 1..out_n {
|
||||||
|
if hit(i - 1) {
|
||||||
|
cnt -= 1;
|
||||||
|
}
|
||||||
|
if hit(i + z - 1) {
|
||||||
|
cnt += 1;
|
||||||
|
}
|
||||||
|
if cnt == z as u32 {
|
||||||
|
conf[i * n_genomes + g] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let acc = &mut accs[seq_idx];
|
||||||
|
for pos in 0..out_n {
|
||||||
|
let any = (0..n_genomes).any(|g| conf[pos * n_genomes + g]);
|
||||||
|
if !any {
|
||||||
|
if !results.is_in_index(seq_idx, pos) {
|
||||||
|
acc.kmer_missing += 1;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
acc.kmer_count += 1;
|
||||||
|
for g in 0..n_genomes {
|
||||||
|
if !conf[pos * n_genomes + g] {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let v = results.val(seq_idx, pos, g);
|
||||||
|
if v == 0 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let c = if presence {
|
||||||
|
u32::from(v >= threshold)
|
||||||
|
} else {
|
||||||
|
v
|
||||||
|
};
|
||||||
|
acc.genome_totals[g] += c;
|
||||||
|
if detail {
|
||||||
|
cov[seq_idx][g][pos] += c;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut buf = Vec::new();
|
// Capacity estimate: actual sequence + ID bytes, plus JSON overhead per record.
|
||||||
emit_batch(&batch, &accs, idx.meta(), count_missing, detail, &cov, &mut buf);
|
// JSON per record ≈ 50 fixed chars + ~20 per genome (label + count value) + 100 (overhead).
|
||||||
|
let seq_bytes: usize = batch.seqs.iter().map(|s| s.len()).sum();
|
||||||
|
let id_bytes: usize = batch.ids.iter().map(|s| s.len()).sum();
|
||||||
|
let cap = seq_bytes + id_bytes + n_seqs * (4 + 50 + n_genomes * 20) + 100;
|
||||||
|
let mut buf = Vec::with_capacity(cap);
|
||||||
|
emit_batch(
|
||||||
|
&batch,
|
||||||
|
&accs,
|
||||||
|
idx.meta(),
|
||||||
|
count_missing,
|
||||||
|
detail,
|
||||||
|
&cov,
|
||||||
|
&mut buf,
|
||||||
|
);
|
||||||
buf
|
buf
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -335,11 +436,23 @@ pub fn run(args: QueryArgs) {
|
|||||||
let with_counts = idx.meta().config.with_counts;
|
let with_counts = idx.meta().config.with_counts;
|
||||||
let n_workers = args.threads.max(1);
|
let n_workers = args.threads.max(1);
|
||||||
|
|
||||||
let effective_z: usize = args.findere_z.unwrap_or_else(|| {
|
// Chunk size: each chunk stays in memory for its entire processing lifetime.
|
||||||
match idx.meta().config.evidence {
|
// Overhead per raw byte is ~8× (Rope + parsed records + superkmers + results).
|
||||||
|
// We target ≤ 50 % of available RAM across all concurrent workers.
|
||||||
|
let chunk_bytes = args
|
||||||
|
.chunk_size
|
||||||
|
.map(|mb| mb * 1024 * 1024)
|
||||||
|
.unwrap_or_else(|| {
|
||||||
|
let avail = available_memory_bytes();
|
||||||
|
let computed = avail / (n_workers as u64 * 16);
|
||||||
|
computed.clamp(4 * 1024 * 1024, 256 * 1024 * 1024) as usize
|
||||||
|
});
|
||||||
|
|
||||||
|
let effective_z: usize = args
|
||||||
|
.findere_z
|
||||||
|
.unwrap_or_else(|| match idx.meta().config.evidence {
|
||||||
IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize,
|
IndexMode::Approx { z, .. } | IndexMode::Hybrid { z, .. } => z as usize,
|
||||||
IndexMode::Exact => 1,
|
IndexMode::Exact => 1,
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
@@ -352,14 +465,6 @@ pub fn run(args: QueryArgs) {
|
|||||||
eprintln!("warning: --mismatch not yet implemented, ignored");
|
eprintln!("warning: --mismatch not yet implemented, ignored");
|
||||||
}
|
}
|
||||||
|
|
||||||
let preloaded = Arc::new(
|
|
||||||
PreloadedIndex::new(idx.partition(), n_partitions, with_counts)
|
|
||||||
.unwrap_or_else(|e| {
|
|
||||||
eprintln!("error loading index layers: {e}");
|
|
||||||
std::process::exit(1);
|
|
||||||
})
|
|
||||||
);
|
|
||||||
|
|
||||||
let detail = args.detail;
|
let detail = args.detail;
|
||||||
let count_missing = args.count_missing;
|
let count_missing = args.count_missing;
|
||||||
let force_presence = args.force_presence;
|
let force_presence = args.force_presence;
|
||||||
@@ -367,13 +472,18 @@ pub fn run(args: QueryArgs) {
|
|||||||
|
|
||||||
// Flat iterator over all Rope chunks from all input files.
|
// Flat iterator over all Rope chunks from all input files.
|
||||||
// I/O runs in the source thread; chunk processing is parallelised by the pipe.
|
// I/O runs in the source thread; chunk processing is parallelised by the pipe.
|
||||||
|
info!("query: chunk_size={}MiB", chunk_bytes / (1024 * 1024));
|
||||||
|
|
||||||
let paths: Vec<PathBuf> = args.inputs.iter().map(PathBuf::from).collect();
|
let paths: Vec<PathBuf> = args.inputs.iter().map(PathBuf::from).collect();
|
||||||
let all_chunks = paths.into_iter().flat_map(|path| {
|
let all_chunks = paths.into_iter().flat_map(move |path| {
|
||||||
let path_str = path.to_str().unwrap_or("").to_owned();
|
let path_str = path.to_str().unwrap_or("").to_owned();
|
||||||
match read_sequence_chunks(&path_str) {
|
match read_sequence_chunks_sized(&path_str, chunk_bytes) {
|
||||||
Ok(iter) => Box::new(iter.filter_map(|r| match r {
|
Ok(iter) => Box::new(iter.filter_map(|r| match r {
|
||||||
Ok(rope) => Some(rope),
|
Ok(rope) => Some(rope),
|
||||||
Err(e) => { eprintln!("read error: {e}"); None }
|
Err(e) => {
|
||||||
|
eprintln!("read error: {e}");
|
||||||
|
None
|
||||||
|
}
|
||||||
})) as Box<dyn Iterator<Item = Rope> + Send>,
|
})) as Box<dyn Iterator<Item = Rope> + Send>,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("error opening {path_str}: {e}");
|
eprintln!("error opening {path_str}: {e}");
|
||||||
@@ -386,10 +496,9 @@ pub fn run(args: QueryArgs) {
|
|||||||
QueryData : Rope => Vec<u8>,
|
QueryData : Rope => Vec<u8>,
|
||||||
| {
|
| {
|
||||||
let idx = Arc::clone(&idx);
|
let idx = Arc::clone(&idx);
|
||||||
let preloaded = Arc::clone(&preloaded);
|
|
||||||
move |rope: Rope| {
|
move |rope: Rope| {
|
||||||
process_chunk(
|
process_chunk(
|
||||||
&idx, &preloaded, rope, k, n_genomes, n_partitions, with_counts,
|
&idx, rope, k, n_genomes, n_partitions, with_counts,
|
||||||
effective_z, detail, count_missing, force_presence, presence_threshold,
|
effective_z, detail, count_missing, force_presence, presence_threshold,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -434,17 +543,18 @@ fn emit_batch(
|
|||||||
if detail && !cov.is_empty() {
|
if detail && !cov.is_empty() {
|
||||||
let mut cov_map = serde_json::Map::new();
|
let mut cov_map = serde_json::Map::new();
|
||||||
for (g, genome) in meta.genomes.iter().enumerate() {
|
for (g, genome) in meta.genomes.iter().enumerate() {
|
||||||
let v: Vec<serde_json::Value> =
|
let v: Vec<serde_json::Value> = cov[seq_idx][g].iter().map(|&x| x.into()).collect();
|
||||||
cov[seq_idx][g].iter().map(|&x| x.into()).collect();
|
|
||||||
cov_map.insert(genome.label.clone(), v.into());
|
cov_map.insert(genome.label.clone(), v.into());
|
||||||
}
|
}
|
||||||
ann.insert("coverage".into(), cov_map.into());
|
ann.insert("coverage".into(), cov_map.into());
|
||||||
}
|
}
|
||||||
|
|
||||||
let ann_str = serde_json::to_string(&ann).unwrap_or_else(|_| "{}".to_string());
|
|
||||||
|
|
||||||
// OBITools4 FASTA format: >id {"key":value,...}
|
// OBITools4 FASTA format: >id {"key":value,...}
|
||||||
let _ = writeln!(out, ">{id} {ann_str}");
|
let _ = out.write_all(b">");
|
||||||
|
let _ = out.write_all(id.as_bytes());
|
||||||
|
let _ = out.write_all(b" ");
|
||||||
|
let _ = serde_json::to_writer(&mut *out, &ann);
|
||||||
|
let _ = out.write_all(b"\n");
|
||||||
let _ = out.write_all(seq);
|
let _ = out.write_all(seq);
|
||||||
let _ = out.write_all(b"\n");
|
let _ = out.write_all(b"\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,4 +11,3 @@ mod rebuild_layer;
|
|||||||
pub use filter::KmerFilter;
|
pub use filter::KmerFilter;
|
||||||
pub use merge_layer::MergeMode;
|
pub use merge_layer::MergeMode;
|
||||||
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
pub use partition::{KmerPartition, KmerSpectrum, PARTITIONS_SUBDIR};
|
||||||
pub use query_layer::PreloadedIndex;
|
|
||||||
|
|||||||
@@ -50,129 +50,91 @@ impl QueryLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return `Some(per-genome row)` if `kmer` is indexed in this layer, else `None`.
|
/// Write per-genome values into `buf` if `kmer` is indexed in this layer.
|
||||||
fn find(&self, kmer: CanonicalKmer, n_genomes: usize) -> Option<Box<[u32]>> {
|
/// Returns `true` on hit; `buf` is untouched on miss.
|
||||||
|
fn find_into(&self, kmer: CanonicalKmer, n_genomes: usize, buf: &mut [u32]) -> bool {
|
||||||
match self {
|
match self {
|
||||||
QueryLayer::SetOnly(mphf) => {
|
QueryLayer::SetOnly(mphf) => {
|
||||||
mphf.find(kmer)
|
if mphf.find(kmer).is_some() {
|
||||||
.map(|_| vec![1u32; n_genomes].into_boxed_slice())
|
buf[..n_genomes].fill(1);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
QueryLayer::Presence(mphf, mat) => {
|
QueryLayer::Presence(mphf, mat) => {
|
||||||
mphf.find(kmer)
|
if let Some(slot) = mphf.find(kmer) {
|
||||||
.map(|slot| mat.row(slot).iter().map(|&b| b as u32).collect())
|
mat.fill_row(slot, &mut buf[..n_genomes]);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
QueryLayer::Count(mphf, mat) => {
|
QueryLayer::Count(mphf, mat) => {
|
||||||
mphf.find(kmer).map(|slot| mat.row(slot))
|
if let Some(slot) = mphf.find(kmer) {
|
||||||
|
mat.fill_row(slot, &mut buf[..n_genomes]);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── PreloadedIndex ────────────────────────────────────────────────────────────
|
// ── KmerPartition::query_partition* ──────────────────────────────────────────
|
||||||
|
|
||||||
/// All query layers for every partition, opened once at startup.
|
impl KmerPartition {
|
||||||
///
|
/// Query a single partition, calling `on_hit(sk_idx, kmer_idx, row)` for
|
||||||
/// Wrap in `Arc` and share across worker threads — all access is read-only.
|
/// every found k-mer without allocating intermediate result vectors.
|
||||||
pub struct PreloadedIndex {
|
|
||||||
/// `layers[part_idx]` — ordered vec of query layers for that partition.
|
|
||||||
/// Empty vec when the partition has no index directory yet.
|
|
||||||
layers: Vec<Vec<QueryLayer>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
// SAFETY: QueryLayer and its contents are opened read-only (mmap + in-memory
|
|
||||||
// data structures). No mutation occurs after construction.
|
|
||||||
unsafe impl Sync for PreloadedIndex {}
|
|
||||||
unsafe impl Send for PreloadedIndex {}
|
|
||||||
|
|
||||||
impl PreloadedIndex {
|
|
||||||
/// Open all partition index directories and deserialise every MPHF once.
|
|
||||||
///
|
///
|
||||||
/// This is the expensive call — do it once before spawning query workers.
|
/// `row` is a shared scratch buffer valid only for the duration of the call;
|
||||||
pub fn new(
|
/// the callback must copy what it needs before returning.
|
||||||
partition: &KmerPartition,
|
pub fn query_partition_with<F>(
|
||||||
n_partitions: usize,
|
|
||||||
with_counts: bool,
|
|
||||||
) -> SKResult<Self> {
|
|
||||||
let active: Vec<usize> = (0..n_partitions).collect();
|
|
||||||
Self::new_subset(partition, n_partitions, &active, with_counts)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Open only the listed partition indices.
|
|
||||||
///
|
|
||||||
/// Keeps file-descriptor and memory usage bounded to the active set.
|
|
||||||
/// Unlisted partitions have an empty layer vec and return all-None on query.
|
|
||||||
pub fn new_subset(
|
|
||||||
partition: &KmerPartition,
|
|
||||||
n_partitions: usize,
|
|
||||||
active: &[usize],
|
|
||||||
with_counts: bool,
|
|
||||||
) -> SKResult<Self> {
|
|
||||||
let mut layers: Vec<Vec<QueryLayer>> = (0..n_partitions).map(|_| Vec::new()).collect();
|
|
||||||
for &i in active {
|
|
||||||
let index_dir = partition.part_dir(i).join(INDEX_SUBDIR);
|
|
||||||
if !index_dir.exists() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
|
||||||
layers[i] = (0..meta.n_layers)
|
|
||||||
.map(|l| QueryLayer::open(
|
|
||||||
&index_dir.join(format!("layer_{l}")),
|
|
||||||
with_counts,
|
|
||||||
&meta.mode,
|
|
||||||
))
|
|
||||||
.collect::<SKResult<_>>()?;
|
|
||||||
}
|
|
||||||
Ok(Self { layers })
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Query one partition for a slice of already-routed super-kmers.
|
|
||||||
///
|
|
||||||
/// Returns one entry per input super-kmer; each entry is a `Vec` with one
|
|
||||||
/// `Option<Box<[u32]>>` per k-mer inside that super-kmer:
|
|
||||||
/// - `None` — k-mer absent from the index
|
|
||||||
/// - `Some(row)` — per-genome count or 0/1 presence
|
|
||||||
pub fn query_partition(
|
|
||||||
&self,
|
&self,
|
||||||
part_idx: usize,
|
part_idx: usize,
|
||||||
superkmers: &[&RoutableSuperKmer],
|
superkmers: &[&RoutableSuperKmer],
|
||||||
k: usize,
|
k: usize,
|
||||||
n_genomes: usize,
|
n_genomes: usize,
|
||||||
) -> SKResult<Vec<Vec<Option<Box<[u32]>>>>> {
|
with_counts: bool,
|
||||||
|
mut on_hit: F,
|
||||||
|
) -> SKResult<()>
|
||||||
|
where
|
||||||
|
F: FnMut(usize, usize, &[u32]),
|
||||||
|
{
|
||||||
if superkmers.is_empty() {
|
if superkmers.is_empty() {
|
||||||
return Ok(Vec::new());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let layers = &self.layers[part_idx];
|
let index_dir = self.part_dir(part_idx).join(INDEX_SUBDIR);
|
||||||
|
if !index_dir.exists() {
|
||||||
if layers.is_empty() {
|
return Ok(());
|
||||||
return Ok(superkmers
|
|
||||||
.iter()
|
|
||||||
.map(|rsk| vec![None; rsk.seql() - k + 1])
|
|
||||||
.collect());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(superkmers
|
let meta = PartitionMeta::load(&index_dir).map_err(olm_to_sk)?;
|
||||||
.iter()
|
let layers: Vec<QueryLayer> = (0..meta.n_layers)
|
||||||
.map(|rsk| {
|
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts, &meta.mode))
|
||||||
rsk.superkmer()
|
.collect::<SKResult<_>>()?;
|
||||||
.iter_canonical_kmers()
|
|
||||||
.map(|kmer| {
|
let mut buf = vec![0u32; n_genomes];
|
||||||
layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
|
|
||||||
})
|
for (sk_idx, rsk) in superkmers.iter().enumerate() {
|
||||||
.collect()
|
for (kmer_idx, kmer) in rsk.superkmer().iter_canonical_kmers().enumerate() {
|
||||||
})
|
for layer in &layers {
|
||||||
.collect())
|
if layer.find_into(kmer, n_genomes, &mut buf) {
|
||||||
|
on_hit(sk_idx, kmer_idx, &buf);
|
||||||
|
buf.fill(0);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// ── KmerPartition::query_partition (kept for backward compatibility) ──────────
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
impl KmerPartition {
|
|
||||||
/// Query a single partition for a slice of (already-routed) super-kmers.
|
/// Query a single partition for a slice of (already-routed) super-kmers.
|
||||||
///
|
/// Prefer [`query_partition_with`] to avoid per-kmer heap allocations.
|
||||||
/// **Prefer [`PreloadedIndex`] for repeated queries** — this method
|
|
||||||
/// re-opens and deserialises the MPHF on every call.
|
|
||||||
#[deprecated(note = "use PreloadedIndex::query_partition to avoid repeated MPHF I/O")]
|
|
||||||
pub fn query_partition(
|
pub fn query_partition(
|
||||||
&self,
|
&self,
|
||||||
part_idx: usize,
|
part_idx: usize,
|
||||||
@@ -199,13 +161,21 @@ impl KmerPartition {
|
|||||||
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts, &meta.mode))
|
.map(|i| QueryLayer::open(&index_dir.join(format!("layer_{i}")), with_counts, &meta.mode))
|
||||||
.collect::<SKResult<_>>()?;
|
.collect::<SKResult<_>>()?;
|
||||||
|
|
||||||
|
let mut buf = vec![0u32; n_genomes];
|
||||||
Ok(superkmers
|
Ok(superkmers
|
||||||
.iter()
|
.iter()
|
||||||
.map(|rsk| {
|
.map(|rsk| {
|
||||||
rsk.superkmer()
|
rsk.superkmer()
|
||||||
.iter_canonical_kmers()
|
.iter_canonical_kmers()
|
||||||
.map(|kmer| {
|
.map(|kmer| {
|
||||||
layers.iter().find_map(|layer| layer.find(kmer, n_genomes))
|
for layer in &layers {
|
||||||
|
if layer.find_into(kmer, n_genomes, &mut buf) {
|
||||||
|
let row: Box<[u32]> = buf[..n_genomes].into();
|
||||||
|
buf.fill(0);
|
||||||
|
return Some(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
})
|
})
|
||||||
.collect()
|
.collect()
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -17,8 +17,13 @@ pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
|||||||
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
|
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
|
||||||
pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
||||||
|
|
||||||
|
/// Owned MPHF — used only at build time (construction + store).
|
||||||
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||||
|
|
||||||
|
/// Zero-copy MPHF for querying — ε-deserialized view into a memory-mapped file.
|
||||||
|
/// `MemCase` owns the mmap backing; `'static` is sound because MemCase pins the memory.
|
||||||
|
type MphfEps = PtrHash<u64, CubicEps, CachelineEfVec<&'static [CachelineEf]>, Xx64, &'static [u8]>;
|
||||||
|
|
||||||
// ── LayerEvidence ─────────────────────────────────────────────────────────────
|
// ── LayerEvidence ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
enum LayerEvidence {
|
enum LayerEvidence {
|
||||||
@@ -36,7 +41,7 @@ enum LayerEvidence {
|
|||||||
/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers,
|
/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers,
|
||||||
/// O(n) sequential scan on Approx layers.
|
/// O(n) sequential scan on Approx layers.
|
||||||
pub struct MphfLayer {
|
pub struct MphfLayer {
|
||||||
mphf: Mphf,
|
mphf: MemCase<MphfEps>,
|
||||||
ev: LayerEvidence,
|
ev: LayerEvidence,
|
||||||
n: usize,
|
n: usize,
|
||||||
}
|
}
|
||||||
@@ -45,7 +50,7 @@ impl MphfLayer {
|
|||||||
/// Open a layer using the index-level `mode` determined at `LayeredMap` open time.
|
/// Open a layer using the index-level `mode` determined at `LayeredMap` open time.
|
||||||
/// No per-layer metadata file is read.
|
/// No per-layer metadata file is read.
|
||||||
pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult<Self> {
|
pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult<Self> {
|
||||||
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
let mphf: MemCase<MphfEps> = Mphf::mmap(&dir.join(MPHF_FILE), Flags::empty())
|
||||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
let (ev, n) = match mode {
|
let (ev, n) = match mode {
|
||||||
IndexMode::Exact => {
|
IndexMode::Exact => {
|
||||||
@@ -137,11 +142,11 @@ impl MphfLayer {
|
|||||||
///
|
///
|
||||||
/// Use this when the caller guarantees that all queried kmers are in the MPHF
|
/// Use this when the caller guarantees that all queried kmers are in the MPHF
|
||||||
/// domain (e.g. when iterating the source's own unitigs during merge).
|
/// domain (e.g. when iterating the source's own unitigs during merge).
|
||||||
pub struct MphfOnly(Mphf);
|
pub struct MphfOnly(MemCase<MphfEps>);
|
||||||
|
|
||||||
impl MphfOnly {
|
impl MphfOnly {
|
||||||
pub fn open(dir: &Path) -> OLMResult<Self> {
|
pub fn open(dir: &Path) -> OLMResult<Self> {
|
||||||
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
let mphf: MemCase<MphfEps> = Mphf::mmap(&dir.join(MPHF_FILE), Flags::empty())
|
||||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
Ok(Self(mphf))
|
Ok(Self(mphf))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -153,11 +153,23 @@ pub fn read_fastq_chunks(
|
|||||||
/// Returns an error if the format cannot be identified as `text/fasta` or `text/fastq`.
|
/// Returns an error if the format cannot be identified as `text/fasta` or `text/fastq`.
|
||||||
pub fn read_sequence_chunks(
|
pub fn read_sequence_chunks(
|
||||||
path: &str,
|
path: &str,
|
||||||
|
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
||||||
|
read_sequence_chunks_sized(path, DEFAULT_BLOCK_SIZE)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as [`read_sequence_chunks`] but with an explicit I/O block size.
|
||||||
|
///
|
||||||
|
/// Larger values amortise per-partition open/close overhead across more superkmers.
|
||||||
|
pub fn read_sequence_chunks_sized(
|
||||||
|
path: &str,
|
||||||
|
block_size: usize,
|
||||||
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
) -> io::Result<SeqChunkIter<MimeTypeGuesser<Box<dyn Read + Send>>>> {
|
||||||
let input = match xopen(path) {
|
let input = match xopen(path) {
|
||||||
Ok(mut i) => match i.mime_type() {
|
Ok(mut i) => match i.mime_type() {
|
||||||
Some("text/fasta") => fasta_chunks(i),
|
Some("text/fasta") => SeqChunkIter::new(i, block_size,
|
||||||
Some("text/fastq") => fastq_chunks(i),
|
fasta::end_of_last_fasta_entry, Some("text/fasta")),
|
||||||
|
Some("text/fastq") => SeqChunkIter::new(i, block_size,
|
||||||
|
fastq::end_of_last_fastq_entry, Some("text/fastq")),
|
||||||
_ => {
|
_ => {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(
|
||||||
io::ErrorKind::InvalidData,
|
io::ErrorKind::InvalidData,
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ pub mod xopen;
|
|||||||
|
|
||||||
pub use chunk::{
|
pub use chunk::{
|
||||||
SeqChunkIter, fasta_chunks, fastq_chunks, read_fasta_chunks, read_fastq_chunks,
|
SeqChunkIter, fasta_chunks, fastq_chunks, read_fasta_chunks, read_fastq_chunks,
|
||||||
read_sequence_chunks,
|
read_sequence_chunks, read_sequence_chunks_sized,
|
||||||
};
|
};
|
||||||
pub use mimetype::MimeTypeGuesser;
|
pub use mimetype::MimeTypeGuesser;
|
||||||
pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequence_chunk};
|
pub use normalize::{normalize_fasta_chunk, normalize_fastq_chunk, normalize_sequence_chunk};
|
||||||
|
|||||||
@@ -5,3 +5,4 @@ edition = "2024"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
|
sysinfo = "0.33"
|
||||||
|
|||||||
@@ -2,6 +2,21 @@ use std::fmt;
|
|||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
use libc::{RUSAGE_SELF, getrusage, rusage, timeval};
|
use libc::{RUSAGE_SELF, getrusage, rusage, timeval};
|
||||||
|
use sysinfo::System;
|
||||||
|
|
||||||
|
// ── Memory query ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Returns the number of bytes available for allocation on this machine.
|
||||||
|
///
|
||||||
|
/// On macOS, `available_memory()` can return 0 when the memory compressor
|
||||||
|
/// inflates the page count; in that case we fall back to half of total memory.
|
||||||
|
pub fn available_memory_bytes() -> u64 {
|
||||||
|
let sys = System::new_all();
|
||||||
|
match sys.available_memory() {
|
||||||
|
0 => sys.total_memory() / 2,
|
||||||
|
n => n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── raw helpers ───────────────────────────────────────────────────────────────
|
// ── raw helpers ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user