✨ add PhantomData import for generic type safety
- Added `use std::marker::PhantomData;` to prepare for generic scheduler implementations.
- Ensures type safety and avoids unused lifetime/type-parameter warnings.
This commit is contained in:
@@ -20,13 +20,13 @@ use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
const META_FILENAME: &str = "partition.meta";
|
||||
const SK_EXT: &str = "skmer.zst";
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PartitionMeta {
|
||||
n_bits: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
format: String,
|
||||
level: u32,
|
||||
}
|
||||
|
||||
@@ -37,7 +37,6 @@ pub struct KmerPartition {
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
writers: Vec<Option<SKFileWriter>>,
|
||||
format: Format,
|
||||
level: Level,
|
||||
closed: bool,
|
||||
}
|
||||
@@ -50,15 +49,7 @@ impl KmerPartition {
|
||||
minimizer_size: usize,
|
||||
force: bool,
|
||||
) -> SKResult<Self> {
|
||||
Self::create_with(
|
||||
path,
|
||||
n_bits,
|
||||
kmer_size,
|
||||
minimizer_size,
|
||||
Format::Zstd,
|
||||
Level::Three,
|
||||
force,
|
||||
)
|
||||
Self::create_with(path, n_bits, kmer_size, minimizer_size, Level::Three, force)
|
||||
}
|
||||
|
||||
pub fn create_with<P: AsRef<Path>>(
|
||||
@@ -66,7 +57,6 @@ impl KmerPartition {
|
||||
n_bits: usize,
|
||||
kmer_size: usize,
|
||||
minimizer_size: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
force: bool,
|
||||
) -> SKResult<Self> {
|
||||
@@ -95,7 +85,6 @@ impl KmerPartition {
|
||||
kmer_size,
|
||||
minimizer_size,
|
||||
writers,
|
||||
format,
|
||||
level,
|
||||
closed: false,
|
||||
};
|
||||
@@ -116,13 +105,6 @@ impl KmerPartition {
|
||||
let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?)
|
||||
.map_err(io::Error::other)?;
|
||||
|
||||
let format = match meta.format.as_str() {
|
||||
"gzip" => Format::Gzip,
|
||||
"bzip2" => Format::Bzip,
|
||||
"lzma" => Format::Lzma,
|
||||
"zstd" => Format::Zstd,
|
||||
_ => Format::No,
|
||||
};
|
||||
let level = level_from_u32(meta.level);
|
||||
let n_partitions = 1usize << meta.n_bits;
|
||||
let writers = (0..n_partitions).map(|_| None).collect();
|
||||
@@ -133,7 +115,6 @@ impl KmerPartition {
|
||||
kmer_size: meta.kmer_size,
|
||||
minimizer_size: meta.minimizer_size,
|
||||
writers,
|
||||
format,
|
||||
level,
|
||||
closed: true, // read-only: writing is not allowed on an opened partition
|
||||
})
|
||||
@@ -203,9 +184,7 @@ impl KmerPartition {
|
||||
/// partition). Higher values reduce per-temp-file memory at the cost of
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let format = self.format;
|
||||
let level = self.level;
|
||||
let ext = format_ext(format);
|
||||
let root = &self.root_path;
|
||||
let available = System::new_all().available_memory();
|
||||
let n_threads = rayon::current_num_threads().max(1) as u64;
|
||||
@@ -218,9 +197,9 @@ impl KmerPartition {
|
||||
if !dir.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
let raw_path = dir.join(format!("raw.{ext}"));
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
|
||||
dereplicate_partition(&dir, ext, format, level, n_buckets)
|
||||
dereplicate_partition(&dir, level, n_buckets)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -241,7 +220,6 @@ impl KmerPartition {
|
||||
/// Partitions are processed in parallel via Rayon (one task per thread).
|
||||
/// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
|
||||
pub fn count_kmer(&self) -> SKResult<()> {
|
||||
let ext = format_ext(self.format);
|
||||
let root = &self.root_path;
|
||||
let k = self.kmer_size;
|
||||
|
||||
@@ -249,7 +227,7 @@ impl KmerPartition {
|
||||
.into_par_iter()
|
||||
.map(|i| {
|
||||
let dir = root.join(format!("part_{:05}", i));
|
||||
let dedup_path = dir.join(format!("dereplicated.{ext}"));
|
||||
let dedup_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
if !dedup_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -320,14 +298,6 @@ impl KmerPartition {
|
||||
n_bits,
|
||||
kmer_size: self.kmer_size,
|
||||
minimizer_size: self.minimizer_size,
|
||||
format: match self.format {
|
||||
Format::Gzip => "gzip",
|
||||
Format::Bzip => "bzip2",
|
||||
Format::Lzma => "lzma",
|
||||
Format::Zstd => "zstd",
|
||||
Format::No => "none",
|
||||
}
|
||||
.to_owned(),
|
||||
level: u32::from(self.level),
|
||||
};
|
||||
let f = fs::File::create(self.root_path.join(META_FILENAME))?;
|
||||
@@ -339,9 +309,8 @@ impl KmerPartition {
|
||||
if self.writers[partition].is_none() {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let ext = format_ext(self.format);
|
||||
let file_path = dir.join(format!("raw.{ext}"));
|
||||
let writer = SKFileWriter::create_with(file_path, self.format, self.level)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
@@ -408,34 +377,19 @@ fn level_from_u32(n: u32) -> Level {
|
||||
}
|
||||
}
|
||||
|
||||
fn format_ext(format: Format) -> &'static str {
|
||||
match format {
|
||||
Format::Gzip => "skmer.gz",
|
||||
Format::Bzip => "skmer.bz2",
|
||||
Format::Lzma => "skmer.xz",
|
||||
Format::Zstd => "skmer.zst",
|
||||
Format::No => "skmer",
|
||||
}
|
||||
}
|
||||
|
||||
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||
|
||||
/// Deduplicate one partition directory in place (two-phase split + merge).
|
||||
fn dereplicate_partition(
|
||||
dir: &Path,
|
||||
ext: &str,
|
||||
format: Format,
|
||||
level: Level,
|
||||
n_temp: usize,
|
||||
) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{ext}"));
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
if !raw_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let out_path = dir.join(format!("dereplicated.{ext}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, format, level)?;
|
||||
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
|
||||
|
||||
if n_temp == 1 {
|
||||
// ── Direct path: partition fits in memory, no split needed ────────────
|
||||
@@ -446,13 +400,13 @@ fn dereplicate_partition(
|
||||
// ── Phase 1: split raw file into temp buckets ─────────────────────────
|
||||
let temp_mask = (n_temp as u64) - 1;
|
||||
let temp_paths: Vec<PathBuf> = (0..n_temp)
|
||||
.map(|j| dir.join(format!("temp_{j:04}.{ext}")))
|
||||
.map(|j| dir.join(format!("temp_{j:04}.{SK_EXT}")))
|
||||
.collect();
|
||||
|
||||
{
|
||||
let mut writers: Vec<SKFileWriter> = temp_paths
|
||||
.iter()
|
||||
.map(|p| SKFileWriter::create_with(p, format, level))
|
||||
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path)?;
|
||||
@@ -530,12 +484,8 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass1_superkmers += 1;
|
||||
let seql = sk.seql();
|
||||
if seql < k {
|
||||
continue;
|
||||
}
|
||||
for pos in 0..=(seql - k) {
|
||||
seen.insert(sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k));
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
seen.insert(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -583,8 +533,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
continue;
|
||||
}
|
||||
pass2_count_sum += sk_count as u64;
|
||||
for pos in 0..=(seql - k) {
|
||||
let kmer = sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k);
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
if let Some(idx) = mphf.get(&kmer) {
|
||||
counts[idx as usize] = counts[idx as usize].saturating_add(sk_count);
|
||||
pass2_kmer_hits += 1;
|
||||
|
||||
Reference in New Issue
Block a user