add PhantomData import for generic type safety

- Added `use std::marker::PhantomData;` to prepare for generic scheduler implementations
- Ensures type safety and avoids unused lifetime/type parameter warnings
This commit is contained in:
Eric Coissac
2026-04-27 23:27:42 +02:00
parent ebbfe35cbc
commit 4c19882f03
10 changed files with 328 additions and 271 deletions
+16 -67
View File
@@ -20,13 +20,13 @@ use rayon::prelude::*;
use serde::{Deserialize, Serialize};
const META_FILENAME: &str = "partition.meta";
const SK_EXT: &str = "skmer.zst";
/// Partition metadata persisted in the `partition.meta` file (`META_FILENAME`)
/// under the partition root and read back with `serde_json`.
// NOTE(review): this view is a diff rendering with old and new lines mixed;
// other hunks appear to stop reading/writing `format` — confirm whether the
// field is still part of the struct after this change.
#[derive(Serialize, Deserialize)]
struct PartitionMeta {
/// Number of partition bits; the partition count is `1 << n_bits`.
n_bits: usize,
/// K-mer size recorded for the partition.
kmer_size: usize,
/// Minimizer size recorded for the partition.
minimizer_size: usize,
/// Compression format name: "gzip", "bzip2", "lzma", "zstd" or "none".
/// When read back, any unrecognised name is treated as no compression.
format: String,
/// Compression level stored as a plain integer (converted with
/// `u32::from(level)` on write and `level_from_u32` on read).
level: u32,
}
@@ -37,7 +37,6 @@ pub struct KmerPartition {
kmer_size: usize,
minimizer_size: usize,
writers: Vec<Option<SKFileWriter>>,
format: Format,
level: Level,
closed: bool,
}
@@ -50,15 +49,7 @@ impl KmerPartition {
minimizer_size: usize,
force: bool,
) -> SKResult<Self> {
Self::create_with(
path,
n_bits,
kmer_size,
minimizer_size,
Format::Zstd,
Level::Three,
force,
)
Self::create_with(path, n_bits, kmer_size, minimizer_size, Level::Three, force)
}
pub fn create_with<P: AsRef<Path>>(
@@ -66,7 +57,6 @@ impl KmerPartition {
n_bits: usize,
kmer_size: usize,
minimizer_size: usize,
format: Format,
level: Level,
force: bool,
) -> SKResult<Self> {
@@ -95,7 +85,6 @@ impl KmerPartition {
kmer_size,
minimizer_size,
writers,
format,
level,
closed: false,
};
@@ -116,13 +105,6 @@ impl KmerPartition {
let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?)
.map_err(io::Error::other)?;
let format = match meta.format.as_str() {
"gzip" => Format::Gzip,
"bzip2" => Format::Bzip,
"lzma" => Format::Lzma,
"zstd" => Format::Zstd,
_ => Format::No,
};
let level = level_from_u32(meta.level);
let n_partitions = 1usize << meta.n_bits;
let writers = (0..n_partitions).map(|_| None).collect();
@@ -133,7 +115,6 @@ impl KmerPartition {
kmer_size: meta.kmer_size,
minimizer_size: meta.minimizer_size,
writers,
format,
level,
closed: true, // read-only: writing is not allowed on an opened partition
})
@@ -203,9 +184,7 @@ impl KmerPartition {
/// partition). Higher values reduce per-temp-file memory at the cost of
/// more temporary file descriptors — all managed by the global fd pool.
pub fn dereplicate(&self) -> SKResult<()> {
let format = self.format;
let level = self.level;
let ext = format_ext(format);
let root = &self.root_path;
let available = System::new_all().available_memory();
let n_threads = rayon::current_num_threads().max(1) as u64;
@@ -218,9 +197,9 @@ impl KmerPartition {
if !dir.exists() {
return Ok(());
}
let raw_path = dir.join(format!("raw.{ext}"));
let raw_path = dir.join(format!("raw.{SK_EXT}"));
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
dereplicate_partition(&dir, ext, format, level, n_buckets)
dereplicate_partition(&dir, level, n_buckets)
})
.collect();
@@ -241,7 +220,6 @@ impl KmerPartition {
/// Partitions are processed in parallel via Rayon (one task per thread).
/// Peak memory per partition is ~80 MB, so n_threads partitions run simultaneously.
pub fn count_kmer(&self) -> SKResult<()> {
let ext = format_ext(self.format);
let root = &self.root_path;
let k = self.kmer_size;
@@ -249,7 +227,7 @@ impl KmerPartition {
.into_par_iter()
.map(|i| {
let dir = root.join(format!("part_{:05}", i));
let dedup_path = dir.join(format!("dereplicated.{ext}"));
let dedup_path = dir.join(format!("dereplicated.{SK_EXT}"));
if !dedup_path.exists() {
return Ok(());
}
@@ -320,14 +298,6 @@ impl KmerPartition {
n_bits,
kmer_size: self.kmer_size,
minimizer_size: self.minimizer_size,
format: match self.format {
Format::Gzip => "gzip",
Format::Bzip => "bzip2",
Format::Lzma => "lzma",
Format::Zstd => "zstd",
Format::No => "none",
}
.to_owned(),
level: u32::from(self.level),
};
let f = fs::File::create(self.root_path.join(META_FILENAME))?;
@@ -339,9 +309,8 @@ impl KmerPartition {
if self.writers[partition].is_none() {
let dir = self.root_path.join(format!("part_{:05}", partition));
fs::create_dir_all(&dir)?;
let ext = format_ext(self.format);
let file_path = dir.join(format!("raw.{ext}"));
let writer = SKFileWriter::create_with(file_path, self.format, self.level)?;
let file_path = dir.join(format!("raw.{SK_EXT}"));
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
self.writers[partition] = Some(writer);
}
Ok(self.writers[partition].as_mut().unwrap())
@@ -408,34 +377,19 @@ fn level_from_u32(n: u32) -> Level {
}
}
/// Maps a compression `Format` to the file-name extension used for the
/// files handed to `SKFileWriter`.
///
/// Every extension starts with `skmer`; compressed formats append their usual
/// suffix (`.gz`, `.bz2`, `.xz`, `.zst`) and `Format::No` yields bare `skmer`.
/// The result has no leading dot: callers interpolate it as `raw.{ext}`,
/// `dereplicated.{ext}`, etc. The `Format::Zstd` value equals `SK_EXT`.
fn format_ext(format: Format) -> &'static str {
match format {
Format::Gzip => "skmer.gz",
Format::Bzip => "skmer.bz2",
Format::Lzma => "skmer.xz",
Format::Zstd => "skmer.zst",
Format::No => "skmer",
}
}
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
/// Deduplicate one partition directory in place (two-phase split + merge).
fn dereplicate_partition(
dir: &Path,
ext: &str,
format: Format,
level: Level,
n_temp: usize,
) -> SKResult<()> {
let raw_path = dir.join(format!("raw.{ext}"));
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
let raw_path = dir.join(format!("raw.{SK_EXT}"));
if !raw_path.exists() {
return Ok(());
}
let out_path = dir.join(format!("dereplicated.{ext}"));
let mut writer = SKFileWriter::create_with(&out_path, format, level)?;
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
if n_temp == 1 {
// ── Direct path: partition fits in memory, no split needed ────────────
@@ -446,13 +400,13 @@ fn dereplicate_partition(
// ── Phase 1: split raw file into temp buckets ─────────────────────────
let temp_mask = (n_temp as u64) - 1;
let temp_paths: Vec<PathBuf> = (0..n_temp)
.map(|j| dir.join(format!("temp_{j:04}.{ext}")))
.map(|j| dir.join(format!("temp_{j:04}.{SK_EXT}")))
.collect();
{
let mut writers: Vec<SKFileWriter> = temp_paths
.iter()
.map(|p| SKFileWriter::create_with(p, format, level))
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
.collect::<SKResult<_>>()?;
let mut reader = SKFileReader::open(&raw_path)?;
@@ -530,12 +484,8 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
let mut reader = SKFileReader::open(dedup_path)?;
while let Some(sk) = reader.read()? {
pass1_superkmers += 1;
let seql = sk.seql();
if seql < k {
continue;
}
for pos in 0..=(seql - k) {
seen.insert(sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k));
for kmer in sk.iter_canonical_kmers(k) {
seen.insert(kmer);
}
}
}
@@ -583,8 +533,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
continue;
}
pass2_count_sum += sk_count as u64;
for pos in 0..=(seql - k) {
let kmer = sk.kmer(pos, k).map_err(io::Error::other)?.canonical(k);
for kmer in sk.iter_canonical_kmers(k) {
if let Some(idx) = mphf.get(&kmer) {
counts[idx as usize] = counts[idx as usize].saturating_add(sk_count);
pass2_kmer_hits += 1;