feat: add seq_hash() and refactor canonical hashing
Introduce `.seq_hash(&self)` for direct XXH3-64 hashing of packed bytes, and remove the legacy `.hash()` method that used conditional canonicalization via revcomp. Also update partitioning logic to use `sk.seq_hash()` and deduplicate imports.
This commit is contained in:
@@ -14,8 +14,8 @@ use remove_dir_all::remove_dir_all;
|
||||
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -124,7 +124,8 @@ impl KmerPartition {
|
||||
/// Route and write one super-kmer to its partition file.
|
||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let partition =
|
||||
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)
|
||||
}
|
||||
@@ -133,7 +134,8 @@ impl KmerPartition {
|
||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for rsk in rsks {
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let partition =
|
||||
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)?;
|
||||
}
|
||||
@@ -441,9 +443,8 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let bucket = (sk.hash() & temp_mask) as usize;
|
||||
while let Some(sk) = reader.read()? {
|
||||
let bucket = (sk.seq_hash() & temp_mask) as usize;
|
||||
writers[bucket].write(&sk)?;
|
||||
}
|
||||
for w in &mut writers {
|
||||
@@ -473,8 +474,7 @@ fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
.unwrap_or(0);
|
||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||
let mut reader = SKFileReader::open(path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
while let Some(sk) = reader.read()? {
|
||||
let count = sk.count() as u64;
|
||||
*map.entry(sk).or_insert(0) += count;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user