feat: add seq_hash() and refactor canonical hashing
Introduce `.seq_hash(&self)` for direct XXH3-64 hashing of the packed sequence bytes, and remove the legacy `.hash()` method, which conditionally canonicalized the sequence via revcomp before hashing. Also update the partitioning logic to use `sk.seq_hash()` and deduplicate imports.
This commit is contained in:
@@ -25,7 +25,7 @@ fn write_batch(
|
|||||||
let partition_mask = (1u64 << partition_bits) - 1;
|
let partition_mask = (1u64 << partition_bits) - 1;
|
||||||
for rsk in batch {
|
for rsk in batch {
|
||||||
let minimizer = *rsk.minimizer();
|
let minimizer = *rsk.minimizer();
|
||||||
let partition = (minimizer.hash(m) & partition_mask) as usize;
|
let partition = (minimizer.seq_hash(m) & partition_mask) as usize;
|
||||||
write_scatter(rsk.superkmer(), out, k, m, partition, minimizer)?;
|
write_scatter(rsk.superkmer(), out, k, m, partition, minimizer)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ use remove_dir_all::remove_dir_all;
|
|||||||
|
|
||||||
use niffler::Level;
|
use niffler::Level;
|
||||||
use niffler::send::compression::Format;
|
use niffler::send::compression::Format;
|
||||||
use obikseq::superkmer::SuperKmer;
|
|
||||||
use obikseq::RoutableSuperKmer;
|
use obikseq::RoutableSuperKmer;
|
||||||
|
use obikseq::superkmer::SuperKmer;
|
||||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@@ -124,7 +124,8 @@ impl KmerPartition {
|
|||||||
/// Route and write one super-kmer to its partition file.
|
/// Route and write one super-kmer to its partition file.
|
||||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||||
self.check_not_closed()?;
|
self.check_not_closed()?;
|
||||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
let partition =
|
||||||
|
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||||
let sk = rsk.into_superkmer();
|
let sk = rsk.into_superkmer();
|
||||||
self.ensure_writer(partition)?.write(&sk)
|
self.ensure_writer(partition)?.write(&sk)
|
||||||
}
|
}
|
||||||
@@ -133,7 +134,8 @@ impl KmerPartition {
|
|||||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||||
self.check_not_closed()?;
|
self.check_not_closed()?;
|
||||||
for rsk in rsks {
|
for rsk in rsks {
|
||||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
let partition =
|
||||||
|
(rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||||
let sk = rsk.into_superkmer();
|
let sk = rsk.into_superkmer();
|
||||||
self.ensure_writer(partition)?.write(&sk)?;
|
self.ensure_writer(partition)?.write(&sk)?;
|
||||||
}
|
}
|
||||||
@@ -441,9 +443,8 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S
|
|||||||
.collect::<SKResult<_>>()?;
|
.collect::<SKResult<_>>()?;
|
||||||
|
|
||||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||||
while let Some(mut sk) = reader.read()? {
|
while let Some(sk) = reader.read()? {
|
||||||
sk.canonical();
|
let bucket = (sk.seq_hash() & temp_mask) as usize;
|
||||||
let bucket = (sk.hash() & temp_mask) as usize;
|
|
||||||
writers[bucket].write(&sk)?;
|
writers[bucket].write(&sk)?;
|
||||||
}
|
}
|
||||||
for w in &mut writers {
|
for w in &mut writers {
|
||||||
@@ -473,8 +474,7 @@ fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
|||||||
.unwrap_or(0);
|
.unwrap_or(0);
|
||||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||||
let mut reader = SKFileReader::open(path, k)?;
|
let mut reader = SKFileReader::open(path, k)?;
|
||||||
while let Some(mut sk) = reader.read()? {
|
while let Some(sk) = reader.read()? {
|
||||||
sk.canonical();
|
|
||||||
let count = sk.count() as u64;
|
let count = sk.count() as u64;
|
||||||
*map.entry(sk).or_insert(0) += count;
|
*map.entry(sk).or_insert(0) += count;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ impl Kmer {
|
|||||||
///
|
///
|
||||||
/// Uses the canonical form of the kmer to compute the hash.
|
/// Uses the canonical form of the kmer to compute the hash.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn hash(&self, k: usize) -> u64 {
|
pub fn seq_hash(&self, k: usize) -> u64 {
|
||||||
mix64(self.canonical(k).0)
|
mix64(self.canonical(k).0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
pub trait Sequence {
|
pub trait Sequence {
|
||||||
fn len(&self) -> usize;
|
|
||||||
fn sequence(&self) -> &[u8];
|
fn sequence(&self) -> &[u8];
|
||||||
fn revcomp(&self) -> Self;
|
fn canonical(&self) -> Self;
|
||||||
|
fn seq_hash(&self) -> u64;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -155,7 +155,9 @@ impl SuperKmer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Reverse-complement this super-kmer in place.
|
/// Reverse-complement this super-kmer in place.
|
||||||
pub fn revcomp(&mut self) {
|
///
|
||||||
|
/// This method is only used internally by the build method.
|
||||||
|
fn revcomp(&mut self) {
|
||||||
let seql = self.len();
|
let seql = self.len();
|
||||||
let n = byte_len(seql);
|
let n = byte_len(seql);
|
||||||
|
|
||||||
@@ -280,7 +282,7 @@ impl SuperKmer {
|
|||||||
/// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
/// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp).
|
||||||
///
|
///
|
||||||
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
||||||
pub fn canonical(&mut self) -> bool {
|
fn canonical(&mut self) -> bool {
|
||||||
if self.is_canonical() {
|
if self.is_canonical() {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -289,7 +291,7 @@ impl SuperKmer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
|
/// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
|
||||||
pub fn is_canonical(&self) -> bool {
|
fn is_canonical(&self) -> bool {
|
||||||
let seql = self.len();
|
let seql = self.len();
|
||||||
for i in 0..seql {
|
for i in 0..seql {
|
||||||
let fwd = self.nucleotide(i);
|
let fwd = self.nucleotide(i);
|
||||||
@@ -314,15 +316,9 @@ impl SuperKmer {
|
|||||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the XXH3 hash of the super-kmer sequence.
|
/// Returns the XXH3-64 hash of the packed sequence bytes.
|
||||||
pub fn hash(&self) -> u64 {
|
pub fn seq_hash(&self) -> u64 {
|
||||||
if self.is_canonical() {
|
xxh3_64(&self.seq)
|
||||||
return xxh3_64(&self.seq);
|
|
||||||
} else {
|
|
||||||
let mut rev = self.clone();
|
|
||||||
rev.revcomp();
|
|
||||||
return xxh3_64(&rev.seq);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user