diff --git a/src/obikmer/src/cmd/superkmer.rs b/src/obikmer/src/cmd/superkmer.rs index a591589..473c67a 100644 --- a/src/obikmer/src/cmd/superkmer.rs +++ b/src/obikmer/src/cmd/superkmer.rs @@ -25,7 +25,7 @@ fn write_batch( let partition_mask = (1u64 << partition_bits) - 1; for rsk in batch { let minimizer = *rsk.minimizer(); - let partition = (minimizer.hash(m) & partition_mask) as usize; + let partition = (minimizer.seq_hash(m) & partition_mask) as usize; write_scatter(rsk.superkmer(), out, k, m, partition, minimizer)?; } Ok(()) diff --git a/src/obikpartitionner/src/partition.rs b/src/obikpartitionner/src/partition.rs index 986d1c1..c1fc019 100644 --- a/src/obikpartitionner/src/partition.rs +++ b/src/obikpartitionner/src/partition.rs @@ -14,8 +14,8 @@ use remove_dir_all::remove_dir_all; use niffler::Level; use niffler::send::compression::Format; -use obikseq::superkmer::SuperKmer; use obikseq::RoutableSuperKmer; +use obikseq::superkmer::SuperKmer; use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -124,7 +124,8 @@ impl KmerPartition { /// Route and write one super-kmer to its partition file. pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> { self.check_not_closed()?; - let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize; + let partition = + (rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize; let sk = rsk.into_superkmer(); self.ensure_writer(partition)?.write(&sk) } @@ -133,7 +134,8 @@ impl KmerPartition { pub fn write_batch(&mut self, rsks: Vec) -> SKResult<()> { self.check_not_closed()?; for rsk in rsks { - let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize; + let partition = + (rsk.minimizer().seq_hash(self.minimizer_size) & self.partitions_mask) as usize; let sk = rsk.into_superkmer(); self.ensure_writer(partition)?.write(&sk)?; } @@ -441,9 +443,8 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> S .collect::>()?; let mut reader = SKFileReader::open(&raw_path, k)?; - while let Some(mut sk) = reader.read()? { - sk.canonical(); - let bucket = (sk.hash() & temp_mask) as usize; + while let Some(sk) = reader.read()? { + let bucket = (sk.seq_hash() & temp_mask) as usize; writers[bucket].write(&sk)?; } for w in &mut writers { @@ -473,8 +474,7 @@ fn load_bucket(path: &Path, k: usize) -> SKResult> { .unwrap_or(0); let mut map: HashMap = HashMap::with_capacity(capacity); let mut reader = SKFileReader::open(path, k)?; - while let Some(mut sk) = reader.read()? { - sk.canonical(); + while let Some(sk) = reader.read()? { let count = sk.count() as u64; *map.entry(sk).or_insert(0) += count; } diff --git a/src/obikseq/src/kmer.rs b/src/obikseq/src/kmer.rs index bcd14b5..122cfd2 100644 --- a/src/obikseq/src/kmer.rs +++ b/src/obikseq/src/kmer.rs @@ -160,7 +160,7 @@ impl Kmer { /// /// Uses the canonical form of the kmer to compute the hash. #[inline] - pub fn hash(&self, k: usize) -> u64 { + pub fn seq_hash(&self, k: usize) -> u64 { mix64(self.canonical(k).0) } diff --git a/src/obikseq/src/sequence.rs b/src/obikseq/src/sequence.rs index f9f2287..ec23511 100644 --- a/src/obikseq/src/sequence.rs +++ b/src/obikseq/src/sequence.rs @@ -1,5 +1,5 @@ pub trait Sequence { - fn len(&self) -> usize; fn sequence(&self) -> &[u8]; - fn revcomp(&self) -> Self; + fn canonical(&self) -> Self; + fn seq_hash(&self) -> u64; } diff --git a/src/obikseq/src/superkmer.rs b/src/obikseq/src/superkmer.rs index b0597a5..119bd68 100644 --- a/src/obikseq/src/superkmer.rs +++ b/src/obikseq/src/superkmer.rs @@ -155,7 +155,9 @@ impl SuperKmer { } /// Reverse-complement this super-kmer in place. - pub fn revcomp(&mut self) { + /// + /// This method is only used internally by the build method. + fn revcomp(&mut self) { let seql = self.len(); let n = byte_len(seql); @@ -280,7 +282,7 @@ impl SuperKmer { /// Put this super-kmer in canonical form (lexicographic minimum of forward and revcomp). /// /// Returns `true` if already canonical (no change), `false` if revcomp was applied. - pub fn canonical(&mut self) -> bool { + fn canonical(&mut self) -> bool { if self.is_canonical() { return true; } @@ -289,7 +291,7 @@ impl SuperKmer { } /// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp). - pub fn is_canonical(&self) -> bool { + fn is_canonical(&self) -> bool { let seql = self.len(); for i in 0..seql { let fwd = self.nucleotide(i); @@ -314,15 +316,9 @@ impl SuperKmer { self.iter_kmers(k).map(move |km| km.canonical(k)) } - /// Returns the XXH3 hash of the super-kmer sequence. - pub fn hash(&self) -> u64 { - if self.is_canonical() { - return xxh3_64(&self.seq); - } else { - let mut rev = self.clone(); - rev.revcomp(); - return xxh3_64(&rev.seq); - } + /// Returns the XXH3-64 hash of the packed sequence bytes. + pub fn seq_hash(&self) -> u64 { + xxh3_64(&self.seq) } }