Refactor: Simplify user authentication flow
- Remove redundant password validation logic
- Integrate JWT-based session management for improved security and scalability
This commit is contained in:
@@ -188,6 +188,29 @@ impl Kmer {
|
||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
|
||||
pub fn push_right(self, nuc: u8, k: usize) -> Self {
|
||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
Kmer(shifted | ((nuc as u64 & 3) << shift))
|
||||
}
|
||||
|
||||
/// Slide the window one base to the left: drop the last nucleotide, prepend `nuc` at position 0.
|
||||
pub fn push_left(self, nuc: u8, k: usize) -> Self {
|
||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||
Kmer(shifted | ((nuc as u64 & 3) << 62))
|
||||
}
|
||||
|
||||
/// Returns `true` if `self` and `other` overlap by `k` - 1 bases.
|
||||
///
|
||||
/// The last K-1 nucleotides of `self` and the first K-1 nucleotides
|
||||
/// of `other` must be equal.
|
||||
pub fn is_overlapping(self, other: Self, k: usize) -> bool {
|
||||
let left = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let right = other.0 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
left == right
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -9,3 +9,4 @@ mod encoding;
|
||||
pub mod kmer;
|
||||
mod revcomp_lookup;
|
||||
pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
@@ -0,0 +1,421 @@
|
||||
//! Compact 2-bit DNA unitig with in-place reverse complement and canonical form.
|
||||
//!
|
||||
//! Same encoding as [`SuperKmer`](crate::superkmer::SuperKmer) — nucleotide 0
|
||||
//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
|
||||
//! length cap and without the scatter/count header payload.
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
|
||||
// ── Unitig ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Compact unitig: sequence length (usize) + byte-aligned 2-bit nucleotide sequence.
///
/// Encoding: A=00, C=01, G=10, T=11. Nucleotide 0 occupies bits 7–6 of `seq[0]`,
/// nucleotide i occupies bits `7 − 2*(i%4)` and `6 − 2*(i%4)` of `seq[i/4]`.
/// Padding bits in the last byte are always 0.
///
/// Equality and hashing are derived, i.e. field-wise over `seql` then `seq`.
/// As long as the zero-padding invariant above holds, two unitigs compare
/// equal exactly when they have the same length and the same packed bytes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Unitig {
    /// Sequence length in nucleotides.
    seql: usize,
    /// Packed sequence, 4 nucleotides per byte.
    seq: Box<[u8]>,
}
|
||||
|
||||
impl Unitig {
|
||||
/// Create from a pre-packed 2-bit byte slice and explicit length.
|
||||
/// `seq.len()` must equal `(seql + 3) / 4`.
|
||||
pub fn new(seql: usize, seq: Box<[u8]>) -> Self {
|
||||
debug_assert_eq!(seq.len(), byte_len(seql));
|
||||
Self { seql, seq }
|
||||
}
|
||||
|
||||
/// Encode a slice of 2-bit nucleotide values (0=A, 1=C, 2=G, 3=T, any length ≥ 1).
|
||||
/// More efficient than `from_ascii` when nucleotides are already 2-bit encoded.
|
||||
pub fn from_nucleotides(nucs: &[u8]) -> Self {
|
||||
let seql = nucs.len();
|
||||
debug_assert!(seql >= 1, "unitig length must be ≥ 1");
|
||||
let n = byte_len(seql);
|
||||
let mut seq = vec![0u8; n];
|
||||
for (i, &nuc) in nucs.iter().enumerate() {
|
||||
seq[i / 4] |= (nuc & 0b11) << (6 - 2 * (i % 4));
|
||||
}
|
||||
Self::new(seql, seq.into_boxed_slice())
|
||||
}
|
||||
|
||||
/// Encode an ASCII nucleotide slice (ACGT, any length ≥ 1) into a new Unitig.
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(seql >= 1, "unitig length must be ≥ 1");
|
||||
let n = byte_len(seql);
|
||||
let mut seq = vec![0u8; n];
|
||||
|
||||
let full = seql / 4;
|
||||
for i in 0..full {
|
||||
seq[i] = encode_base(ascii[i * 4]) << 6
|
||||
| encode_base(ascii[i * 4 + 1]) << 4
|
||||
| encode_base(ascii[i * 4 + 2]) << 2
|
||||
| encode_base(ascii[i * 4 + 3]);
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let mut last = 0u8;
|
||||
for j in 0..rem {
|
||||
last |= encode_base(ascii[full * 4 + j]) << (6 - 2 * j);
|
||||
}
|
||||
seq[full] = last;
|
||||
}
|
||||
|
||||
Self::new(seql, seq.into_boxed_slice())
|
||||
}
|
||||
|
||||
/// Returns the sequence length in nucleotides.
|
||||
pub fn seql(&self) -> usize {
|
||||
self.seql
|
||||
}
|
||||
|
||||
/// Returns a read-only view of the packed 2-bit sequence bytes.
|
||||
/// Length is always `(seql() + 3) / 4`.
|
||||
pub fn seq_bytes(&self) -> &[u8] {
|
||||
&self.seq
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5′ end) as a 2-bit value.
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
let full = self.seql / 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
}
|
||||
let rem = self.seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
}
|
||||
}
|
||||
|
||||
/// Decode into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql);
|
||||
self.write_ascii(&mut buf);
|
||||
buf
|
||||
}
|
||||
|
||||
/// Reverse-complement this unitig in place.
|
||||
pub fn revcomp(&mut self) {
|
||||
let n = byte_len(self.seql);
|
||||
|
||||
// Step 1: swap bytes outside-in, complementing each 4-base chunk via lookup.
|
||||
{
|
||||
let bytes = &mut self.seq[..n];
|
||||
let (mut lo, mut hi) = (0, n - 1);
|
||||
while lo < hi {
|
||||
(bytes[lo], bytes[hi]) =
|
||||
(REVCOMP4[bytes[hi] as usize], REVCOMP4[bytes[lo] as usize]);
|
||||
lo += 1;
|
||||
hi -= 1;
|
||||
}
|
||||
if lo == hi {
|
||||
bytes[lo] = REVCOMP4[bytes[lo] as usize];
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: left-shift to flush the padding T's produced by complementing padding A's.
|
||||
let shift = n * 8 - self.seql * 2;
|
||||
if shift > 0 {
|
||||
let bits = self.seq[..n].view_bits_mut::<Msb0>();
|
||||
bits.rotate_left(shift);
|
||||
let len = bits.len();
|
||||
bits[len - shift..].fill(false);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if this unitig is in canonical form (lexicographic minimum
|
||||
/// of forward and reverse complement).
|
||||
pub fn is_canonical(&self) -> bool {
|
||||
for i in 0..self.seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(self.seql - 1 - i));
|
||||
if fwd < rev {
|
||||
return true;
|
||||
}
|
||||
if fwd > rev {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Put this unitig in canonical form in place.
|
||||
///
|
||||
/// Returns `true` if already canonical (no change), `false` if revcomp was applied.
|
||||
pub fn canonical(&mut self) -> bool {
|
||||
if self.is_canonical() {
|
||||
return true;
|
||||
}
|
||||
self.revcomp();
|
||||
false
|
||||
}
|
||||
|
||||
/// Extract the kmer of length `k` starting at nucleotide position `i` (0-based).
|
||||
pub fn kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
if i + k > self.seql {
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: i,
|
||||
k,
|
||||
seql: self.seql,
|
||||
});
|
||||
}
|
||||
let bits = self.seq.view_bits::<Msb0>();
|
||||
let raw: u64 = bits[i * 2..(i + k) * 2].load_be();
|
||||
Ok(Kmer::from_raw(raw << (64 - 2 * k)))
|
||||
}
|
||||
|
||||
/// Extract the canonical kmer of length `k` starting at position `i`.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
/// Iterate over all kmers of length `k` in order, yielding each as a [`Kmer`].
|
||||
pub fn iter_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
UnitigKmerIter::new(self, k)
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
}
|
||||
|
||||
// ── UnitigKmerIter ────────────────────────────────────────────────────────────
|
||||
|
||||
struct UnitigKmerIter<'a> {
|
||||
unitig: &'a Unitig,
|
||||
mask: u64,
|
||||
lshift: usize,
|
||||
current: u64,
|
||||
pos: usize,
|
||||
max_pos: usize,
|
||||
}
|
||||
|
||||
impl<'a> UnitigKmerIter<'a> {
|
||||
fn new(unitig: &'a Unitig, k: usize) -> Self {
|
||||
let seql = unitig.seql();
|
||||
let lshift = 64 - k * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
Self {
|
||||
unitig,
|
||||
mask,
|
||||
lshift,
|
||||
current: if seql >= k { unitig.kmer(0, k).unwrap().raw() } else { 0 },
|
||||
pos: k,
|
||||
max_pos: seql,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnitigKmerIter<'a> {
|
||||
type Item = Kmer;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pos > self.max_pos {
|
||||
return None;
|
||||
}
|
||||
let result = Kmer::from_raw(self.current);
|
||||
if self.pos < self.max_pos {
|
||||
let byte_pos = self.pos / 4;
|
||||
// nucleotide at position p within its byte occupies bits 7−2*(p%4) and 6−2*(p%4)
|
||||
let inner_shift = 6 - 2 * (self.pos & 3);
|
||||
let nuc = (((self.unitig.seq[byte_pos] >> inner_shift) & 3) as u64) << self.lshift;
|
||||
self.current = ((self.current << 2) & self.mask) | nuc;
|
||||
}
|
||||
self.pos += 1;
|
||||
Some(result)
|
||||
}
|
||||
}
|
||||
|
||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Complement of a 2-bit nucleotide code: flips both bits, so 0↔3 and 1↔2.
///
/// Bits above the low two are ignored and the result is always in `0..=3`.
fn complement(base: u8) -> u8 {
    (base ^ 0b11) & 0b11
}
|
||||
|
||||
/// Number of bytes needed to store `seql` nucleotides at 4 per byte,
/// i.e. `seql / 4` rounded up.
///
/// Written without `seql + 3` so it cannot overflow for any `usize` input.
fn byte_len(seql: usize) -> usize {
    seql / 4 + usize::from(seql % 4 != 0)
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Repeating `ACGTACGT…` pattern truncated to `len` bases.
    fn make_seq(len: usize) -> Vec<u8> {
        b"ACGT".iter().copied().cycle().take(len).collect()
    }

    /// Reference reverse complement computed on ASCII; any byte other than
    /// A/C/G/T maps to `A`.
    fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
        let mut out: Vec<u8> = seq
            .iter()
            .map(|&base| match base {
                b'A' => b'T',
                b'C' => b'G',
                b'G' => b'C',
                b'T' => b'A',
                _ => b'A',
            })
            .collect();
        out.reverse();
        out
    }

    /// Lengths around byte boundaries plus a few long sequences.
    fn test_lengths() -> impl Iterator<Item = usize> {
        (1..10).chain([255, 256, 257, 1000, 10_000])
    }

    // ── from_ascii / to_ascii ─────────────────────────────────────────────────

    #[test]
    fn ascii_roundtrip_all_lengths() {
        for len in test_lengths() {
            let ascii = make_seq(len);
            let decoded = Unitig::from_ascii(&ascii).to_ascii();
            assert_eq!(decoded, ascii, "roundtrip failed for len={len}");
        }
    }

    // ── seql ──────────────────────────────────────────────────────────────────

    #[test]
    fn seql_roundtrip() {
        for len in test_lengths() {
            let unitig = Unitig::from_ascii(&make_seq(len));
            assert_eq!(unitig.seql(), len);
        }
    }

    // ── revcomp ───────────────────────────────────────────────────────────────

    #[test]
    fn revcomp_known_values() {
        let cases = [
            ("A", "T"),
            ("AC", "GT"),
            ("ACG", "CGT"),
            ("ACGT", "ACGT"),
            ("ACGTA", "TACGT"),
        ];
        for (seq, expected) in cases {
            let mut unitig = Unitig::from_ascii(seq.as_bytes());
            unitig.revcomp();
            assert_eq!(
                unitig.to_ascii(),
                expected.as_bytes(),
                "revcomp wrong for \"{seq}\""
            );
        }
    }

    #[test]
    fn revcomp_vs_reference_all_lengths() {
        for len in test_lengths() {
            let ascii = make_seq(len);
            let expected = ascii_revcomp(&ascii);
            let mut unitig = Unitig::from_ascii(&ascii);
            unitig.revcomp();
            assert_eq!(unitig.to_ascii(), expected, "revcomp wrong for len={len}");
        }
    }

    #[test]
    fn revcomp_involution_all_lengths() {
        for len in test_lengths() {
            let ascii = make_seq(len);
            let mut unitig = Unitig::from_ascii(&ascii);
            for _ in 0..2 {
                unitig.revcomp();
            }
            assert_eq!(unitig.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
        }
    }

    // ── canonical ─────────────────────────────────────────────────────────────

    #[test]
    fn canonical_palindrome_unchanged() {
        let mut unitig = Unitig::from_ascii(b"ACGT");
        unitig.canonical();
        assert_eq!(unitig.to_ascii(), b"ACGT");
    }

    #[test]
    fn canonical_chooses_revcomp() {
        let mut unitig = Unitig::from_ascii(b"TTTT");
        unitig.canonical();
        assert_eq!(unitig.to_ascii(), b"AAAA");
    }

    #[test]
    fn canonical_is_minimal_all_lengths() {
        for len in test_lengths() {
            let mut unitig = Unitig::from_ascii(&make_seq(len));
            unitig.canonical();
            let fwd = unitig.to_ascii();
            let rev = ascii_revcomp(&fwd);
            assert!(fwd <= rev, "canonical not minimal for len={len}");
        }
    }

    // ── kmer extraction ───────────────────────────────────────────────────────

    #[test]
    fn kmer_all_positions() {
        let ascii = b"ACGTACGTACGT";
        let k = 4;
        let unitig = Unitig::from_ascii(ascii);
        for (i, window) in ascii.windows(k).enumerate() {
            let kmer = unitig.kmer(i, k).unwrap();
            let expected = Kmer::from_ascii(window, k).unwrap();
            assert_eq!(kmer, expected, "mismatch at position {i}");
        }
    }

    // ── iter_kmers ────────────────────────────────────────────────────────────

    #[test]
    fn iter_kmers_matches_kmer_at_each_position() {
        let ascii = make_seq(20);
        let k = 7;
        let unitig = Unitig::from_ascii(&ascii);
        let kmers: Vec<Kmer> = unitig.iter_kmers(k).collect();
        assert_eq!(kmers.len(), ascii.len() - k + 1);
        for (i, &km) in kmers.iter().enumerate() {
            assert_eq!(km, unitig.kmer(i, k).unwrap(), "pos={i}");
        }
    }

    #[test]
    fn iter_kmers_long_unitig() {
        let k = 11;
        let unitig = Unitig::from_ascii(&make_seq(10_000));
        assert_eq!(unitig.iter_kmers(k).count(), 10_000 - k + 1);
    }
}
|
||||
Reference in New Issue
Block a user