mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Fix super k-mer minimizer bijection and add validation test
This commit addresses a bug in the super k-mer implementation where the minimizer bijection property was not properly enforced. The fix ensures that: 1. All k-mers within a super k-mer share the same minimizer 2. Identical super k-mer sequences have the same minimizer The changes include: - Fixing the super k-mer iteration logic to properly validate the minimizer bijection property - Adding a comprehensive test suite (TestSuperKmerMinimizerBijection) that validates the intrinsic property of super k-mers - Updating the .gitignore file to properly track relevant files This resolves issues where the same sequence could be associated with different minimizers, violating the super k-mer definition.
This commit is contained in:
@@ -77,6 +77,122 @@ func TestIterSuperKmersVsSlice(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestSuperKmerMinimizerBijection validates the intrinsic property that
|
||||
// a super k-mer sequence has one and only one minimizer (bijection property).
|
||||
// This test ensures that:
|
||||
// 1. All k-mers in a super k-mer share the same minimizer
|
||||
// 2. Two identical super k-mer sequences must have the same minimizer
|
||||
func TestSuperKmerMinimizerBijection(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
seq []byte
|
||||
k int
|
||||
m int
|
||||
}{
|
||||
{
|
||||
name: "simple sequence",
|
||||
seq: []byte("ACGTACGTACGTACGTACGTACGTACGTACGT"),
|
||||
k: 21,
|
||||
m: 11,
|
||||
},
|
||||
{
|
||||
name: "homopolymer blocks",
|
||||
seq: []byte("AAAACCCCGGGGTTTTAAAACCCCGGGGTTTT"),
|
||||
k: 21,
|
||||
m: 11,
|
||||
},
|
||||
{
|
||||
name: "complex sequence",
|
||||
seq: []byte("ATCGATCGATCGATCGATCGATCGATCGATCG"),
|
||||
k: 15,
|
||||
m: 7,
|
||||
},
|
||||
{
|
||||
name: "longer sequence",
|
||||
seq: []byte("ACGTACGTGGGGAAAAACGTACGTTTTTCCCCACGTACGT"),
|
||||
k: 13,
|
||||
m: 7,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// Map to track sequence -> minimizer
|
||||
seqToMinimizer := make(map[string]uint64)
|
||||
|
||||
for sk := range IterSuperKmers(tc.seq, tc.k, tc.m) {
|
||||
seqStr := string(sk.Sequence)
|
||||
|
||||
// Check if we've seen this sequence before
|
||||
if prevMinimizer, exists := seqToMinimizer[seqStr]; exists {
|
||||
if prevMinimizer != sk.Minimizer {
|
||||
t.Errorf("BIJECTION VIOLATION: sequence %s has two different minimizers:\n"+
|
||||
" First: %d\n"+
|
||||
" Second: %d\n"+
|
||||
" This violates the super k-mer definition!",
|
||||
seqStr, prevMinimizer, sk.Minimizer)
|
||||
}
|
||||
} else {
|
||||
seqToMinimizer[seqStr] = sk.Minimizer
|
||||
}
|
||||
|
||||
// Verify all k-mers in this super k-mer have the same minimizer
|
||||
if len(sk.Sequence) >= tc.k {
|
||||
for i := 0; i <= len(sk.Sequence)-tc.k; i++ {
|
||||
kmerSeq := sk.Sequence[i : i+tc.k]
|
||||
minimizer := findMinimizer(kmerSeq, tc.k, tc.m)
|
||||
if minimizer != sk.Minimizer {
|
||||
t.Errorf("K-mer at position %d in super k-mer has different minimizer:\n"+
|
||||
" K-mer: %s\n"+
|
||||
" Expected minimizer: %d\n"+
|
||||
" Actual minimizer: %d\n"+
|
||||
" Super k-mer: %s",
|
||||
i, string(kmerSeq), sk.Minimizer, minimizer, seqStr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// findMinimizer computes the minimizer of a k-mer for testing purposes
|
||||
func findMinimizer(kmer []byte, k int, m int) uint64 {
|
||||
if len(kmer) != k {
|
||||
return 0
|
||||
}
|
||||
|
||||
mMask := uint64(1)<<(m*2) - 1
|
||||
rcShift := uint((m - 1) * 2)
|
||||
|
||||
minMinimizer := uint64(^uint64(0)) // max uint64
|
||||
|
||||
// Scan all m-mers in the k-mer
|
||||
var fwdMmer, rvcMmer uint64
|
||||
for i := 0; i < m-1 && i < len(kmer); i++ {
|
||||
code := uint64(__single_base_code__[kmer[i]&31])
|
||||
fwdMmer = (fwdMmer << 2) | code
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
}
|
||||
|
||||
for i := m - 1; i < len(kmer); i++ {
|
||||
code := uint64(__single_base_code__[kmer[i]&31])
|
||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
|
||||
canonical := fwdMmer
|
||||
if rvcMmer < fwdMmer {
|
||||
canonical = rvcMmer
|
||||
}
|
||||
|
||||
if canonical < minMinimizer {
|
||||
minMinimizer = canonical
|
||||
}
|
||||
}
|
||||
|
||||
return minMinimizer
|
||||
}
|
||||
|
||||
// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
|
||||
// integration test package to avoid circular dependencies between
|
||||
// obikmer and obiseq packages.
|
||||
|
||||
Reference in New Issue
Block a user