mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
298 lines
5.5 KiB
Go
298 lines
5.5 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"os"
|
|
"sort"
|
|
"unsafe"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
"github.com/schollz/progressbar/v3"
|
|
)
|
|
|
|
type KmerMap[T obifp.FPUint[T]] struct {
|
|
index map[T][]*obiseq.BioSequence
|
|
Kmersize uint
|
|
kmermask T
|
|
|
|
leftMask T
|
|
rightMask T
|
|
sparseMask T
|
|
|
|
SparseAt int
|
|
}
|
|
|
|
type KmerMatch map[*obiseq.BioSequence]int
|
|
|
|
func (k *KmerMap[T]) KmerSize() uint {
|
|
return k.Kmersize
|
|
}
|
|
|
|
func (k *KmerMap[T]) Len() int {
|
|
return len(k.index)
|
|
}
|
|
|
|
func (k *KmerMap[T]) KmerAsString(kmer T) string {
|
|
buff := make([]byte, k.Kmersize)
|
|
ks := int(k.Kmersize)
|
|
|
|
if k.SparseAt >= 0 {
|
|
ks--
|
|
}
|
|
|
|
for i, j := 0, int(k.Kmersize)-1; i < ks; i++ {
|
|
code := kmer.And(obifp.From64[T](3)).AsUint64()
|
|
buff[j] = decode[code]
|
|
j--
|
|
if k.SparseAt >= 0 && j == k.SparseAt {
|
|
buff[j] = '#'
|
|
j--
|
|
}
|
|
kmer = kmer.RightShift(2)
|
|
}
|
|
|
|
return string(buff)
|
|
}
|
|
|
|
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
|
|
|
|
if sequence.Len() < int(k.Kmersize) {
|
|
return nil
|
|
}
|
|
|
|
makeSparseAt := func(kmer T) T {
|
|
|
|
if k.SparseAt == -1 {
|
|
return kmer
|
|
}
|
|
|
|
return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask))
|
|
}
|
|
|
|
normalizedKmer := func(fw, rv T) T {
|
|
|
|
fw = makeSparseAt(fw)
|
|
rv = makeSparseAt(rv)
|
|
|
|
if fw.LessThan(rv) {
|
|
return fw
|
|
}
|
|
|
|
return rv
|
|
}
|
|
|
|
current := obifp.ZeroUint[T]()
|
|
ccurrent := obifp.ZeroUint[T]()
|
|
lshift := uint(2 * (k.Kmersize - 1))
|
|
|
|
sup := sequence.Len() - int(k.Kmersize) + 1
|
|
|
|
var rep []T
|
|
if buff == nil {
|
|
rep = make([]T, 0, sup)
|
|
} else {
|
|
rep = (*buff)[:0]
|
|
}
|
|
|
|
nuc := sequence.Sequence()
|
|
|
|
size := 0
|
|
for i := 0; i < len(nuc); i++ {
|
|
current = current.LeftShift(2)
|
|
ccurrent = ccurrent.RightShift(2)
|
|
|
|
code := iupac[nuc[i]]
|
|
ccode := iupac[revcompnuc[nuc[i]]]
|
|
|
|
if len(code) != 1 {
|
|
current = obifp.ZeroUint[T]()
|
|
ccurrent = obifp.ZeroUint[T]()
|
|
size = 0
|
|
continue
|
|
}
|
|
|
|
current = current.Or(obifp.From64[T](uint64(code[0])))
|
|
ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift))
|
|
|
|
size++
|
|
|
|
if size == int(k.Kmersize) {
|
|
|
|
kmer := normalizedKmer(current, ccurrent)
|
|
rep = append(rep, kmer)
|
|
size--
|
|
}
|
|
|
|
}
|
|
|
|
return rep
|
|
}
|
|
|
|
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) {
|
|
maxoccurs := -1
|
|
if len(maxocc) > 0 {
|
|
maxoccurs = maxocc[0]
|
|
}
|
|
kmers := k.NormalizedKmerSlice(sequence, nil)
|
|
for _, kmer := range kmers {
|
|
seqs := k.index[kmer]
|
|
if maxoccurs == -1 || len(seqs) <= maxoccurs {
|
|
seqs = append(seqs, sequence)
|
|
k.index[kmer] = seqs
|
|
}
|
|
}
|
|
}
|
|
|
|
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
|
|
kmers := k.NormalizedKmerSlice(sequence, nil)
|
|
seqs := make([]*obiseq.BioSequence, 0)
|
|
rep := make(KmerMatch)
|
|
|
|
for _, kmer := range kmers {
|
|
if candidates, ok := k.index[kmer]; ok {
|
|
seqs = append(seqs, candidates...)
|
|
}
|
|
}
|
|
|
|
sort.Slice(seqs,
|
|
func(i, j int) bool {
|
|
return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j]))
|
|
})
|
|
|
|
prevseq := (*obiseq.BioSequence)(nil)
|
|
n := 0
|
|
for _, seq := range seqs {
|
|
|
|
if seq != prevseq {
|
|
if prevseq != nil && prevseq != sequence {
|
|
rep[prevseq] = n
|
|
}
|
|
n = 1
|
|
prevseq = seq
|
|
}
|
|
|
|
n++
|
|
}
|
|
|
|
if prevseq != nil {
|
|
rep[prevseq] = n
|
|
}
|
|
|
|
return rep
|
|
}
|
|
|
|
func (k *KmerMatch) FilterMinCount(mincount int) {
|
|
for seq, count := range *k {
|
|
if count < mincount {
|
|
delete(*k, seq)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (k *KmerMatch) Len() int {
|
|
return len(*k)
|
|
}
|
|
|
|
func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice {
|
|
ks := make([]*obiseq.BioSequence, 0, len(*k))
|
|
|
|
for seq := range *k {
|
|
ks = append(ks, seq)
|
|
}
|
|
|
|
return ks
|
|
}
|
|
|
|
func (k *KmerMatch) Max() *obiseq.BioSequence {
|
|
max := 0
|
|
var maxseq *obiseq.BioSequence
|
|
for seq, n := range *k {
|
|
if max < n {
|
|
max = n
|
|
maxseq = seq
|
|
}
|
|
}
|
|
return maxseq
|
|
}
|
|
|
|
func NewKmerMap[T obifp.FPUint[T]](
|
|
sequences obiseq.BioSequenceSlice,
|
|
kmersize uint,
|
|
sparse bool,
|
|
maxoccurs int) *KmerMap[T] {
|
|
idx := make(map[T][]*obiseq.BioSequence)
|
|
|
|
sparseAt := -1
|
|
|
|
if sparse && kmersize%2 == 0 {
|
|
obilog.Warnf("Kmer size must be odd when using sparse mode")
|
|
kmersize++
|
|
}
|
|
|
|
if !sparse && kmersize%2 == 1 {
|
|
obilog.Warnf("Kmer size must be even when not using sparse mode")
|
|
kmersize--
|
|
|
|
}
|
|
|
|
if sparse {
|
|
sparseAt = int(kmersize / 2)
|
|
}
|
|
|
|
kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]())
|
|
leftMask := obifp.ZeroUint[T]()
|
|
rightMask := obifp.ZeroUint[T]()
|
|
|
|
if sparseAt >= 0 {
|
|
if sparseAt >= int(kmersize) {
|
|
sparseAt = -1
|
|
} else {
|
|
pos := kmersize - 1 - uint(sparseAt)
|
|
left := uint(sparseAt) * 2
|
|
right := pos * 2
|
|
|
|
leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2)
|
|
rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]())
|
|
}
|
|
}
|
|
|
|
kmap := &KmerMap[T]{
|
|
Kmersize: kmersize,
|
|
kmermask: kmermask,
|
|
leftMask: leftMask,
|
|
rightMask: rightMask,
|
|
index: idx,
|
|
SparseAt: sparseAt,
|
|
}
|
|
|
|
n := len(sequences)
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowCount(),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetDescription("Indexing kmers"),
|
|
)
|
|
|
|
bar := progressbar.NewOptions(n, pbopt...)
|
|
|
|
for i, sequence := range sequences {
|
|
kmap.Push(sequence, maxoccurs)
|
|
if i%100 == 0 {
|
|
bar.Add(100)
|
|
}
|
|
}
|
|
|
|
if maxoccurs >= 0 {
|
|
for k, s := range kmap.index {
|
|
if len(s) >= maxoccurs {
|
|
delete(kmap.index, k)
|
|
}
|
|
}
|
|
}
|
|
|
|
return kmap
|
|
}
|