mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Patch a bug on writing to stdout, and add a clearer error on opening data files
This commit is contained in:
@ -3,98 +3,147 @@ package obikmer
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type KmerMap struct {
|
||||
index map[KmerIdx64][]*obiseq.BioSequence
|
||||
kmersize int
|
||||
kmermask KmerIdx64
|
||||
type KmerMap[T obifp.FPUint[T]] struct {
|
||||
index map[T][]*obiseq.BioSequence
|
||||
Kmersize uint
|
||||
kmermask T
|
||||
|
||||
leftMask T
|
||||
rightMask T
|
||||
sparseMask T
|
||||
|
||||
SparseAt int
|
||||
}
|
||||
|
||||
type KmerMatch map[*obiseq.BioSequence]int
|
||||
|
||||
func (k *KmerMap) KmerSize() int {
|
||||
return k.kmersize
|
||||
func (k *KmerMap[T]) KmerSize() uint {
|
||||
return k.Kmersize
|
||||
}
|
||||
|
||||
func (k *KmerMap) Len() int {
|
||||
func (k *KmerMap[T]) Len() int {
|
||||
return len(k.index)
|
||||
}
|
||||
|
||||
func (k *KmerMap) Push(sequence *obiseq.BioSequence) {
|
||||
current := KmerIdx64(0)
|
||||
ccurrent := KmerIdx64(0)
|
||||
lshift := uint(2 * (k.kmersize - 1))
|
||||
func (k *KmerMap[T]) KmerAsString(kmer T) string {
|
||||
buff := make([]byte, k.Kmersize)
|
||||
ks := int(k.Kmersize)
|
||||
|
||||
if k.SparseAt >= 0 {
|
||||
ks--
|
||||
}
|
||||
|
||||
for i, j := 0, int(k.Kmersize)-1; i < ks; i++ {
|
||||
code := kmer.And(obifp.From64[T](3)).AsUint64()
|
||||
buff[j] = decode[code]
|
||||
j--
|
||||
if k.SparseAt >= 0 && j == k.SparseAt {
|
||||
buff[j] = '#'
|
||||
j--
|
||||
}
|
||||
kmer = kmer.RightShift(2)
|
||||
}
|
||||
|
||||
return string(buff)
|
||||
}
|
||||
|
||||
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
|
||||
|
||||
makeSparseAt := func(kmer T) T {
|
||||
if k.SparseAt == -1 {
|
||||
return kmer
|
||||
}
|
||||
|
||||
return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask))
|
||||
}
|
||||
|
||||
normalizedKmer := func(fw, rv T) T {
|
||||
|
||||
if k.SparseAt >= 0 {
|
||||
fw = makeSparseAt(fw)
|
||||
rv = makeSparseAt(rv)
|
||||
}
|
||||
|
||||
if fw.LessThan(rv) {
|
||||
return fw
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
current := obifp.ZeroUint[T]()
|
||||
ccurrent := obifp.ZeroUint[T]()
|
||||
lshift := uint(2 * (k.Kmersize - 1))
|
||||
|
||||
sup := sequence.Len() - int(k.Kmersize) + 1
|
||||
|
||||
var rep []T
|
||||
if buff == nil {
|
||||
rep = make([]T, 0, sup)
|
||||
} else {
|
||||
rep = (*buff)[:0]
|
||||
}
|
||||
|
||||
nuc := sequence.Sequence()
|
||||
|
||||
size := 0
|
||||
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
|
||||
current <<= 2
|
||||
ccurrent >>= 2
|
||||
for i := 0; i < len(nuc); i++ {
|
||||
current = current.LeftShift(2)
|
||||
ccurrent = ccurrent.RightShift(2)
|
||||
|
||||
code := iupac[nuc[i]]
|
||||
ccode := iupac[revcompnuc[nuc[i]]]
|
||||
|
||||
if len(code) != 1 {
|
||||
current = KmerIdx64(0)
|
||||
ccurrent = KmerIdx64(0)
|
||||
current = obifp.ZeroUint[T]()
|
||||
ccurrent = obifp.ZeroUint[T]()
|
||||
size = 0
|
||||
continue
|
||||
}
|
||||
|
||||
current |= KmerIdx64(code[0])
|
||||
ccurrent |= KmerIdx64(ccode[0]) << lshift
|
||||
current = current.Or(obifp.From64[T](uint64(code[0])))
|
||||
ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift))
|
||||
|
||||
size++
|
||||
|
||||
if size == k.kmersize {
|
||||
if size == int(k.Kmersize) {
|
||||
|
||||
kmer := min(k.kmermask¤t, k.kmermask&ccurrent)
|
||||
k.index[kmer] = append(k.index[kmer], sequence)
|
||||
kmer := normalizedKmer(current, ccurrent)
|
||||
rep = append(rep, kmer)
|
||||
size--
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch {
|
||||
current := KmerIdx64(0)
|
||||
ccurrent := KmerIdx64(0)
|
||||
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) {
|
||||
kmers := k.NormalizedKmerSlice(sequence, nil)
|
||||
for _, kmer := range kmers {
|
||||
k.index[kmer] = append(k.index[kmer], sequence)
|
||||
}
|
||||
}
|
||||
|
||||
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
|
||||
kmers := k.NormalizedKmerSlice(sequence, nil)
|
||||
rep := make(KmerMatch)
|
||||
|
||||
nuc := sequence.Sequence()
|
||||
size := 0
|
||||
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
|
||||
current <<= 2
|
||||
ccurrent >>= 2
|
||||
|
||||
code := iupac[nuc[i]]
|
||||
ccode := iupac[revcompnuc[nuc[i]]]
|
||||
|
||||
if len(code) != 1 {
|
||||
current = KmerIdx64(0)
|
||||
ccurrent = KmerIdx64(0)
|
||||
size = 0
|
||||
continue
|
||||
}
|
||||
|
||||
current |= KmerIdx64(code[0])
|
||||
ccurrent |= KmerIdx64(ccode[0]) << uint(2*(k.kmersize-1))
|
||||
size++
|
||||
|
||||
if size == k.kmersize {
|
||||
kmer := min(k.kmermask&current, k.kmermask&ccurrent)
|
||||
if _, ok := k.index[kmer]; ok {
|
||||
for _, seq := range k.index[kmer] {
|
||||
if seq != sequence {
|
||||
if _, ok := rep[seq]; !ok {
|
||||
rep[seq] = 0
|
||||
}
|
||||
rep[seq]++
|
||||
for _, kmer := range kmers {
|
||||
if _, ok := k.index[kmer]; ok {
|
||||
for _, seq := range k.index[kmer] {
|
||||
if seq != sequence {
|
||||
if _, ok := rep[seq]; !ok {
|
||||
rep[seq] = 0
|
||||
}
|
||||
rep[seq]++
|
||||
}
|
||||
}
|
||||
size--
|
||||
}
|
||||
}
|
||||
|
||||
@ -135,12 +184,54 @@ func (k *KmerMatch) Max() *obiseq.BioSequence {
|
||||
return maxseq
|
||||
}
|
||||
|
||||
func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
|
||||
idx := make(map[KmerIdx64][]*obiseq.BioSequence)
|
||||
func NewKmerMap[T obifp.FPUint[T]](
|
||||
sequences obiseq.BioSequenceSlice,
|
||||
kmersize uint,
|
||||
sparse bool) *KmerMap[T] {
|
||||
idx := make(map[T][]*obiseq.BioSequence)
|
||||
|
||||
kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2)))
|
||||
sparseAt := -1
|
||||
|
||||
kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx}
|
||||
if sparse && kmersize%2 == 0 {
|
||||
log.Warnf("Kmer size must be odd when using sparse mode")
|
||||
kmersize++
|
||||
}
|
||||
|
||||
if !sparse && kmersize%2 == 1 {
|
||||
log.Warnf("Kmer size must be even when not using sparse mode")
|
||||
kmersize--
|
||||
|
||||
}
|
||||
|
||||
if sparse {
|
||||
sparseAt = int(kmersize / 2)
|
||||
}
|
||||
|
||||
kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]())
|
||||
leftMask := obifp.ZeroUint[T]()
|
||||
rightMask := obifp.ZeroUint[T]()
|
||||
|
||||
if sparseAt >= 0 {
|
||||
if sparseAt >= int(kmersize) {
|
||||
sparseAt = -1
|
||||
} else {
|
||||
pos := kmersize - 1 - uint(sparseAt)
|
||||
left := uint(sparseAt) * 2
|
||||
right := pos * 2
|
||||
|
||||
leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2)
|
||||
rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]())
|
||||
}
|
||||
}
|
||||
|
||||
kmap := &KmerMap[T]{
|
||||
Kmersize: kmersize,
|
||||
kmermask: kmermask,
|
||||
leftMask: leftMask,
|
||||
rightMask: rightMask,
|
||||
index: idx,
|
||||
SparseAt: sparseAt,
|
||||
}
|
||||
|
||||
n := len(sequences)
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
@ -163,14 +254,3 @@ func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
|
||||
|
||||
return kmap
|
||||
}
|
||||
|
||||
func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker {
|
||||
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
matches := k.Query(sequence)
|
||||
matches.FilterMinCount(minKmerCount)
|
||||
n := matches.Len()
|
||||
|
||||
sequence.SetAttribute("obikmer_match_count", n)
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user