mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
This commit introduces a new configuration module `obidefault` to manage progress bar settings, allowing users to disable progress bars via a `--no-progressbar` option. It updates various packages to conditionally display progress bars based on this new configuration, improving user experience by providing control over progress bar output. The changes also include improvements to progress bar handling in several packages, ensuring they are only displayed when appropriate (e.g., when stderr is a terminal and stdout is not piped).
302 lines
5.6 KiB
Go
302 lines
5.6 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"os"
|
|
"sort"
|
|
"unsafe"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
"github.com/schollz/progressbar/v3"
|
|
)
|
|
|
|
type KmerMap[T obifp.FPUint[T]] struct {
|
|
index map[T][]*obiseq.BioSequence
|
|
Kmersize uint
|
|
kmermask T
|
|
|
|
leftMask T
|
|
rightMask T
|
|
sparseMask T
|
|
|
|
SparseAt int
|
|
}
|
|
|
|
type KmerMatch map[*obiseq.BioSequence]int
|
|
|
|
func (k *KmerMap[T]) KmerSize() uint {
|
|
return k.Kmersize
|
|
}
|
|
|
|
func (k *KmerMap[T]) Len() int {
|
|
return len(k.index)
|
|
}
|
|
|
|
func (k *KmerMap[T]) KmerAsString(kmer T) string {
|
|
buff := make([]byte, k.Kmersize)
|
|
ks := int(k.Kmersize)
|
|
|
|
if k.SparseAt >= 0 {
|
|
ks--
|
|
}
|
|
|
|
for i, j := 0, int(k.Kmersize)-1; i < ks; i++ {
|
|
code := kmer.And(obifp.From64[T](3)).AsUint64()
|
|
buff[j] = decode[code]
|
|
j--
|
|
if k.SparseAt >= 0 && j == k.SparseAt {
|
|
buff[j] = '#'
|
|
j--
|
|
}
|
|
kmer = kmer.RightShift(2)
|
|
}
|
|
|
|
return string(buff)
|
|
}
|
|
|
|
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
|
|
|
|
if sequence.Len() < int(k.Kmersize) {
|
|
return nil
|
|
}
|
|
|
|
makeSparseAt := func(kmer T) T {
|
|
|
|
if k.SparseAt == -1 {
|
|
return kmer
|
|
}
|
|
|
|
return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask))
|
|
}
|
|
|
|
normalizedKmer := func(fw, rv T) T {
|
|
|
|
fw = makeSparseAt(fw)
|
|
rv = makeSparseAt(rv)
|
|
|
|
if fw.LessThan(rv) {
|
|
return fw
|
|
}
|
|
|
|
return rv
|
|
}
|
|
|
|
current := obifp.ZeroUint[T]()
|
|
ccurrent := obifp.ZeroUint[T]()
|
|
lshift := uint(2 * (k.Kmersize - 1))
|
|
|
|
sup := sequence.Len() - int(k.Kmersize) + 1
|
|
|
|
var rep []T
|
|
if buff == nil {
|
|
rep = make([]T, 0, sup)
|
|
} else {
|
|
rep = (*buff)[:0]
|
|
}
|
|
|
|
nuc := sequence.Sequence()
|
|
|
|
size := 0
|
|
for i := 0; i < len(nuc); i++ {
|
|
current = current.LeftShift(2)
|
|
ccurrent = ccurrent.RightShift(2)
|
|
|
|
code := iupac[nuc[i]]
|
|
ccode := iupac[revcompnuc[nuc[i]]]
|
|
|
|
if len(code) != 1 {
|
|
current = obifp.ZeroUint[T]()
|
|
ccurrent = obifp.ZeroUint[T]()
|
|
size = 0
|
|
continue
|
|
}
|
|
|
|
current = current.Or(obifp.From64[T](uint64(code[0])))
|
|
ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift))
|
|
|
|
size++
|
|
|
|
if size == int(k.Kmersize) {
|
|
|
|
kmer := normalizedKmer(current, ccurrent)
|
|
rep = append(rep, kmer)
|
|
size--
|
|
}
|
|
|
|
}
|
|
|
|
return rep
|
|
}
|
|
|
|
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) {
|
|
maxoccurs := -1
|
|
if len(maxocc) > 0 {
|
|
maxoccurs = maxocc[0]
|
|
}
|
|
kmers := k.NormalizedKmerSlice(sequence, nil)
|
|
for _, kmer := range kmers {
|
|
seqs := k.index[kmer]
|
|
if maxoccurs == -1 || len(seqs) <= maxoccurs {
|
|
seqs = append(seqs, sequence)
|
|
k.index[kmer] = seqs
|
|
}
|
|
}
|
|
}
|
|
|
|
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
|
|
kmers := k.NormalizedKmerSlice(sequence, nil)
|
|
seqs := make([]*obiseq.BioSequence, 0)
|
|
rep := make(KmerMatch)
|
|
|
|
for _, kmer := range kmers {
|
|
if candidates, ok := k.index[kmer]; ok {
|
|
seqs = append(seqs, candidates...)
|
|
}
|
|
}
|
|
|
|
sort.Slice(seqs,
|
|
func(i, j int) bool {
|
|
return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j]))
|
|
})
|
|
|
|
prevseq := (*obiseq.BioSequence)(nil)
|
|
n := 0
|
|
for _, seq := range seqs {
|
|
|
|
if seq != prevseq {
|
|
if prevseq != nil && prevseq != sequence {
|
|
rep[prevseq] = n
|
|
}
|
|
n = 1
|
|
prevseq = seq
|
|
}
|
|
|
|
n++
|
|
}
|
|
|
|
if prevseq != nil {
|
|
rep[prevseq] = n
|
|
}
|
|
|
|
return rep
|
|
}
|
|
|
|
func (k *KmerMatch) FilterMinCount(mincount int) {
|
|
for seq, count := range *k {
|
|
if count < mincount {
|
|
delete(*k, seq)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (k *KmerMatch) Len() int {
|
|
return len(*k)
|
|
}
|
|
|
|
func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice {
|
|
ks := make([]*obiseq.BioSequence, 0, len(*k))
|
|
|
|
for seq := range *k {
|
|
ks = append(ks, seq)
|
|
}
|
|
|
|
return ks
|
|
}
|
|
|
|
func (k *KmerMatch) Max() *obiseq.BioSequence {
|
|
max := 0
|
|
var maxseq *obiseq.BioSequence
|
|
for seq, n := range *k {
|
|
if max < n {
|
|
max = n
|
|
maxseq = seq
|
|
}
|
|
}
|
|
return maxseq
|
|
}
|
|
|
|
func NewKmerMap[T obifp.FPUint[T]](
|
|
sequences obiseq.BioSequenceSlice,
|
|
kmersize uint,
|
|
sparse bool,
|
|
maxoccurs int) *KmerMap[T] {
|
|
idx := make(map[T][]*obiseq.BioSequence)
|
|
|
|
sparseAt := -1
|
|
|
|
if sparse && kmersize%2 == 0 {
|
|
obilog.Warnf("Kmer size must be odd when using sparse mode")
|
|
kmersize++
|
|
}
|
|
|
|
if !sparse && kmersize%2 == 1 {
|
|
obilog.Warnf("Kmer size must be even when not using sparse mode")
|
|
kmersize--
|
|
|
|
}
|
|
|
|
if sparse {
|
|
sparseAt = int(kmersize / 2)
|
|
}
|
|
|
|
kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]())
|
|
leftMask := obifp.ZeroUint[T]()
|
|
rightMask := obifp.ZeroUint[T]()
|
|
|
|
if sparseAt >= 0 {
|
|
if sparseAt >= int(kmersize) {
|
|
sparseAt = -1
|
|
} else {
|
|
pos := kmersize - 1 - uint(sparseAt)
|
|
left := uint(sparseAt) * 2
|
|
right := pos * 2
|
|
|
|
leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2)
|
|
rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]())
|
|
}
|
|
}
|
|
|
|
kmap := &KmerMap[T]{
|
|
Kmersize: kmersize,
|
|
kmermask: kmermask,
|
|
leftMask: leftMask,
|
|
rightMask: rightMask,
|
|
index: idx,
|
|
SparseAt: sparseAt,
|
|
}
|
|
|
|
n := len(sequences)
|
|
var bar *progressbar.ProgressBar
|
|
if obidefault.ProgressBar() {
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowCount(),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetDescription("Indexing kmers"),
|
|
)
|
|
|
|
bar = progressbar.NewOptions(n, pbopt...)
|
|
}
|
|
|
|
for i, sequence := range sequences {
|
|
kmap.Push(sequence, maxoccurs)
|
|
if bar != nil && i%100 == 0 {
|
|
bar.Add(100)
|
|
}
|
|
}
|
|
|
|
if maxoccurs >= 0 {
|
|
for k, s := range kmap.index {
|
|
if len(s) >= maxoccurs {
|
|
delete(kmap.index, k)
|
|
}
|
|
}
|
|
}
|
|
|
|
return kmap
|
|
}
|