Files
obitools4/pkg/obikmer/kmermap.go
Eric Coissac 1a28d5ed64 Add progress bar configuration and conditional display
This commit introduces a new configuration module `obidefault` to manage progress bar settings, allowing users to disable progress bars via a `--no-progressbar` option. It updates various packages to conditionally display progress bars based on this new configuration, improving user experience by providing control over progress bar output. The changes also include improvements to progress bar handling in several packages, ensuring they are only displayed when appropriate (e.g., when stderr is a terminal and stdout is not piped).
2026-02-08 16:14:02 +01:00

302 lines
5.6 KiB
Go

package obikmer
import (
"os"
"sort"
"unsafe"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obilog"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/schollz/progressbar/v3"
)
type KmerMap[T obifp.FPUint[T]] struct {
index map[T][]*obiseq.BioSequence
Kmersize uint
kmermask T
leftMask T
rightMask T
sparseMask T
SparseAt int
}
type KmerMatch map[*obiseq.BioSequence]int
func (k *KmerMap[T]) KmerSize() uint {
return k.Kmersize
}
func (k *KmerMap[T]) Len() int {
return len(k.index)
}
func (k *KmerMap[T]) KmerAsString(kmer T) string {
buff := make([]byte, k.Kmersize)
ks := int(k.Kmersize)
if k.SparseAt >= 0 {
ks--
}
for i, j := 0, int(k.Kmersize)-1; i < ks; i++ {
code := kmer.And(obifp.From64[T](3)).AsUint64()
buff[j] = decode[code]
j--
if k.SparseAt >= 0 && j == k.SparseAt {
buff[j] = '#'
j--
}
kmer = kmer.RightShift(2)
}
return string(buff)
}
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
if sequence.Len() < int(k.Kmersize) {
return nil
}
makeSparseAt := func(kmer T) T {
if k.SparseAt == -1 {
return kmer
}
return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask))
}
normalizedKmer := func(fw, rv T) T {
fw = makeSparseAt(fw)
rv = makeSparseAt(rv)
if fw.LessThan(rv) {
return fw
}
return rv
}
current := obifp.ZeroUint[T]()
ccurrent := obifp.ZeroUint[T]()
lshift := uint(2 * (k.Kmersize - 1))
sup := sequence.Len() - int(k.Kmersize) + 1
var rep []T
if buff == nil {
rep = make([]T, 0, sup)
} else {
rep = (*buff)[:0]
}
nuc := sequence.Sequence()
size := 0
for i := 0; i < len(nuc); i++ {
current = current.LeftShift(2)
ccurrent = ccurrent.RightShift(2)
code := iupac[nuc[i]]
ccode := iupac[revcompnuc[nuc[i]]]
if len(code) != 1 {
current = obifp.ZeroUint[T]()
ccurrent = obifp.ZeroUint[T]()
size = 0
continue
}
current = current.Or(obifp.From64[T](uint64(code[0])))
ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift))
size++
if size == int(k.Kmersize) {
kmer := normalizedKmer(current, ccurrent)
rep = append(rep, kmer)
size--
}
}
return rep
}
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence, maxocc ...int) {
maxoccurs := -1
if len(maxocc) > 0 {
maxoccurs = maxocc[0]
}
kmers := k.NormalizedKmerSlice(sequence, nil)
for _, kmer := range kmers {
seqs := k.index[kmer]
if maxoccurs == -1 || len(seqs) <= maxoccurs {
seqs = append(seqs, sequence)
k.index[kmer] = seqs
}
}
}
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
kmers := k.NormalizedKmerSlice(sequence, nil)
seqs := make([]*obiseq.BioSequence, 0)
rep := make(KmerMatch)
for _, kmer := range kmers {
if candidates, ok := k.index[kmer]; ok {
seqs = append(seqs, candidates...)
}
}
sort.Slice(seqs,
func(i, j int) bool {
return uintptr(unsafe.Pointer(seqs[i])) < uintptr(unsafe.Pointer(seqs[j]))
})
prevseq := (*obiseq.BioSequence)(nil)
n := 0
for _, seq := range seqs {
if seq != prevseq {
if prevseq != nil && prevseq != sequence {
rep[prevseq] = n
}
n = 1
prevseq = seq
}
n++
}
if prevseq != nil {
rep[prevseq] = n
}
return rep
}
func (k *KmerMatch) FilterMinCount(mincount int) {
for seq, count := range *k {
if count < mincount {
delete(*k, seq)
}
}
}
func (k *KmerMatch) Len() int {
return len(*k)
}
func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice {
ks := make([]*obiseq.BioSequence, 0, len(*k))
for seq := range *k {
ks = append(ks, seq)
}
return ks
}
func (k *KmerMatch) Max() *obiseq.BioSequence {
max := 0
var maxseq *obiseq.BioSequence
for seq, n := range *k {
if max < n {
max = n
maxseq = seq
}
}
return maxseq
}
func NewKmerMap[T obifp.FPUint[T]](
sequences obiseq.BioSequenceSlice,
kmersize uint,
sparse bool,
maxoccurs int) *KmerMap[T] {
idx := make(map[T][]*obiseq.BioSequence)
sparseAt := -1
if sparse && kmersize%2 == 0 {
obilog.Warnf("Kmer size must be odd when using sparse mode")
kmersize++
}
if !sparse && kmersize%2 == 1 {
obilog.Warnf("Kmer size must be even when not using sparse mode")
kmersize--
}
if sparse {
sparseAt = int(kmersize / 2)
}
kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]())
leftMask := obifp.ZeroUint[T]()
rightMask := obifp.ZeroUint[T]()
if sparseAt >= 0 {
if sparseAt >= int(kmersize) {
sparseAt = -1
} else {
pos := kmersize - 1 - uint(sparseAt)
left := uint(sparseAt) * 2
right := pos * 2
leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2)
rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]())
}
}
kmap := &KmerMap[T]{
Kmersize: kmersize,
kmermask: kmermask,
leftMask: leftMask,
rightMask: rightMask,
index: idx,
SparseAt: sparseAt,
}
n := len(sequences)
var bar *progressbar.ProgressBar
if obidefault.ProgressBar() {
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowCount(),
progressbar.OptionShowIts(),
progressbar.OptionSetDescription("Indexing kmers"),
)
bar = progressbar.NewOptions(n, pbopt...)
}
for i, sequence := range sequences {
kmap.Push(sequence, maxoccurs)
if bar != nil && i%100 == 0 {
bar.Add(100)
}
}
if maxoccurs >= 0 {
for k, s := range kmap.index {
if len(s) >= maxoccurs {
delete(kmap.index, k)
}
}
}
return kmap
}