Fix a bug when writing to stdout, and add a clearer error when opening data files

This commit is contained in:
Eric Coissac
2024-08-13 09:45:28 +02:00
parent bdb96dda94
commit 31bfc88eb9
43 changed files with 1654 additions and 696 deletions

View File

@ -3,98 +3,147 @@ package obikmer
import (
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
)
// NOTE(review): this span comes from a rendered diff. The next four lines look
// like the removed, non-generic KmerMap declaration (no closing brace is shown
// for it); the generic declaration that follows is the one the rest of the
// visible code uses.
type KmerMap struct {
index map[KmerIdx64][]*obiseq.BioSequence
kmersize int
kmermask KmerIdx64
// KmerMap indexes sequences by their normalized k-mers. T is the unsigned
// integer type that stores one k-mer, using 2 bits per nucleotide.
type KmerMap[T obifp.FPUint[T]] struct {
// index lists, for each k-mer key, the sequences appended by Push.
index map[T][]*obiseq.BioSequence
// Kmersize is the k-mer length in nucleotides.
Kmersize uint
// kmermask is set by NewKmerMap to (1 << 2*kmersize) - 1.
kmermask T
// leftMask and rightMask select the bits on either side of the SparseAt
// position; both stay zero when the map is not sparse.
leftMask T
rightMask T
// NOTE(review): sparseMask is neither assigned nor read in the visible code.
sparseMask T
// SparseAt is -1 for a regular map; in sparse mode NewKmerMap sets it to
// kmersize/2, the position that is ignored inside every k-mer.
SparseAt int
}
// KmerMatch associates a sequence with a counter. Query fills it by adding one
// to a sequence's counter each time that sequence is found in the index under
// one of the query's k-mers.
type KmerMatch map[*obiseq.BioSequence]int
// NOTE(review): the next two lines appear to be the removed, non-generic
// variant left over from the diff rendering.
func (k *KmerMap) KmerSize() int {
return k.kmersize
// KmerSize returns the Kmersize field, i.e. the k-mer length in nucleotides.
func (k *KmerMap[T]) KmerSize() uint {
return k.Kmersize
}
// NOTE(review): the next line appears to be the removed, non-generic header
// left over from the diff rendering.
func (k *KmerMap) Len() int {
// Len returns the number of distinct k-mer keys currently held in the index.
func (k *KmerMap[T]) Len() int {
return len(k.index)
}
func (k *KmerMap) Push(sequence *obiseq.BioSequence) {
current := KmerIdx64(0)
ccurrent := KmerIdx64(0)
lshift := uint(2 * (k.kmersize - 1))
// KmerAsString renders kmer as a string of Kmersize bytes.
//
// Nucleotides are read from the low-order 2 bits of kmer upwards and written
// from the right end of the string towards the left, each 2-bit code being
// translated through the decode table. In sparse mode (SparseAt >= 0) one
// nucleotide fewer is decoded and a '#' is written at index SparseAt.
func (k *KmerMap[T]) KmerAsString(kmer T) string {
	width := int(k.Kmersize)
	sparse := k.SparseAt >= 0

	// A sparse k-mer stores one nucleotide fewer than its printed width.
	nucleotides := width
	if sparse {
		nucleotides--
	}

	out := make([]byte, k.Kmersize)
	lowTwoBits := obifp.From64[T](3)
	pos := width - 1

	for n := 0; n < nucleotides; n++ {
		out[pos] = decode[kmer.And(lowTwoBits).AsUint64()]
		pos--

		// Insert the placeholder as soon as the write cursor reaches the
		// ignored position, then step over it.
		if sparse && pos == k.SparseAt {
			out[pos] = '#'
			pos--
		}

		kmer = kmer.RightShift(2)
	}

	return string(out)
}
func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T {
makeSparseAt := func(kmer T) T {
if k.SparseAt == -1 {
return kmer
}
return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask))
}
normalizedKmer := func(fw, rv T) T {
if k.SparseAt >= 0 {
fw = makeSparseAt(fw)
rv = makeSparseAt(rv)
}
if fw.LessThan(rv) {
return fw
}
return rv
}
current := obifp.ZeroUint[T]()
ccurrent := obifp.ZeroUint[T]()
lshift := uint(2 * (k.Kmersize - 1))
sup := sequence.Len() - int(k.Kmersize) + 1
var rep []T
if buff == nil {
rep = make([]T, 0, sup)
} else {
rep = (*buff)[:0]
}
nuc := sequence.Sequence()
size := 0
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
current <<= 2
ccurrent >>= 2
for i := 0; i < len(nuc); i++ {
current = current.LeftShift(2)
ccurrent = ccurrent.RightShift(2)
code := iupac[nuc[i]]
ccode := iupac[revcompnuc[nuc[i]]]
if len(code) != 1 {
current = KmerIdx64(0)
ccurrent = KmerIdx64(0)
current = obifp.ZeroUint[T]()
ccurrent = obifp.ZeroUint[T]()
size = 0
continue
}
current |= KmerIdx64(code[0])
ccurrent |= KmerIdx64(ccode[0]) << lshift
current = current.Or(obifp.From64[T](uint64(code[0])))
ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift))
size++
if size == k.kmersize {
if size == int(k.Kmersize) {
kmer := min(k.kmermask&current, k.kmermask&ccurrent)
k.index[kmer] = append(k.index[kmer], sequence)
kmer := normalizedKmer(current, ccurrent)
rep = append(rep, kmer)
size--
}
}
return rep
}
func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch {
current := KmerIdx64(0)
ccurrent := KmerIdx64(0)
// Push registers sequence in the index under each of its normalized k-mers,
// as computed by NormalizedKmerSlice. A k-mer that occurs several times in the
// sequence appends the sequence once per occurrence.
func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) {
	for _, key := range k.NormalizedKmerSlice(sequence, nil) {
		bucket := k.index[key]
		k.index[key] = append(bucket, sequence)
	}
}
func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch {
kmers := k.NormalizedKmerSlice(sequence, nil)
rep := make(KmerMatch)
nuc := sequence.Sequence()
size := 0
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
current <<= 2
ccurrent >>= 2
code := iupac[nuc[i]]
ccode := iupac[revcompnuc[nuc[i]]]
if len(code) != 1 {
current = KmerIdx64(0)
ccurrent = KmerIdx64(0)
size = 0
continue
}
current |= KmerIdx64(code[0])
ccurrent |= KmerIdx64(ccode[0]) << uint(2*(k.kmersize-1))
size++
if size == k.kmersize {
kmer := min(k.kmermask&current, k.kmermask&ccurrent)
if _, ok := k.index[kmer]; ok {
for _, seq := range k.index[kmer] {
if seq != sequence {
if _, ok := rep[seq]; !ok {
rep[seq] = 0
}
rep[seq]++
for _, kmer := range kmers {
if _, ok := k.index[kmer]; ok {
for _, seq := range k.index[kmer] {
if seq != sequence {
if _, ok := rep[seq]; !ok {
rep[seq] = 0
}
rep[seq]++
}
}
size--
}
}
@ -135,12 +184,54 @@ func (k *KmerMatch) Max() *obiseq.BioSequence {
return maxseq
}
func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
idx := make(map[KmerIdx64][]*obiseq.BioSequence)
func NewKmerMap[T obifp.FPUint[T]](
sequences obiseq.BioSequenceSlice,
kmersize uint,
sparse bool) *KmerMap[T] {
idx := make(map[T][]*obiseq.BioSequence)
kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2)))
sparseAt := -1
kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx}
if sparse && kmersize%2 == 0 {
log.Warnf("Kmer size must be odd when using sparse mode")
kmersize++
}
if !sparse && kmersize%2 == 1 {
log.Warnf("Kmer size must be even when not using sparse mode")
kmersize--
}
if sparse {
sparseAt = int(kmersize / 2)
}
kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]())
leftMask := obifp.ZeroUint[T]()
rightMask := obifp.ZeroUint[T]()
if sparseAt >= 0 {
if sparseAt >= int(kmersize) {
sparseAt = -1
} else {
pos := kmersize - 1 - uint(sparseAt)
left := uint(sparseAt) * 2
right := pos * 2
leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2)
rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]())
}
}
kmap := &KmerMap[T]{
Kmersize: kmersize,
kmermask: kmermask,
leftMask: leftMask,
rightMask: rightMask,
index: idx,
SparseAt: sparseAt,
}
n := len(sequences)
pbopt := make([]progressbar.Option, 0, 5)
@ -163,14 +254,3 @@ func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
return kmap
}
// MakeCountMatchWorker builds a sequence worker that queries the map with each
// sequence it receives, filters the matches with FilterMinCount(minKmerCount),
// and stores the length of the remaining match set in the
// "obikmer_match_count" attribute. The worker returns the same sequence
// wrapped in a one-element slice and never returns an error.
func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker {
	worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		found := k.Query(sequence)
		found.FilterMinCount(minKmerCount)

		sequence.SetAttribute("obikmer_match_count", found.Len())

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return worker
}