second version of obidistribute and a first buggy version of obiuniq

This commit is contained in:
2022-02-15 00:47:02 +01:00
parent b931321ba1
commit 3586ecc483
15 changed files with 402 additions and 21 deletions

View File

@@ -156,15 +156,15 @@ func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
}
func (iterator IBioSequenceBatch) Next() bool {
if iterator.pointer.finished.IsSet() {
return false
}
if iterator.pointer.pushBack.IsSet() {
iterator.pointer.pushBack.UnSet()
return true
}
if iterator.pointer.finished.IsSet() {
return false
}
next, ok := (<-iterator.pointer.channel)
if ok {

View File

@@ -25,7 +25,7 @@ func AnnotationClassifier(key string) SequenceClassifier {
return ""
}
return SequenceClassifier(f)
return f
}
var SampleClassifier = AnnotationClassifier("sample")
@@ -39,7 +39,7 @@ func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
}
}
return SequenceClassifier(f)
return f
}
// Builds a classifier function based on CRC32 of the sequence
@@ -50,7 +50,7 @@ func HashClassifier(size int) SequenceClassifier {
return strconv.Itoa(int(h))
}
return SequenceClassifier(f)
return f
}
func RotateClassifier(size int) SequenceClassifier {
@@ -61,5 +61,5 @@ func RotateClassifier(size int) SequenceClassifier {
return strconv.Itoa(int(h))
}
return SequenceClassifier(f)
return f
}

View File

@@ -62,7 +62,7 @@ func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
for _, s := range seqs.Slice() {
key := class(s)
slice, ok := slices[key]
@@ -73,13 +73,14 @@ func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...
orders[key] = 0
lock.Lock()
outputs[key] = MakeIBioSequenceBatch(batchsize, buffsize)
outputs[key] = MakeIBioSequenceBatch(buffsize)
lock.Unlock()
news <- key
}
*slice = append(*slice, s)
if len(*slice) == batchsize {
outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...)
orders[key]++

View File

@@ -3,6 +3,7 @@ package obiseq
import (
"fmt"
"log"
"strings"
)
type StatsOnValues map[string]int
@@ -98,9 +99,13 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
sequence = sequence.Copy()
}
if sequence.HasQualities() {
sequence.SetQualities(nil)
}
annotation := sequence.Annotations()
annotation["count"] = tomerge.Count() + sequence.Count()
count := tomerge.Count() + sequence.Count()
for _, key := range keys {
if tomerge.HasStatsOn(key) {
@@ -112,5 +117,113 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
}
}
if tomerge.HasAnnotation() {
ma := tomerge.Annotations()
for k, va := range annotation {
if !strings.HasPrefix(k, "merged_") {
vm, ok := ma[k]
if !ok || vm != va {
delete(annotation, k)
}
}
}
} else {
for k := range annotation {
if !strings.HasPrefix(k, "merged_") {
delete(annotation, k)
}
}
}
annotation["count"] = count
return sequence
}
// Unique dereplicates the slice: sequences sharing the same String() value
// and agreeing on every annotation listed in keys are collapsed into a single
// entry by merging each duplicate into the first matching representative.
//
// statsOn is forwarded to Merge as its list of keys. keys lists the annotation
// keys that must agree (same value, or absent on both sides) for two sequences
// to be merged.
//
// The result contains one entry per distinct (sequence string, key values)
// combination. Groups are emitted in the order in which their sequence string
// was first met in the input, and variants inside a group in order of first
// appearance, so the output order is reproducible from one run to the next.
//
// The representatives are modified: qualities are reset with
// SetQualities(nil) and Merge is called with inplace set to true.
func (sequences BioSequenceSlice) Unique(statsOn []string, keys ...string) BioSequenceSlice {
	// index gives, for each sequence string, the position of its group in
	// groups. Keeping the groups in a slice rather than ranging over a map
	// is what makes the output order deterministic.
	index := make(map[string]int, len(sequences))
	groups := make([]BioSequenceSlice, 0, len(sequences))
	nVariant := 0

	for _, seq := range sequences {
		sstring := seq.String()
		g, known := index[sstring]
		if !known {
			g = len(groups)
			index[sstring] = g
			groups = append(groups, make(BioSequenceSlice, 0, 10))
		}

		// Look for an already stored variant of this sequence string that
		// agrees with seq on every key, and merge seq into the first one found.
		merged := false
		for i, s := range groups[g] {
			if uniqueKeysAgree(seq, s, keys) {
				groups[g][i] = s.Merge(seq, true, statsOn...)
				merged = true
				break
			}
		}
		if merged {
			continue
		}

		// No compatible variant yet: seq becomes a new representative.
		seq.SetQualities(nil)
		if seq.Count() == 1 {
			// Store the count explicitly in the annotations when Count()
			// reports 1.
			seq.Annotations()["count"] = 1
		}
		groups[g] = append(groups[g], seq)
		nVariant++
	}

	output := make(BioSequenceSlice, 0, nVariant)
	for _, group := range groups {
		output = append(output, group...)
	}

	return output
}

// uniqueKeysAgree reports whether a and b agree on every annotation key in
// keys: for each key, either both sequences lack it or both carry it with
// equal values. A sequence for which HasAnnotation() is false is treated as
// lacking every key, and its Annotations() method is not called.
//
// NOTE(review): values are compared with ==; if annotation values are stored
// as interfaces, a key holding an uncomparable value (map, slice) would panic
// here — TODO confirm against the annotation type.
func uniqueKeysAgree(a, b BioSequence, keys []string) bool {
	aHas, bHas := a.HasAnnotation(), b.HasAnnotation()

	switch {
	case aHas && bHas:
		aAnnot, bAnnot := a.Annotations(), b.Annotations()
		for _, k := range keys {
			av, aOk := aAnnot[k]
			bv, bOk := bAnnot[k]
			if aOk != bOk || (aOk && av != bv) {
				return false
			}
		}
	case aHas:
		// b has no annotation at all: a must not define any of the keys.
		annot := a.Annotations()
		for _, k := range keys {
			if _, found := annot[k]; found {
				return false
			}
		}
	case bHas:
		// a has no annotation at all: b must not define any of the keys.
		annot := b.Annotations()
		for _, k := range keys {
			if _, found := annot[k]; found {
				return false
			}
		}
	}

	return true
}
// UniqueSliceWorker returns a SeqSliceWorker that applies Unique to the slice
// it receives, using the statsOn and keys arguments captured here.
func UniqueSliceWorker(statsOn []string, keys ...string) SeqSliceWorker {
	return func(batch BioSequenceSlice) BioSequenceSlice {
		return batch.Unique(statsOn, keys...)
	}
}