Adds the notion of NA value

This commit is contained in:
2022-02-18 10:00:42 +01:00
parent aef546dae3
commit 6067b92e2f
4 changed files with 61 additions and 108 deletions

View File

@ -6,9 +6,9 @@ import (
"strconv" "strconv"
) )
type SequenceClassifier func(sequence BioSequence) string type BioSequenceClassifier func(sequence BioSequence) string
func AnnotationClassifier(key string) SequenceClassifier { func AnnotationClassifier(key string, na string) BioSequenceClassifier {
f := func(sequence BioSequence) string { f := func(sequence BioSequence) string {
if sequence.HasAnnotation() { if sequence.HasAnnotation() {
value, ok := sequence.Annotations()[key] value, ok := sequence.Annotations()[key]
@ -22,15 +22,13 @@ func AnnotationClassifier(key string) SequenceClassifier {
} }
} }
} }
return "" return na
} }
return f return f
} }
var SampleClassifier = AnnotationClassifier("sample") func PredicateClassifier(predicate SequencePredicate) BioSequenceClassifier {
func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
f := func(sequence BioSequence) string { f := func(sequence BioSequence) string {
if predicate(sequence) { if predicate(sequence) {
return "true" return "true"
@ -44,7 +42,7 @@ func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
// Builds a classifier function based on CRC32 of the sequence // Builds a classifier function based on CRC32 of the sequence
// //
func HashClassifier(size int) SequenceClassifier { func HashClassifier(size int) BioSequenceClassifier {
f := func(sequence BioSequence) string { f := func(sequence BioSequence) string {
h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size) h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size)
return strconv.Itoa(int(h)) return strconv.Itoa(int(h))
@ -53,7 +51,17 @@ func HashClassifier(size int) SequenceClassifier {
return f return f
} }
func RotateClassifier(size int) SequenceClassifier { // Builds a classifier function based on the sequence
//
// SequenceClassifier builds a classifier whose class label for a sequence
// is the value returned by that sequence's String method.
func SequenceClassifier() BioSequenceClassifier {
	return func(sequence BioSequence) string {
		return sequence.String()
	}
}
func RotateClassifier(size int) BioSequenceClassifier {
n := 0 n := 0
f := func(sequence BioSequence) string { f := func(sequence BioSequence) string {
h := n % size h := n % size

View File

@ -27,7 +27,7 @@ func (dist *IDistribute) News() chan string {
return dist.news return dist.news
} }
func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...int) IDistribute { func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes ...int) IDistribute {
batchsize := 5000 batchsize := 5000
buffsize := 2 buffsize := 2

View File

@ -20,7 +20,7 @@ func (sequence BioSequence) HasStatsOn(key string) bool {
return ok return ok
} }
func (sequence BioSequence) StatsOn(key string) StatsOnValues { func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues {
mkey := "merged_" + key mkey := "merged_" + key
annotations := sequence.Annotations() annotations := sequence.Annotations()
istat, ok := annotations[mkey] istat, ok := annotations[mkey]
@ -44,20 +44,22 @@ func (sequence BioSequence) StatsOn(key string) StatsOnValues {
newstat = true newstat = true
} }
if newstat && sequence.StatsPlusOne(key, sequence) { if newstat && sequence.StatsPlusOne(key, sequence, na) {
delete(sequence.Annotations(), key) delete(sequence.Annotations(), key)
} }
return stats return stats
} }
func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool { func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence, na string) bool {
sval := na
stats := sequence.StatsOn(key,na)
retval := false
if toAdd.HasAnnotation() { if toAdd.HasAnnotation() {
stats := sequence.StatsOn(key)
value, ok := toAdd.Annotations()[key] value, ok := toAdd.Annotations()[key]
if ok { if ok {
var sval string
switch value := value.(type) { switch value := value.(type) {
case string: case string:
@ -69,17 +71,18 @@ func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool {
default: default:
log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value) log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value)
} }
old, ok := stats[sval] retval = true
if !ok {
old = 0
}
stats[sval] = old + 1
return true
} }
} }
return false old, ok := stats[sval]
if !ok {
old = 0
}
stats[sval] = old + 1
return retval
} }
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
@ -94,7 +97,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
return stats return stats
} }
func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...string) BioSequence { func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool, statsOn ...string) BioSequence {
if !inplace { if !inplace {
sequence = sequence.Copy() sequence = sequence.Copy()
} }
@ -105,15 +108,15 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
annotation := sequence.Annotations() annotation := sequence.Annotations()
count := tomerge.Count() + sequence.Count() count := sequence.Count() + tomerge.Count()
for _, key := range keys { for _, key := range statsOn {
if tomerge.HasStatsOn(key) { if tomerge.HasStatsOn(key) {
smk := sequence.StatsOn(key) smk := sequence.StatsOn(key,na)
mmk := tomerge.StatsOn(key) mmk := tomerge.StatsOn(key,na)
smk.Merge(mmk) smk.Merge(mmk)
} else { } else {
sequence.StatsPlusOne(key, tomerge) sequence.StatsPlusOne(key, tomerge,na)
} }
} }
@ -140,89 +143,23 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
return sequence return sequence
} }
func (sequences BioSequenceSlice) Unique(statsOn []string, keys ...string) BioSequenceSlice { func (sequences BioSequenceSlice) Merge(na string, statsOn ...string) BioSequenceSlice {
uniq := make(map[string]*BioSequenceSlice, len(sequences)) seq := sequences[0]
nVariant := 0 seq.SetQualities(nil)
seq.Annotations()["count"] = 1
for _, seq := range sequences {
sstring := seq.String()
pgroup, ok := uniq[sstring]
if !ok {
group := make(BioSequenceSlice, 0, 10)
pgroup = &group
uniq[sstring] = pgroup
}
ok = false
i := 0
var s BioSequence
for i, s = range *pgroup {
ok = true
switch {
case seq.HasAnnotation() && s.HasAnnotation():
for _, k := range keys {
seqV, seqOk := seq.Annotations()[k]
sV, sOk := s.Annotations()[k]
ok = ok && ((!seqOk && !sOk) || ((seqOk && sOk) && (seqV == sV)))
if !ok {
break
}
}
case seq.HasAnnotation() && !s.HasAnnotation():
for _, k := range keys {
_, seqOk := seq.Annotations()[k]
ok = ok && !seqOk
if !ok {
break
}
}
case !seq.HasAnnotation() && s.HasAnnotation():
for _, k := range keys {
_, sOk := s.Annotations()[k]
ok = ok && !sOk
if !ok {
break
}
}
default:
ok = true
}
if ok {
break
}
}
if ok {
(*pgroup)[i] = s.Merge(seq, true, statsOn...)
} else {
seq.SetQualities(nil)
if seq.Count() == 1 {
seq.Annotations()["count"] = 1
}
*pgroup = append(*pgroup, seq)
nVariant++
}
for _, toMerge := range sequences[1:] {
seq.Merge(toMerge, na, true, statsOn...)
toMerge.Recycle()
} }
output := make(BioSequenceSlice, 0, nVariant) return sequences[0:1]
for _, seqs := range uniq {
output = append(output, *seqs...)
}
return output
} }
func UniqueSliceWorker(statsOn []string, keys ...string) SeqSliceWorker { func MergeSliceWorker(na string, statsOn ...string) SeqSliceWorker {
worker := func(sequences BioSequenceSlice) BioSequenceSlice { worker := func(sequences BioSequenceSlice) BioSequenceSlice {
return sequences.Unique(statsOn, keys...) return sequences.Merge(na, statsOn...)
} }
return worker return worker

View File

@ -14,6 +14,7 @@ var _FilenamePattern = ""
var _SequenceClassifierTag = "" var _SequenceClassifierTag = ""
var _BatchCount = 0 var _BatchCount = 0
var _HashSize = 0 var _HashSize = 0
var _NAValue = "NA"
func DistributeOptionSet(options *getoptions.GetOpt) { func DistributeOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern, options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern,
@ -29,6 +30,9 @@ func DistributeOptionSet(options *getoptions.GetOpt) {
"The name must corresponds to a string, a integer or a boolean value. "+ "The name must corresponds to a string, a integer or a boolean value. "+
"That value will be used to dispatch sequences amoong the different files")) "That value will be used to dispatch sequences amoong the different files"))
options.StringVar(&_NAValue, "na-value", _NAValue,
options.Description("Value used when the classifier tag is not defined for a sequence."))
options.IntVar(&_BatchCount, "batches", 0, options.IntVar(&_BatchCount, "batches", 0,
options.Alias("n"), options.Alias("n"),
options.Description("Indicates in how many batches the input file must bee splitted.")) options.Description("Indicates in how many batches the input file must bee splitted."))
@ -44,10 +48,10 @@ func OptionSet(options *getoptions.GetOpt) {
DistributeOptionSet(options) DistributeOptionSet(options)
} }
func CLISequenceClassifier() obiseq.SequenceClassifier { func CLISequenceClassifier() obiseq.BioSequenceClassifier {
switch { switch {
case _SequenceClassifierTag != "": case _SequenceClassifierTag != "":
return obiseq.AnnotationClassifier(_SequenceClassifierTag) return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue)
case _BatchCount > 0: case _BatchCount > 0:
return obiseq.RotateClassifier(_BatchCount) return obiseq.RotateClassifier(_BatchCount)
case _HashSize > 0: case _HashSize > 0:
@ -66,3 +70,7 @@ func CLIFileNamePattern() string {
return _FilenamePattern return _FilenamePattern
} }
// CLINAValue returns the current value of the package-level _NAValue
// variable. That variable is initialised to "NA" and is bound to the
// "na-value" command-line option, described there as the value used when
// the classifier tag is not defined for a sequence.
func CLINAValue() string {
return _NAValue
}