Adds the notion of NA value

2025-06-29 16:20:46 +00:00 · 2022-02-18 10:00:42 +01:00
parent aef546dae3
commit 6067b92e2f
4 changed files with 61 additions and 108 deletions
--- a/pkg/obiseq/class.go
+++ b/pkg/obiseq/class.go
@ -6,9 +6,9 @@ import (
 	"strconv"
 )

-type SequenceClassifier func(sequence BioSequence) string
+type BioSequenceClassifier func(sequence BioSequence) string

-func AnnotationClassifier(key string) SequenceClassifier {
+func AnnotationClassifier(key string, na string) BioSequenceClassifier {
 	f := func(sequence BioSequence) string {
 		if sequence.HasAnnotation() {
 			value, ok := sequence.Annotations()[key]
@ -22,15 +22,13 @@ func AnnotationClassifier(key string) SequenceClassifier {
 				}
 			} 
 		}
-		return ""
+		return na
 	}

 	return f
 }

-var SampleClassifier = AnnotationClassifier("sample")
-
-func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
+func PredicateClassifier(predicate SequencePredicate) BioSequenceClassifier {
 	f := func(sequence BioSequence) string {
 		if predicate(sequence) {
 			return "true"
@ -44,7 +42,7 @@ func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {

 // Builds a classifier function based on CRC32 of the sequence
 //
-func HashClassifier(size int) SequenceClassifier {
+func HashClassifier(size int) BioSequenceClassifier {
 	f := func(sequence BioSequence) string {
 		h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size)
 		return strconv.Itoa(int(h))
@ -53,7 +51,17 @@ func HashClassifier(size int) SequenceClassifier {
 	return f
 }

-func RotateClassifier(size int) SequenceClassifier {
+// Builds a classifier function based on the sequence
+//
+func SequenceClassifier() BioSequenceClassifier {
+	f := func(sequence BioSequence) string {
+		return sequence.String()
+	}
+
+	return f
+}
+
+func RotateClassifier(size int) BioSequenceClassifier {
 	n := 0
 	f := func(sequence BioSequence) string {
 		h := n % size
--- a/pkg/obiseq/distribute.go
+++ b/pkg/obiseq/distribute.go
@ -27,7 +27,7 @@ func (dist *IDistribute) News() chan string {
 	return dist.news
 }

-func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...int) IDistribute {
+func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes ...int) IDistribute {
 	batchsize := 5000
 	buffsize := 2

--- a/pkg/obiseq/merge.go
+++ b/pkg/obiseq/merge.go
@ -20,7 +20,7 @@ func (sequence BioSequence) HasStatsOn(key string) bool {
 	return ok
 }

-func (sequence BioSequence) StatsOn(key string) StatsOnValues {
+func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues {
 	mkey := "merged_" + key
 	annotations := sequence.Annotations()
 	istat, ok := annotations[mkey]
@ -44,20 +44,22 @@ func (sequence BioSequence) StatsOn(key string) StatsOnValues {
 		newstat = true
 	}

-	if newstat && sequence.StatsPlusOne(key, sequence) {
+	if newstat && sequence.StatsPlusOne(key, sequence, na) {
 		delete(sequence.Annotations(), key)
 	}

 	return stats
 }

-func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool {
+func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence, na string) bool {
+	sval := na
+	stats := sequence.StatsOn(key,na)
+	retval := false
+
 	if toAdd.HasAnnotation() {
-		stats := sequence.StatsOn(key)
 		value, ok := toAdd.Annotations()[key]

 		if ok {
-			var sval string

 			switch value := value.(type) {
 			case string:
@ -69,17 +71,18 @@ func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool {
 			default:
 				log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value)
 			}
+			retval = true
+		}
+
+	}
+
 	old, ok := stats[sval]
 	if !ok {
 		old = 0
 	}
 	stats[sval] = old + 1

-			return true
-		}
-	}
-
-	return false
+	return retval
 }

 func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
@ -94,7 +97,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
 	return stats
 }

-func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...string) BioSequence {
+func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool, statsOn ...string) BioSequence {
 	if !inplace {
 		sequence = sequence.Copy()
 	}
@ -105,15 +108,15 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str

 	annotation := sequence.Annotations()

-	count := tomerge.Count() + sequence.Count()
+	count := sequence.Count() + tomerge.Count()

-	for _, key := range keys {
+	for _, key := range statsOn {
 		if tomerge.HasStatsOn(key) {
-			smk := sequence.StatsOn(key)
-			mmk := tomerge.StatsOn(key)
+			smk := sequence.StatsOn(key,na)
+			mmk := tomerge.StatsOn(key,na)
 			smk.Merge(mmk)
 		} else {
-			sequence.StatsPlusOne(key, tomerge)
+			sequence.StatsPlusOne(key, tomerge,na)
 		}
 	}

@ -140,89 +143,23 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
 	return sequence
 }

-func (sequences BioSequenceSlice) Unique(statsOn []string, keys ...string) BioSequenceSlice {
-	uniq := make(map[string]*BioSequenceSlice, len(sequences))
-	nVariant := 0
-
-	for _, seq := range sequences {
-
-		sstring := seq.String()
-		pgroup, ok := uniq[sstring]
-
-		if !ok {
-			group := make(BioSequenceSlice, 0, 10)
-			pgroup = &group
-			uniq[sstring] = pgroup
-		}
-
-		ok = false
-		i := 0
-		var s BioSequence
-
-		for i, s = range *pgroup {
-			ok = true
-			switch {
-			case seq.HasAnnotation() && s.HasAnnotation():
-				for _, k := range keys {
-					seqV, seqOk := seq.Annotations()[k]
-					sV, sOk := s.Annotations()[k]
-
-					ok = ok && ((!seqOk && !sOk) || ((seqOk && sOk) && (seqV == sV)))
-
-					if !ok {
-						break
-					}
-				}
-			case seq.HasAnnotation() && !s.HasAnnotation():
-				for _, k := range keys {
-					_, seqOk := seq.Annotations()[k]
-					ok = ok && !seqOk
-					if !ok {
-						break
-					}
-				}
-			case !seq.HasAnnotation() && s.HasAnnotation():
-				for _, k := range keys {
-					_, sOk := s.Annotations()[k]
-					ok = ok && !sOk
-					if !ok {
-						break
-					}
-				}
-			default:
-				ok = true
-			}
-
-			if ok {
-				break
-			}
-		}
-
-		if ok {
-			(*pgroup)[i] = s.Merge(seq, true, statsOn...)
-		} else {
+func (sequences BioSequenceSlice) Merge(na string, statsOn ...string) BioSequenceSlice {
+	seq := sequences[0]
 	seq.SetQualities(nil)
-			if seq.Count() == 1 {
 	seq.Annotations()["count"] = 1
-			}
-			*pgroup = append(*pgroup, seq)
-			nVariant++
+
+	for _, toMerge := range sequences[1:] {
+		seq.Merge(toMerge, na, true, statsOn...)
+		toMerge.Recycle()
 	}

+	return sequences[0:1]
 }

-	output := make(BioSequenceSlice, 0, nVariant)
-	for _, seqs := range uniq {
-		output = append(output, *seqs...)
-	}
-
-	return output
-}
-
-func UniqueSliceWorker(statsOn []string, keys ...string) SeqSliceWorker {
+func MergeSliceWorker(na string, statsOn ...string) SeqSliceWorker {

 	worker := func(sequences BioSequenceSlice) BioSequenceSlice {
-		return sequences.Unique(statsOn, keys...)
+		return sequences.Merge(na, statsOn...)
 	}

 	return worker
--- a/pkg/obitools/obidistribute/options.go
+++ b/pkg/obitools/obidistribute/options.go
@ -14,6 +14,7 @@ var _FilenamePattern = ""
 var _SequenceClassifierTag = ""
 var _BatchCount = 0
 var _HashSize = 0
+var _NAValue = "NA"

 func DistributeOptionSet(options *getoptions.GetOpt) {
 	options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern,
@ -29,6 +30,9 @@ func DistributeOptionSet(options *getoptions.GetOpt) {
 			"The name must corresponds to a string, a integer or a boolean value. "+
 			"That value will be used to dispatch sequences amoong the different files"))

+	options.StringVar(&_NAValue, "na-value", _NAValue,
+		options.Description("Value used when the classifier tag is not defined for a sequence."))
+
 	options.IntVar(&_BatchCount, "batches", 0,
 		options.Alias("n"),
 		options.Description("Indicates in how many batches the input file must bee splitted."))
@ -44,10 +48,10 @@ func OptionSet(options *getoptions.GetOpt) {
 	DistributeOptionSet(options)
 }

-func CLISequenceClassifier() obiseq.SequenceClassifier {
+func CLISequenceClassifier() obiseq.BioSequenceClassifier {
 	switch {
 	case _SequenceClassifierTag != "":
-		return obiseq.AnnotationClassifier(_SequenceClassifierTag)
+		return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue)
 	case _BatchCount > 0:
 		return obiseq.RotateClassifier(_BatchCount)
 	case _HashSize > 0:
@ -66,3 +70,7 @@ func CLIFileNamePattern() string {

 	return _FilenamePattern
 }
+
+func CLINAValue() string {
+	return _NAValue
+}