diff --git a/pkg/obiseq/class.go b/pkg/obiseq/class.go index f8414fa..2c75885 100644 --- a/pkg/obiseq/class.go +++ b/pkg/obiseq/class.go @@ -6,9 +6,9 @@ import ( "strconv" ) -type SequenceClassifier func(sequence BioSequence) string +type BioSequenceClassifier func(sequence BioSequence) string -func AnnotationClassifier(key string) SequenceClassifier { +func AnnotationClassifier(key string, na string) BioSequenceClassifier { f := func(sequence BioSequence) string { if sequence.HasAnnotation() { value, ok := sequence.Annotations()[key] @@ -20,17 +20,15 @@ func AnnotationClassifier(key string) SequenceClassifier { default: return fmt.Sprint(value) } - } + } } - return "" + return na } return f } -var SampleClassifier = AnnotationClassifier("sample") - -func PredicateClassifier(predicate SequencePredicate) SequenceClassifier { +func PredicateClassifier(predicate SequencePredicate) BioSequenceClassifier { f := func(sequence BioSequence) string { if predicate(sequence) { return "true" @@ -44,7 +42,7 @@ func PredicateClassifier(predicate SequencePredicate) SequenceClassifier { // Builds a classifier function based on CRC32 of the sequence // -func HashClassifier(size int) SequenceClassifier { +func HashClassifier(size int) BioSequenceClassifier { f := func(sequence BioSequence) string { h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size) return strconv.Itoa(int(h)) @@ -53,7 +51,17 @@ func HashClassifier(size int) SequenceClassifier { return f } -func RotateClassifier(size int) SequenceClassifier { +// Builds a classifier function based on the sequence +// +func SequenceClassifier() BioSequenceClassifier { + f := func(sequence BioSequence) string { + return sequence.String() + } + + return f +} + +func RotateClassifier(size int) BioSequenceClassifier { n := 0 f := func(sequence BioSequence) string { h := n % size diff --git a/pkg/obiseq/distribute.go b/pkg/obiseq/distribute.go index 5f0c26b..28cdb28 100644 --- a/pkg/obiseq/distribute.go +++ b/pkg/obiseq/distribute.go @@ -27,7 +27,7 @@ func (dist *IDistribute) News() chan string { return dist.news } -func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...int) IDistribute { +func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes ...int) IDistribute { batchsize := 5000 buffsize := 2 @@ -80,7 +80,7 @@ func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ... } *slice = append(*slice, s) - + if len(*slice) == batchsize { outputs[key].Channel() <- MakeBioSequenceBatch(orders[key], *slice...) orders[key]++ diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index 27f47b3..f4bd738 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -20,7 +20,7 @@ func (sequence BioSequence) HasStatsOn(key string) bool { return ok } -func (sequence BioSequence) StatsOn(key string) StatsOnValues { +func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues { mkey := "merged_" + key annotations := sequence.Annotations() istat, ok := annotations[mkey] @@ -44,20 +44,22 @@ func (sequence BioSequence) StatsOn(key string) StatsOnValues { newstat = true } - if newstat && sequence.StatsPlusOne(key, sequence) { + if newstat && sequence.StatsPlusOne(key, sequence, na) { delete(sequence.Annotations(), key) } return stats } -func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool { +func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence, na string) bool { + sval := na + stats := sequence.StatsOn(key,na) + retval := false + if toAdd.HasAnnotation() { - stats := sequence.StatsOn(key) value, ok := toAdd.Annotations()[key] if ok { - var sval string switch value := value.(type) { case string: @@ -69,17 +71,18 @@ func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool { default: log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value) } - old, ok := stats[sval] - if !ok { - old = 0 - } - stats[sval] = old + 1 - - return true + retval = true } + } - return false + old, ok := stats[sval] + if !ok { + old = 0 + } + stats[sval] = old + 1 + + return retval } func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { @@ -94,7 +97,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { return stats } -func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...string) BioSequence { +func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool, statsOn ...string) BioSequence { if !inplace { sequence = sequence.Copy() } @@ -105,15 +108,15 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str annotation := sequence.Annotations() - count := tomerge.Count() + sequence.Count() + count := sequence.Count() + tomerge.Count() - for _, key := range keys { + for _, key := range statsOn { if tomerge.HasStatsOn(key) { - smk := sequence.StatsOn(key) - mmk := tomerge.StatsOn(key) + smk := sequence.StatsOn(key,na) + mmk := tomerge.StatsOn(key,na) smk.Merge(mmk) } else { - sequence.StatsPlusOne(key, tomerge) + sequence.StatsPlusOne(key, tomerge,na) } } @@ -140,89 +143,23 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str return sequence } -func (sequences BioSequenceSlice) Unique(statsOn []string, keys ...string) BioSequenceSlice { - uniq := make(map[string]*BioSequenceSlice, len(sequences)) - nVariant := 0 - - for _, seq := range sequences { - - sstring := seq.String() - pgroup, ok := uniq[sstring] - - if !ok { - group := make(BioSequenceSlice, 0, 10) - pgroup = &group - uniq[sstring] = pgroup - } - - ok = false - i := 0 - var s BioSequence - - for i, s = range *pgroup { - ok = true - switch { - case seq.HasAnnotation() && s.HasAnnotation(): - for _, k := range keys { - seqV, seqOk := seq.Annotations()[k] - sV, sOk := s.Annotations()[k] - - ok = ok && ((!seqOk && !sOk) || ((seqOk && sOk) && (seqV == sV))) - - if !ok { - break - } - } - case seq.HasAnnotation() && !s.HasAnnotation(): - for _, k := range keys { - _, seqOk := seq.Annotations()[k] - ok = ok && !seqOk - if !ok { - break - } - } - case !seq.HasAnnotation() && s.HasAnnotation(): - for _, k := range keys { - _, sOk := s.Annotations()[k] - ok = ok && !sOk - if !ok { - break - } - } - default: - ok = true - } - - if ok { - break - } - } - - if ok { - (*pgroup)[i] = s.Merge(seq, true, statsOn...) - } else { - seq.SetQualities(nil) - if seq.Count() == 1 { - seq.Annotations()["count"] = 1 - } - *pgroup = append(*pgroup, seq) - nVariant++ - } +func (sequences BioSequenceSlice) Merge(na string, statsOn ...string) BioSequenceSlice { + seq := sequences[0] + seq.SetQualities(nil) + seq.Annotations()["count"] = 1 + for _, toMerge := range sequences[1:] { + seq.Merge(toMerge, na, true, statsOn...) + toMerge.Recycle() } - output := make(BioSequenceSlice, 0, nVariant) - for _, seqs := range uniq { - output = append(output, *seqs...) - } - - return output + return sequences[0:1] } -func UniqueSliceWorker(statsOn []string, keys ...string) SeqSliceWorker { +func MergeSliceWorker(na string, statsOn ...string) SeqSliceWorker { worker := func(sequences BioSequenceSlice) BioSequenceSlice { - return sequences.Unique(statsOn, keys...) + return sequences.Merge(na, statsOn...) } return worker diff --git a/pkg/obitools/obidistribute/options.go b/pkg/obitools/obidistribute/options.go index 64d20e2..7a637bd 100644 --- a/pkg/obitools/obidistribute/options.go +++ b/pkg/obitools/obidistribute/options.go @@ -14,6 +14,7 @@ var _FilenamePattern = "" var _SequenceClassifierTag = "" var _BatchCount = 0 var _HashSize = 0 +var _NAValue = "NA" func DistributeOptionSet(options *getoptions.GetOpt) { options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern, @@ -29,6 +30,9 @@ func DistributeOptionSet(options *getoptions.GetOpt) { "The name must corresponds to a string, a integer or a boolean value. "+ "That value will be used to dispatch sequences amoong the different files")) + options.StringVar(&_NAValue, "na-value", _NAValue, + options.Description("Value used when the classifier tag is not defined for a sequence.")) + options.IntVar(&_BatchCount, "batches", 0, options.Alias("n"), options.Description("Indicates in how many batches the input file must bee splitted.")) @@ -44,10 +48,10 @@ func OptionSet(options *getoptions.GetOpt) { DistributeOptionSet(options) } -func CLISequenceClassifier() obiseq.SequenceClassifier { +func CLISequenceClassifier() obiseq.BioSequenceClassifier { switch { case _SequenceClassifierTag != "": - return obiseq.AnnotationClassifier(_SequenceClassifierTag) + return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue) case _BatchCount > 0: return obiseq.RotateClassifier(_BatchCount) case _HashSize > 0: @@ -66,3 +70,7 @@ func CLIFileNamePattern() string { return _FilenamePattern } + +func CLINAValue() string { + return _NAValue +}