diff --git a/pkg/obichunk/chunk.go b/pkg/obichunk/chunk.go index 644349b..6eff757 100644 --- a/pkg/obichunk/chunk.go +++ b/pkg/obichunk/chunk.go @@ -11,11 +11,12 @@ func ISequenceChunk(iterator obiiter.IBioSequence, dereplicate bool, na string, statsOn obiseq.StatsOnDescriptions, + uniqueClassifier *obiseq.BioSequenceClassifier, ) (obiiter.IBioSequence, error) { if onMemory { return ISequenceChunkOnMemory(iterator, classifier) } else { - return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn) + return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn, uniqueClassifier) } } diff --git a/pkg/obichunk/chunk_on_disk.go b/pkg/obichunk/chunk_on_disk.go index dd678f9..2877beb 100644 --- a/pkg/obichunk/chunk_on_disk.go +++ b/pkg/obichunk/chunk_on_disk.go @@ -78,6 +78,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence, dereplicate bool, na string, statsOn obiseq.StatsOnDescriptions, + uniqueClassifier *obiseq.BioSequenceClassifier, ) (obiiter.IBioSequence, error) { obiutils.RegisterAPipe() dir, err := tempDir() @@ -120,18 +121,21 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence, if dereplicate { u := make(map[string]*obiseq.BioSequence) var source string + uniqueClassifier.Reset() for iseq.Next() { batch := iseq.Get() source = batch.Source() for _, seq := range batch.Slice() { - sstring := seq.String() - prev, ok := u[sstring] + // Use composite key: sequence + categories + code := uniqueClassifier.Code(seq) + key := uniqueClassifier.Value(code) + prev, ok := u[key] if ok { prev.Merge(seq, na, true, statsOn) } else { - u[sstring] = seq + u[key] = seq } } } diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index ba8cc4a..b5297d6 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -28,29 +28,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence, cat := opts.Categories() na := opts.NAValue() - var classifier *obiseq.BioSequenceClassifier + // Classifier for bucketing: Hash only to control number of chunks + bucketClassifier := obiseq.HashClassifier(opts.BatchCount()) + // Classifier for uniqueness: Sequence + categories + var uniqueClassifier *obiseq.BioSequenceClassifier if len(cat) > 0 { cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1) + cls[0] = obiseq.SequenceClassifier() for i, c := range cat { cls[i+1] = obiseq.AnnotationClassifier(c, na) } - cls[0] = obiseq.HashClassifier(opts.BatchCount()) - classifier = obiseq.CompositeClassifier(cls...) + uniqueClassifier = obiseq.CompositeClassifier(cls...) } else { - classifier = obiseq.HashClassifier(opts.BatchCount()) + uniqueClassifier = obiseq.SequenceClassifier() } if opts.SortOnDisk() { nworkers = 1 - iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn()) + iterator, err = ISequenceChunkOnDisk(iterator, bucketClassifier, true, na, opts.StatsOn(), uniqueClassifier) if err != nil { return obiiter.NilIBioSequence, err } } else { - iterator, err = ISequenceChunkOnMemory(iterator, classifier) + iterator, err = ISequenceChunkOnMemory(iterator, bucketClassifier) if err != nil { return obiiter.NilIBioSequence, err @@ -93,9 +96,9 @@ func IUniqueSequence(iterator obiiter.IBioSequence, } for i := 0; i < nworkers-1; i++ { - go ff(iterator.Split(), obiseq.SequenceClassifier()) + go ff(iterator.Split(), uniqueClassifier.Clone()) } - go ff(iterator, obiseq.SequenceClassifier()) + go ff(iterator, uniqueClassifier) iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(), opts.StatsOn(), diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 4a91c52..7e2201e 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "f55dd55" +var _Commit = "52244cd" var _Version = "Release 4.4.0" // Version returns the version of the obitools package.