From b49aba9c09138c11e4224e8f197110fb4c92a895 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 14 Jan 2026 19:18:08 +0100 Subject: [PATCH] =?UTF-8?q?Impl=C3=A9mentation=20du=20filtrage=20unique=20?= =?UTF-8?q?bas=C3=A9=20sur=20s=C3=A9quence=20et=20cat=C3=A9gories?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ajout d'une fonctionnalité pour le filtrage unique qui prend en compte à la fois la séquence et les catégories. - Modification de la fonction ISequenceChunk pour accepter un classifieur unique optionnel - Implémentation du traitement unique sur disque en utilisant un classifieur composite - Mise à jour du classifieur utilisé pour le tri sur disque - Correction de la gestion des clés d'unicité en utilisant le code et la valeur du classifieur - Mise à jour du numéro de commit --- pkg/obichunk/chunk.go | 3 ++- pkg/obichunk/chunk_on_disk.go | 10 +++++++--- pkg/obichunk/unique.go | 19 +++++++++++-------- pkg/obioptions/version.go | 2 +- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pkg/obichunk/chunk.go b/pkg/obichunk/chunk.go index 644349b..6eff757 100644 --- a/pkg/obichunk/chunk.go +++ b/pkg/obichunk/chunk.go @@ -11,11 +11,12 @@ func ISequenceChunk(iterator obiiter.IBioSequence, dereplicate bool, na string, statsOn obiseq.StatsOnDescriptions, + uniqueClassifier *obiseq.BioSequenceClassifier, ) (obiiter.IBioSequence, error) { if onMemory { return ISequenceChunkOnMemory(iterator, classifier) } else { - return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn) + return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn, uniqueClassifier) } } diff --git a/pkg/obichunk/chunk_on_disk.go b/pkg/obichunk/chunk_on_disk.go index dd678f9..2877beb 100644 --- a/pkg/obichunk/chunk_on_disk.go +++ b/pkg/obichunk/chunk_on_disk.go @@ -78,6 +78,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence, dereplicate bool, na string, statsOn obiseq.StatsOnDescriptions, + 
uniqueClassifier *obiseq.BioSequenceClassifier, ) (obiiter.IBioSequence, error) { obiutils.RegisterAPipe() dir, err := tempDir() @@ -120,18 +121,21 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence, if dereplicate { u := make(map[string]*obiseq.BioSequence) var source string + uniqueClassifier.Reset() for iseq.Next() { batch := iseq.Get() source = batch.Source() for _, seq := range batch.Slice() { - sstring := seq.String() - prev, ok := u[sstring] + // Use composite key: sequence + categories + code := uniqueClassifier.Code(seq) + key := uniqueClassifier.Value(code) + prev, ok := u[key] if ok { prev.Merge(seq, na, true, statsOn) } else { - u[sstring] = seq + u[key] = seq } } } diff --git a/pkg/obichunk/unique.go b/pkg/obichunk/unique.go index ba8cc4a..b5297d6 100644 --- a/pkg/obichunk/unique.go +++ b/pkg/obichunk/unique.go @@ -28,29 +28,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence, cat := opts.Categories() na := opts.NAValue() - var classifier *obiseq.BioSequenceClassifier + // Classifier for bucketing: Hash only to control number of chunks + bucketClassifier := obiseq.HashClassifier(opts.BatchCount()) + // Classifier for uniqueness: Sequence + categories + var uniqueClassifier *obiseq.BioSequenceClassifier if len(cat) > 0 { cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1) + cls[0] = obiseq.SequenceClassifier() for i, c := range cat { cls[i+1] = obiseq.AnnotationClassifier(c, na) } - cls[0] = obiseq.HashClassifier(opts.BatchCount()) - classifier = obiseq.CompositeClassifier(cls...) + uniqueClassifier = obiseq.CompositeClassifier(cls...) 
} else { - classifier = obiseq.HashClassifier(opts.BatchCount()) + uniqueClassifier = obiseq.SequenceClassifier() } if opts.SortOnDisk() { nworkers = 1 - iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn()) + iterator, err = ISequenceChunkOnDisk(iterator, bucketClassifier, true, na, opts.StatsOn(), uniqueClassifier) if err != nil { return obiiter.NilIBioSequence, err } } else { - iterator, err = ISequenceChunkOnMemory(iterator, classifier) + iterator, err = ISequenceChunkOnMemory(iterator, bucketClassifier) if err != nil { return obiiter.NilIBioSequence, err @@ -93,9 +96,9 @@ func IUniqueSequence(iterator obiiter.IBioSequence, } for i := 0; i < nworkers-1; i++ { - go ff(iterator.Split(), obiseq.SequenceClassifier()) + go ff(iterator.Split(), uniqueClassifier.Clone()) } - go ff(iterator, obiseq.SequenceClassifier()) + go ff(iterator, uniqueClassifier) iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(), opts.StatsOn(), diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 4a91c52..7e2201e 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "f55dd55" +var _Commit = "52244cd" var _Version = "Release 4.4.0" // Version returns the version of the obitools package.