Merge pull request #71 from metabarcoding/push-onwzsyuooozn

Implementation of unique filtering based on sequence and categories
This commit is contained in:
coissac
2026-01-14 19:19:27 +01:00
committed by GitHub
4 changed files with 21 additions and 13 deletions
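In short, "unique filtering based on sequence and categories" means two reads are merged only when both the sequence itself and the values of the selected category annotations match. A minimal, self-contained sketch of that idea (the record type, the uniqueKey helper, and the NA fallback are illustrative stand-ins, not the obiseq API):

package main

import (
    "fmt"
    "strings"
)

// record is an illustrative stand-in for an annotated sequence read.
type record struct {
    seq         string
    annotations map[string]string
}

// uniqueKey joins the sequence with the values of the requested categories;
// records sharing the same key would be merged during dereplication.
func uniqueKey(r record, categories []string, na string) string {
    parts := []string{r.seq}
    for _, c := range categories {
        v, ok := r.annotations[c]
        if !ok {
            v = na // missing annotations fall back to the NA value
        }
        parts = append(parts, v)
    }
    return strings.Join(parts, "\x00")
}

func main() {
    a := record{seq: "ACGT", annotations: map[string]string{"sample": "S1"}}
    b := record{seq: "ACGT", annotations: map[string]string{"sample": "S2"}}
    // Same sequence but different sample values: distinct keys, no merge.
    fmt.Println(uniqueKey(a, []string{"sample"}, "NA") == uniqueKey(b, []string{"sample"}, "NA"))
}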

View File

@@ -11,11 +11,12 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
     dereplicate bool,
     na string,
     statsOn obiseq.StatsOnDescriptions,
+    uniqueClassifier *obiseq.BioSequenceClassifier,
 ) (obiiter.IBioSequence, error) {
     if onMemory {
         return ISequenceChunkOnMemory(iterator, classifier)
     } else {
-        return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn)
+        return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn, uniqueClassifier)
     }
 }

View File

@@ -78,6 +78,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
     dereplicate bool,
     na string,
     statsOn obiseq.StatsOnDescriptions,
+    uniqueClassifier *obiseq.BioSequenceClassifier,
 ) (obiiter.IBioSequence, error) {
     obiutils.RegisterAPipe()
     dir, err := tempDir()
@@ -120,18 +121,21 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
     if dereplicate {
         u := make(map[string]*obiseq.BioSequence)
         var source string
+        uniqueClassifier.Reset()
         for iseq.Next() {
             batch := iseq.Get()
             source = batch.Source()
             for _, seq := range batch.Slice() {
-                sstring := seq.String()
-                prev, ok := u[sstring]
+                // Use composite key: sequence + categories
+                code := uniqueClassifier.Code(seq)
+                key := uniqueClassifier.Value(code)
+                prev, ok := u[key]
                 if ok {
                     prev.Merge(seq, na, true, statsOn)
                 } else {
-                    u[sstring] = seq
+                    u[key] = seq
                 }
             }
         }
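The patched loop keys the dereplication map on the classifier's value for each read rather than on the raw sequence string, so category annotations now take part in the identity test. A simplified, self-contained sketch of that merge-by-key pattern (the read type, the precomputed composite keys, and the count accumulation are illustrative; the real code calls prev.Merge with na and statsOn):

package main

import "fmt"

// read is an illustrative stand-in: key is the precomputed composite key
// (sequence plus category values) and count the number of merged reads.
type read struct {
    key   string
    count int
}

// dereplicate keeps one representative per key and accumulates counts,
// mirroring the map-plus-Merge pattern of the on-disk chunk loop.
func dereplicate(reads []read) map[string]*read {
    u := make(map[string]*read)
    for _, r := range reads {
        if prev, ok := u[r.key]; ok {
            prev.count += r.count // stand-in for prev.Merge(seq, na, true, statsOn)
        } else {
            cp := r
            u[r.key] = &cp
        }
    }
    return u
}

func main() {
    merged := dereplicate([]read{
        {key: "ACGT|S1", count: 3},
        {key: "ACGT|S1", count: 2},
        {key: "ACGT|S2", count: 1}, // same sequence, different sample: kept apart
    })
    for k, r := range merged {
        fmt.Println(k, r.count) // ACGT|S1 5 and ACGT|S2 1 (map order varies)
    }
}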

View File

@@ -28,29 +28,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
     cat := opts.Categories()
     na := opts.NAValue()
-    var classifier *obiseq.BioSequenceClassifier
+    // Classifier for bucketing: Hash only to control number of chunks
+    bucketClassifier := obiseq.HashClassifier(opts.BatchCount())
+    // Classifier for uniqueness: Sequence + categories
+    var uniqueClassifier *obiseq.BioSequenceClassifier
     if len(cat) > 0 {
         cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
+        cls[0] = obiseq.SequenceClassifier()
         for i, c := range cat {
             cls[i+1] = obiseq.AnnotationClassifier(c, na)
         }
-        cls[0] = obiseq.HashClassifier(opts.BatchCount())
-        classifier = obiseq.CompositeClassifier(cls...)
+        uniqueClassifier = obiseq.CompositeClassifier(cls...)
     } else {
-        classifier = obiseq.HashClassifier(opts.BatchCount())
+        uniqueClassifier = obiseq.SequenceClassifier()
     }
     if opts.SortOnDisk() {
         nworkers = 1
-        iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn())
+        iterator, err = ISequenceChunkOnDisk(iterator, bucketClassifier, true, na, opts.StatsOn(), uniqueClassifier)
         if err != nil {
             return obiiter.NilIBioSequence, err
         }
     } else {
-        iterator, err = ISequenceChunkOnMemory(iterator, classifier)
+        iterator, err = ISequenceChunkOnMemory(iterator, bucketClassifier)
         if err != nil {
             return obiiter.NilIBioSequence, err
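Splitting bucketClassifier (hash only) from uniqueClassifier (sequence plus categories) stays correct because the hash bucket is derived from the sequence, which is itself part of the uniqueness key: any two records that should be merged carry identical sequences, land in the same chunk, and are therefore seen by the same per-chunk dereplication pass. A self-contained sketch of that invariant (hashBucket, rec, and uniqueKey are illustrative; the assumption that the real HashClassifier buckets by sequence is inferred from its use here, not verified):

package main

import (
    "fmt"
    "hash/fnv"
)

type rec struct {
    seq    string
    sample string
}

// hashBucket assigns a chunk from the sequence alone (assumption: the real
// hash-based bucketing classifier likewise ignores annotations).
func hashBucket(seq string, nbuckets int) int {
    h := fnv.New32a()
    h.Write([]byte(seq))
    return int(h.Sum32() % uint32(nbuckets))
}

// uniqueKey is the full identity used for merging: sequence + category value.
func uniqueKey(r rec) string { return r.seq + "\x00" + r.sample }

func main() {
    a := rec{seq: "ACGT", sample: "S1"}
    b := rec{seq: "ACGT", sample: "S1"}
    // Equal uniqueKey implies equal sequences, hence equal buckets, so a
    // per-bucket dereplication pass never misses a pair that should merge.
    fmt.Println(uniqueKey(a) == uniqueKey(b), hashBucket(a.seq, 16) == hashBucket(b.seq, 16))
}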
@@ -93,9 +96,9 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
     }
     for i := 0; i < nworkers-1; i++ {
-        go ff(iterator.Split(), obiseq.SequenceClassifier())
+        go ff(iterator.Split(), uniqueClassifier.Clone())
     }
-    go ff(iterator, obiseq.SequenceClassifier())
+    go ff(iterator, uniqueClassifier)
     iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
         opts.StatsOn(),
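Each extra worker now receives uniqueClassifier.Clone() while the last one reuses the original, presumably because a classifier carries mutable internal state and should not be shared across goroutines. A self-contained sketch of that clone-per-worker pattern (the counter type stands in for a stateful classifier and is not part of obiseq):

package main

import (
    "fmt"
    "sync"
)

// counter stands in for a stateful classifier: sharing one instance across
// goroutines would race on its map, so each worker gets its own clone.
type counter struct {
    seen map[string]int
}

func newCounter() *counter { return &counter{seen: map[string]int{}} }

// Clone returns an independent instance with its own internal state.
func (c *counter) Clone() *counter { return newCounter() }

// Code mutates internal state, which is why an instance must not be shared.
func (c *counter) Code(s string) int {
    c.seen[s]++
    return c.seen[s]
}

func main() {
    chunks := [][]string{{"ACGT", "ACGT"}, {"TTTT"}, {"ACGT"}}
    master := newCounter()

    var wg sync.WaitGroup
    for i, chunk := range chunks {
        wg.Add(1)
        // One clone per goroutine; the original could serve the last worker,
        // as in the fan-out shown in the diff above.
        go func(c *counter, data []string, id int) {
            defer wg.Done()
            for _, s := range data {
                fmt.Println("worker", id, s, c.Code(s))
            }
        }(master.Clone(), chunk, i)
    }
    wg.Wait()
}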

View File

@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited
-var _Commit = "f55dd55"
+var _Commit = "52244cd"
 var _Version = "Release 4.4.0"
 // Version returns the version of the obitools package.