mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-02-02 22:30:34 +00:00
Merge pull request #71 from metabarcoding/push-onwzsyuooozn
Implémentation du filtrage unique basé sur séquence et catégories
This commit is contained in:
@@ -11,11 +11,12 @@ func ISequenceChunk(iterator obiiter.IBioSequence,
|
|||||||
dereplicate bool,
|
dereplicate bool,
|
||||||
na string,
|
na string,
|
||||||
statsOn obiseq.StatsOnDescriptions,
|
statsOn obiseq.StatsOnDescriptions,
|
||||||
|
uniqueClassifier *obiseq.BioSequenceClassifier,
|
||||||
) (obiiter.IBioSequence, error) {
|
) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
if onMemory {
|
if onMemory {
|
||||||
return ISequenceChunkOnMemory(iterator, classifier)
|
return ISequenceChunkOnMemory(iterator, classifier)
|
||||||
} else {
|
} else {
|
||||||
return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn)
|
return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn, uniqueClassifier)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
|||||||
dereplicate bool,
|
dereplicate bool,
|
||||||
na string,
|
na string,
|
||||||
statsOn obiseq.StatsOnDescriptions,
|
statsOn obiseq.StatsOnDescriptions,
|
||||||
|
uniqueClassifier *obiseq.BioSequenceClassifier,
|
||||||
) (obiiter.IBioSequence, error) {
|
) (obiiter.IBioSequence, error) {
|
||||||
obiutils.RegisterAPipe()
|
obiutils.RegisterAPipe()
|
||||||
dir, err := tempDir()
|
dir, err := tempDir()
|
||||||
@@ -120,18 +121,21 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
|||||||
if dereplicate {
|
if dereplicate {
|
||||||
u := make(map[string]*obiseq.BioSequence)
|
u := make(map[string]*obiseq.BioSequence)
|
||||||
var source string
|
var source string
|
||||||
|
uniqueClassifier.Reset()
|
||||||
|
|
||||||
for iseq.Next() {
|
for iseq.Next() {
|
||||||
batch := iseq.Get()
|
batch := iseq.Get()
|
||||||
source = batch.Source()
|
source = batch.Source()
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
sstring := seq.String()
|
// Use composite key: sequence + categories
|
||||||
prev, ok := u[sstring]
|
code := uniqueClassifier.Code(seq)
|
||||||
|
key := uniqueClassifier.Value(code)
|
||||||
|
prev, ok := u[key]
|
||||||
if ok {
|
if ok {
|
||||||
prev.Merge(seq, na, true, statsOn)
|
prev.Merge(seq, na, true, statsOn)
|
||||||
} else {
|
} else {
|
||||||
u[sstring] = seq
|
u[key] = seq
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,29 +28,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
|||||||
cat := opts.Categories()
|
cat := opts.Categories()
|
||||||
na := opts.NAValue()
|
na := opts.NAValue()
|
||||||
|
|
||||||
var classifier *obiseq.BioSequenceClassifier
|
// Classifier for bucketing: Hash only to control number of chunks
|
||||||
|
bucketClassifier := obiseq.HashClassifier(opts.BatchCount())
|
||||||
|
|
||||||
|
// Classifier for uniqueness: Sequence + categories
|
||||||
|
var uniqueClassifier *obiseq.BioSequenceClassifier
|
||||||
if len(cat) > 0 {
|
if len(cat) > 0 {
|
||||||
cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
|
cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
|
||||||
|
cls[0] = obiseq.SequenceClassifier()
|
||||||
for i, c := range cat {
|
for i, c := range cat {
|
||||||
cls[i+1] = obiseq.AnnotationClassifier(c, na)
|
cls[i+1] = obiseq.AnnotationClassifier(c, na)
|
||||||
}
|
}
|
||||||
cls[0] = obiseq.HashClassifier(opts.BatchCount())
|
uniqueClassifier = obiseq.CompositeClassifier(cls...)
|
||||||
classifier = obiseq.CompositeClassifier(cls...)
|
|
||||||
} else {
|
} else {
|
||||||
classifier = obiseq.HashClassifier(opts.BatchCount())
|
uniqueClassifier = obiseq.SequenceClassifier()
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.SortOnDisk() {
|
if opts.SortOnDisk() {
|
||||||
nworkers = 1
|
nworkers = 1
|
||||||
iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn())
|
iterator, err = ISequenceChunkOnDisk(iterator, bucketClassifier, true, na, opts.StatsOn(), uniqueClassifier)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return obiiter.NilIBioSequence, err
|
return obiiter.NilIBioSequence, err
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
iterator, err = ISequenceChunkOnMemory(iterator, classifier)
|
iterator, err = ISequenceChunkOnMemory(iterator, bucketClassifier)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return obiiter.NilIBioSequence, err
|
return obiiter.NilIBioSequence, err
|
||||||
@@ -93,9 +96,9 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < nworkers-1; i++ {
|
for i := 0; i < nworkers-1; i++ {
|
||||||
go ff(iterator.Split(), obiseq.SequenceClassifier())
|
go ff(iterator.Split(), uniqueClassifier.Clone())
|
||||||
}
|
}
|
||||||
go ff(iterator, obiseq.SequenceClassifier())
|
go ff(iterator, uniqueClassifier)
|
||||||
|
|
||||||
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
|
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
|
||||||
opts.StatsOn(),
|
opts.StatsOn(),
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// committed
|
// committed
|
||||||
|
|
||||||
var _Commit = "f55dd55"
|
var _Commit = "52244cd"
|
||||||
var _Version = "Release 4.4.0"
|
var _Version = "Release 4.4.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
|||||||
Reference in New Issue
Block a user