mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-02-03 06:40:33 +00:00
Update obiuniq for very large dataset
This commit is contained in:
@@ -25,18 +25,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
|
||||
log.Infoln("Starting data splitting")
|
||||
|
||||
cat := opts.Categories()
|
||||
na := opts.NAValue()
|
||||
|
||||
var classifier *obiseq.BioSequenceClassifier
|
||||
|
||||
if len(cat) > 0 {
|
||||
cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
|
||||
for i, c := range cat {
|
||||
cls[i+1] = obiseq.AnnotationClassifier(c, na)
|
||||
}
|
||||
cls[0] = obiseq.HashClassifier(opts.BatchCount())
|
||||
classifier = obiseq.CompositeClassifier(cls...)
|
||||
} else {
|
||||
classifier = obiseq.HashClassifier(opts.BatchCount())
|
||||
}
|
||||
|
||||
if opts.SortOnDisk() {
|
||||
nworkers = 1
|
||||
iterator, err = ISequenceChunkOnDisk(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn())
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
} else {
|
||||
iterator, err = ISequenceChunkOnMemory(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
iterator, err = ISequenceChunkOnMemory(iterator, classifier)
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
@@ -63,63 +77,25 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
return neworder
|
||||
}
|
||||
|
||||
var ff func(obiiter.IBioSequence,
|
||||
*obiseq.BioSequenceClassifier,
|
||||
int)
|
||||
|
||||
cat := opts.Categories()
|
||||
na := opts.NAValue()
|
||||
|
||||
ff = func(input obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
icat int) {
|
||||
icat--
|
||||
ff := func(input obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier) {
|
||||
input, err = ISequenceSubChunk(input,
|
||||
classifier,
|
||||
1)
|
||||
|
||||
var next obiiter.IBioSequence
|
||||
if icat >= 0 {
|
||||
next = obiiter.MakeIBioSequence()
|
||||
|
||||
iUnique.Add(1)
|
||||
|
||||
go ff(next,
|
||||
obiseq.AnnotationClassifier(cat[icat], na),
|
||||
icat)
|
||||
}
|
||||
|
||||
o := 0
|
||||
for input.Next() {
|
||||
batch := input.Get()
|
||||
|
||||
if icat < 0 || len(batch.Slice()) == 1 {
|
||||
// No further sub-classification is needed, or the batch holds only a single sequence
|
||||
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
|
||||
iUnique.Push(batch.Reorder(nextOrder()))
|
||||
}
|
||||
} else {
|
||||
// A new classification step must be performed
|
||||
next.Push(batch.Reorder(o))
|
||||
o++
|
||||
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
|
||||
iUnique.Push(batch.Reorder(nextOrder()))
|
||||
}
|
||||
}
|
||||
|
||||
if icat >= 0 {
|
||||
next.Close()
|
||||
}
|
||||
|
||||
iUnique.Done()
|
||||
}
|
||||
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go ff(iterator.Split(),
|
||||
obiseq.SequenceClassifier(),
|
||||
len(cat))
|
||||
go ff(iterator.Split(), obiseq.SequenceClassifier())
|
||||
}
|
||||
go ff(iterator,
|
||||
obiseq.SequenceClassifier(),
|
||||
len(cat))
|
||||
go ff(iterator, obiseq.SequenceClassifier())
|
||||
|
||||
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
|
||||
opts.StatsOn(),
|
||||
|
||||
Reference in New Issue
Block a user