Files
obitools4/pkg/obichunk/unique.go

130 lines
2.4 KiB
Go
Raw Normal View History

2022-02-18 09:58:08 +01:00
package obichunk
import (
"sync"
2022-02-24 12:14:52 +01:00
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
2022-02-18 09:58:08 +01:00
)
// Runs dereplication algorithm on a obiiter.IBioSequenceBatch
2022-08-21 14:47:22 +02:00
// iterator.
2023-01-22 22:04:17 +01:00
func IUniqueSequence(iterator obiiter.IBioSequence,
options ...WithOption) (obiiter.IBioSequence, error) {
2022-02-18 09:58:08 +01:00
var err error
opts := MakeOptions(options)
nworkers := opts.ParallelWorkers()
2022-02-18 09:58:08 +01:00
iUnique := obiiter.MakeIBioSequence()
2022-02-18 09:58:08 +01:00
2022-02-24 12:14:52 +01:00
iterator = iterator.Speed("Splitting data set")
log.Infoln("Starting data splitting")
2022-02-18 09:58:08 +01:00
if opts.SortOnDisk() {
nworkers = 1
2022-02-18 09:58:08 +01:00
iterator, err = ISequenceChunkOnDisk(iterator,
obiseq.HashClassifier(opts.BatchCount()))
2022-02-18 09:58:08 +01:00
if err != nil {
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-02-18 09:58:08 +01:00
}
} else {
iterator, err = ISequenceChunk(iterator,
obiseq.HashClassifier(opts.BatchCount()))
2022-02-18 09:58:08 +01:00
if err != nil {
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-02-18 09:58:08 +01:00
}
}
2022-02-24 12:14:52 +01:00
log.Infoln("End of the data splitting")
2022-02-18 09:58:08 +01:00
iUnique.Add(nworkers)
go func() {
iUnique.Wait()
iUnique.Close()
2022-02-18 09:58:08 +01:00
}()
omutex := sync.Mutex{}
order := 0
nextOrder := func() int {
omutex.Lock()
neworder := order
order++
omutex.Unlock()
return neworder
}
2023-01-22 22:04:17 +01:00
var ff func(obiiter.IBioSequence,
*obiseq.BioSequenceClassifier,
int)
2022-02-18 09:58:08 +01:00
cat := opts.Categories()
na := opts.NAValue()
2023-01-22 22:04:17 +01:00
ff = func(input obiiter.IBioSequence,
2022-02-18 22:53:09 +01:00
classifier *obiseq.BioSequenceClassifier,
2022-02-18 09:58:08 +01:00
icat int) {
icat--
input, err = ISequenceSubChunk(input,
classifier,
1)
2022-02-18 09:58:08 +01:00
2023-01-22 22:04:17 +01:00
var next obiiter.IBioSequence
2022-02-18 09:58:08 +01:00
if icat >= 0 {
next = obiiter.MakeIBioSequence()
2022-02-18 09:58:08 +01:00
iUnique.Add(1)
2022-08-21 14:47:22 +02:00
2022-02-18 09:58:08 +01:00
go ff(next,
obiseq.AnnotationClassifier(cat[icat], na),
icat)
}
o := 0
for input.Next() {
batch := input.Get()
2022-02-18 09:58:08 +01:00
if icat < 0 || len(batch.Slice()) == 1 {
2022-08-21 14:47:22 +02:00
// No more sub classification of sequence or only a single sequence
2024-09-24 16:31:30 +02:00
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
2022-02-24 12:14:52 +01:00
iUnique.Push(batch.Reorder(nextOrder()))
}
2022-02-18 09:58:08 +01:00
} else {
2022-08-21 14:47:22 +02:00
// A new step of classification must du realized
next.Push(batch.Reorder(o))
2022-02-18 09:58:08 +01:00
o++
}
}
if icat >= 0 {
next.Close()
2022-02-18 09:58:08 +01:00
}
iUnique.Done()
}
for i := 0; i < nworkers-1; i++ {
go ff(iterator.Split(),
obiseq.SequenceClassifier(),
len(cat))
}
go ff(iterator,
obiseq.SequenceClassifier(),
len(cat))
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
opts.StatsOn(),
2022-02-18 09:58:08 +01:00
)
return iMerged, nil
2022-02-18 09:58:08 +01:00
}