Update obiuniq for very large datasets

This commit is contained in:
Eric Coissac
2025-12-03 11:48:50 +01:00
parent 547135c747
commit ac0d3f3fe4
10 changed files with 281 additions and 56 deletions

View File

@@ -25,18 +25,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
log.Infoln("Starting data splitting")
cat := opts.Categories()
na := opts.NAValue()
var classifier *obiseq.BioSequenceClassifier
if len(cat) > 0 {
cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
for i, c := range cat {
cls[i+1] = obiseq.AnnotationClassifier(c, na)
}
cls[0] = obiseq.HashClassifier(opts.BatchCount())
classifier = obiseq.CompositeClassifier(cls...)
} else {
classifier = obiseq.HashClassifier(opts.BatchCount())
}
if opts.SortOnDisk() {
nworkers = 1
iterator, err = ISequenceChunkOnDisk(iterator,
obiseq.HashClassifier(opts.BatchCount()))
iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn())
if err != nil {
return obiiter.NilIBioSequence, err
}
} else {
iterator, err = ISequenceChunkOnMemory(iterator,
obiseq.HashClassifier(opts.BatchCount()))
iterator, err = ISequenceChunkOnMemory(iterator, classifier)
if err != nil {
return obiiter.NilIBioSequence, err
@@ -63,63 +77,25 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
return neworder
}
var ff func(obiiter.IBioSequence,
*obiseq.BioSequenceClassifier,
int)
cat := opts.Categories()
na := opts.NAValue()
ff = func(input obiiter.IBioSequence,
classifier *obiseq.BioSequenceClassifier,
icat int) {
icat--
ff := func(input obiiter.IBioSequence,
classifier *obiseq.BioSequenceClassifier) {
input, err = ISequenceSubChunk(input,
classifier,
1)
var next obiiter.IBioSequence
if icat >= 0 {
next = obiiter.MakeIBioSequence()
iUnique.Add(1)
go ff(next,
obiseq.AnnotationClassifier(cat[icat], na),
icat)
}
o := 0
for input.Next() {
batch := input.Get()
if icat < 0 || len(batch.Slice()) == 1 {
// No more sub classification of sequence or only a single sequence
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
iUnique.Push(batch.Reorder(nextOrder()))
}
} else {
// A new step of classification must be realized
next.Push(batch.Reorder(o))
o++
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
iUnique.Push(batch.Reorder(nextOrder()))
}
}
if icat >= 0 {
next.Close()
}
iUnique.Done()
}
for i := 0; i < nworkers-1; i++ {
go ff(iterator.Split(),
obiseq.SequenceClassifier(),
len(cat))
go ff(iterator.Split(), obiseq.SequenceClassifier())
}
go ff(iterator,
obiseq.SequenceClassifier(),
len(cat))
go ff(iterator, obiseq.SequenceClassifier())
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
opts.StatsOn(),