New, more functional version of obiuniq, but it has a memory leak.

This commit is contained in:
2022-02-18 10:01:23 +01:00
parent 6067b92e2f
commit 37ce3536e1
2 changed files with 40 additions and 17 deletions

View File

@ -9,18 +9,25 @@ var _StatsOn = make([]string, 0, 10)
var _Keys = make([]string, 0, 10)
var _OnDisk = false
var _chunks = 100
var _NAValue = "NA"
func UniqueOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_StatsOn, "merge",
1, 1000,
1, 1,
options.Alias("m"),
options.Description("Adds a merged attribute containing the list of sequence record ids merged within this group."))
options.StringSliceVar(&_Keys, "category-attribute",
1, 1000,
1, 1,
options.Alias("c"),
options.Description("Adds one attribute to the list of attributes used to define sequence groups (this option can be used several times)."))
options.StringVar(&_NAValue, "na-value", _NAValue,
options.Description("Value used when the classifier tag is not defined for a sequence."))
options.BoolVar(&_OnDisk, "on-disk", true,
options.Description("Allows for using a disk cache during the dereplication process. "))
options.IntVar(&_chunks, "chunk-count", _chunks,
options.Description("In how many chunk the dataset is pre-devided for speeding up the process."))
@ -52,3 +59,7 @@ func CLINumberOfChunks() int {
return _chunks
}
// CLINAValue returns the current value of the package-level _NAValue
// variable, i.e. the string bound to the "na-value" command line option.
func CLINAValue() string {
return _NAValue
}

View File

@ -10,29 +10,41 @@ import (
// Unique runs the dereplication of the sequences delivered by the given
// batch iterator and returns an iterator over the result.
//
// NOTE(review): this span comes from a rendered commit diff. Lines removed by
// the commit and lines added by it appear interleaved without +/- markers, so
// the body below mixes two versions of the function (two pipelines, two fatal
// log calls, two return statements). Which line belongs to which version is
// not shown here — confirm against the repository before relying on it.
func Unique(sequences obiseq.IBioSequenceBatch) obiseq.IBioSequenceBatch {
classifier := obiseq.HashClassifier(CLINumberOfChunks())
var newIter obiseq.IBioSequenceBatch
var err error
// Option list later forwarded to obichunk.IUniqueSequence; it starts with
// the batch count taken from CLINumberOfChunks().
options := make([]obichunk.WithOption, 0, 30)
options = append(options,
obichunk.OptionBatchCount(CLINumberOfChunks()),
)
// Choose between the in-memory and the on-disk variant according to
// CLIUniqueInMemory().
if CLIUniqueInMemory() {
log.Printf("Running dereplication in memory on %d chunks", CLINumberOfChunks())
newIter, err = obichunk.ISequenceChunk(sequences, classifier, 2)
options = append(options, obichunk.OptionSortOnMemory())
} else {
log.Printf("Running dereplication on disk with %d chunks", CLINumberOfChunks())
newIter, err = obichunk.ISequenceChunkOnDisk(sequences, classifier, 2)
options = append(options, obichunk.OptionSortOnDisk())
}
// Forward the values returned by CLIStatsOn() and CLIKeys() as options.
options = append(options,
obichunk.OptionStatOn(CLIStatsOn()...))
options = append(options,
obichunk.OptionSubCategory(CLIKeys()...))
// Forward the worker count, buffer size, batch size and NA value read
// from the command line helpers.
options = append(options,
obichunk.OptionsParallelWorkers(
obioptions.CLIParallelWorkers()),
obichunk.OptionsBufferSize(
obioptions.CLIBufferSize()),
obichunk.OptionsBatchSize(
obioptions.CLIBatchSize()),
obichunk.OptionNAValue(CLINAValue()),
)
// Single call taking the input iterator and the collected options.
// NOTE(review): presumably the post-commit replacement for the
// chunk + slice-worker pipeline further down — confirm.
iUnique, err := obichunk.IUniqueSequence(sequences, options...)
// Any error is fatal: the process is terminated through the logger.
if err != nil {
log.Fatalf("error in spliting the dataset : %v", err)
log.Fatal(err)
}
// NOTE(review): the statements from here to "return newIter" look like the
// pre-commit implementation (slice worker applied to the chunked iterator)
// — confirm.
statsOn := CLIStatsOn()
keys := CLIKeys()
parallelWorkers := obioptions.CLIParallelWorkers()
buffSize := obioptions.CLIBufferSize()
newIter = newIter.MakeISliceWorker(obiseq.UniqueSliceWorker(statsOn, keys...),
parallelWorkers, buffSize)
return newIter
return iUnique
}