New, more functional version of obiuniq, but it has a memory leak.

This commit is contained in:
2022-02-18 10:01:23 +01:00
parent 6067b92e2f
commit 37ce3536e1
2 changed files with 40 additions and 17 deletions

View File

@ -9,18 +9,25 @@ var _StatsOn = make([]string, 0, 10)
var _Keys = make([]string, 0, 10) var _Keys = make([]string, 0, 10)
var _OnDisk = false var _OnDisk = false
var _chunks = 100 var _chunks = 100
var _NAValue = "NA"
func UniqueOptionSet(options *getoptions.GetOpt) { func UniqueOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_StatsOn, "merge", options.StringSliceVar(&_StatsOn, "merge",
1, 1000, 1, 1,
options.Alias("m"), options.Alias("m"),
options.Description("Adds a merged attribute containing the list of sequence record ids merged within this group.")) options.Description("Adds a merged attribute containing the list of sequence record ids merged within this group."))
options.StringSliceVar(&_Keys, "category-attribute", options.StringSliceVar(&_Keys, "category-attribute",
1, 1000, 1, 1,
options.Alias("c"), options.Alias("c"),
options.Description("Adds one attribute to the list of attributes used to define sequence groups (this option can be used several times).")) options.Description("Adds one attribute to the list of attributes used to define sequence groups (this option can be used several times)."))
options.StringVar(&_NAValue, "na-value", _NAValue,
options.Description("Value used when the classifier tag is not defined for a sequence."))
options.BoolVar(&_OnDisk, "on-disk", true, options.BoolVar(&_OnDisk, "on-disk", true,
options.Description("Allows for using a disk cache during the dereplication process. ")) options.Description("Allows for using a disk cache during the dereplication process. "))
options.IntVar(&_chunks, "chunk-count", _chunks, options.IntVar(&_chunks, "chunk-count", _chunks,
options.Description("In how many chunk the dataset is pre-devided for speeding up the process.")) options.Description("In how many chunk the dataset is pre-devided for speeding up the process."))
@ -52,3 +59,7 @@ func CLINumberOfChunks() int {
return _chunks return _chunks
} }
// CLINAValue returns the current value of the package-level _NAValue
// variable, which is bound to the "na-value" command-line option and
// defaults to "NA". Per that option's description, it is the value used
// when the classifier tag is not defined for a sequence.
func CLINAValue() string {
return _NAValue
}

View File

@ -10,29 +10,41 @@ import (
func Unique(sequences obiseq.IBioSequenceBatch) obiseq.IBioSequenceBatch { func Unique(sequences obiseq.IBioSequenceBatch) obiseq.IBioSequenceBatch {
classifier := obiseq.HashClassifier(CLINumberOfChunks()) options := make([]obichunk.WithOption, 0, 30)
var newIter obiseq.IBioSequenceBatch
var err error options = append(options,
obichunk.OptionBatchCount(CLINumberOfChunks()),
)
if CLIUniqueInMemory() { if CLIUniqueInMemory() {
log.Printf("Running dereplication in memory on %d chunks", CLINumberOfChunks()) log.Printf("Running dereplication in memory on %d chunks", CLINumberOfChunks())
newIter, err = obichunk.ISequenceChunk(sequences, classifier, 2) options = append(options, obichunk.OptionSortOnMemory())
} else { } else {
log.Printf("Running dereplication on disk with %d chunks", CLINumberOfChunks()) log.Printf("Running dereplication on disk with %d chunks", CLINumberOfChunks())
newIter, err = obichunk.ISequenceChunkOnDisk(sequences, classifier, 2) options = append(options, obichunk.OptionSortOnDisk())
} }
options = append(options,
obichunk.OptionStatOn(CLIStatsOn()...))
options = append(options,
obichunk.OptionSubCategory(CLIKeys()...))
options = append(options,
obichunk.OptionsParallelWorkers(
obioptions.CLIParallelWorkers()),
obichunk.OptionsBufferSize(
obioptions.CLIBufferSize()),
obichunk.OptionsBatchSize(
obioptions.CLIBatchSize()),
obichunk.OptionNAValue(CLINAValue()),
)
iUnique, err := obichunk.IUniqueSequence(sequences, options...)
if err != nil { if err != nil {
log.Fatalf("error in spliting the dataset : %v", err) log.Fatal(err)
} }
statsOn := CLIStatsOn() return iUnique
keys := CLIKeys()
parallelWorkers := obioptions.CLIParallelWorkers()
buffSize := obioptions.CLIBufferSize()
newIter = newIter.MakeISliceWorker(obiseq.UniqueSliceWorker(statsOn, keys...),
parallelWorkers, buffSize)
return newIter
} }