mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Few debug on obidistribute and some progresses on obiunique, but -c and -m options are still not working
This commit is contained in:
@ -54,11 +54,12 @@ func InterfaceToInt(i interface{}) (val int, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
func IsAnInt(i interface{}) bool {
|
||||
func CastableToInt(i interface{}) bool {
|
||||
|
||||
switch i.(type) {
|
||||
case int,
|
||||
int8, int16, int32, int64,
|
||||
float32, float64,
|
||||
uint8, uint16, uint32, uint64:
|
||||
return true
|
||||
default:
|
||||
|
@ -6,6 +6,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
@ -33,7 +34,9 @@ func find(root, ext string) []string {
|
||||
return a
|
||||
}
|
||||
|
||||
func ISequenceChunk(iterator obiseq.IBioSequenceBatch, size int, sizes ...int) (obiseq.IBioSequenceBatch, error) {
|
||||
func ISequenceChunkOnDisk(iterator obiseq.IBioSequenceBatch,
|
||||
classifier obiseq.SequenceClassifier,
|
||||
sizes ...int) (obiseq.IBioSequenceBatch, error) {
|
||||
dir, err := tempDir()
|
||||
if err != nil {
|
||||
return obiseq.NilIBioSequenceBatch, err
|
||||
@ -50,15 +53,18 @@ func ISequenceChunk(iterator obiseq.IBioSequenceBatch, size int, sizes ...int) (
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
os.RemoveAll(dir)
|
||||
log.Println("Clear the cache directory")
|
||||
}()
|
||||
|
||||
newIter.Wait()
|
||||
close(newIter.Channel())
|
||||
log.Println("====>> clear diectory")
|
||||
os.RemoveAll(dir)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
obiformats.WriterDispatcher(dir+"/chunk_%s.fastx",
|
||||
iterator.Distribute(obiseq.HashClassifier(size)),
|
||||
iterator.Distribute(classifier),
|
||||
obiformats.WriteSequencesBatchToFile,
|
||||
)
|
||||
|
||||
@ -71,7 +77,7 @@ func ISequenceChunk(iterator obiseq.IBioSequenceBatch, size int, sizes ...int) (
|
||||
panic(err)
|
||||
}
|
||||
|
||||
chunck := make(obiseq.BioSequenceSlice, 0, 3*size)
|
||||
chunck := make(obiseq.BioSequenceSlice, 0, 1000)
|
||||
|
||||
for iseq.Next() {
|
||||
b := iseq.Get()
|
||||
@ -89,3 +95,70 @@ func ISequenceChunk(iterator obiseq.IBioSequenceBatch, size int, sizes ...int) (
|
||||
|
||||
return newIter, err
|
||||
}
|
||||
|
||||
func ISequenceChunk(iterator obiseq.IBioSequenceBatch,
|
||||
classifier obiseq.SequenceClassifier,
|
||||
sizes ...int) (obiseq.IBioSequenceBatch, error) {
|
||||
|
||||
bufferSize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
bufferSize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := obiseq.MakeIBioSequenceBatch(bufferSize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.Channel())
|
||||
}()
|
||||
|
||||
go func() {
|
||||
lock := sync.Mutex{}
|
||||
|
||||
dispatcher := iterator.Distribute(classifier)
|
||||
|
||||
jobDone := sync.WaitGroup{}
|
||||
chunks := make(map[string]*obiseq.BioSequenceSlice, 100)
|
||||
|
||||
for newflux := range dispatcher.News() {
|
||||
jobDone.Add(1)
|
||||
go func(newflux string) {
|
||||
data, err := dispatcher.Outputs(newflux)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot retreive the new chanel : %v", err)
|
||||
}
|
||||
|
||||
chunk := make(obiseq.BioSequenceSlice, 0, 1000)
|
||||
|
||||
for data.Next() {
|
||||
b := data.Get()
|
||||
chunk = append(chunk, b.Slice()...)
|
||||
}
|
||||
|
||||
lock.Lock()
|
||||
chunks[newflux] = &chunk
|
||||
lock.Unlock()
|
||||
jobDone.Done()
|
||||
}(newflux)
|
||||
}
|
||||
|
||||
jobDone.Wait()
|
||||
order := 0
|
||||
|
||||
for _, chunck := range chunks {
|
||||
|
||||
if len(*chunck) > 0 {
|
||||
newIter.Channel() <- obiseq.MakeBioSequenceBatch(order, *chunck...)
|
||||
order++
|
||||
}
|
||||
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
@ -7,6 +7,8 @@ import (
|
||||
|
||||
var _StatsOn = make([]string, 0, 10)
|
||||
var _Keys = make([]string, 0, 10)
|
||||
var _OnDisk = false
|
||||
var _chunks = 100
|
||||
|
||||
func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringSliceVar(&_StatsOn, "merge",
|
||||
@ -17,6 +19,10 @@ func UniqueOptionSet(options *getoptions.GetOpt) {
|
||||
1, 1000,
|
||||
options.Alias("c"),
|
||||
options.Description("Adds one attribute to the list of attributes used to define sequence groups (this option can be used several times)."))
|
||||
options.BoolVar(&_OnDisk, "on-disk", true,
|
||||
options.Description("Allows for using a disk cache during the dereplication process. "))
|
||||
options.IntVar(&_chunks, "chunk-count", _chunks,
|
||||
options.Description("In how many chunk the dataset is pre-devided for speeding up the process."))
|
||||
|
||||
}
|
||||
|
||||
@ -34,3 +40,15 @@ func CLIStatsOn() []string {
|
||||
func CLIKeys() []string {
|
||||
return _Keys
|
||||
}
|
||||
|
||||
func CLIUniqueInMemory() bool {
|
||||
return _OnDisk
|
||||
}
|
||||
|
||||
func CLINumberOfChunks() int {
|
||||
if _chunks <= 1 {
|
||||
return 1
|
||||
}
|
||||
|
||||
return _chunks
|
||||
}
|
||||
|
@ -10,7 +10,17 @@ import (
|
||||
|
||||
func Unique(sequences obiseq.IBioSequenceBatch) obiseq.IBioSequenceBatch {
|
||||
|
||||
newIter, err := obichunk.ISequenceChunk(sequences, 100, 2)
|
||||
classifier := obiseq.HashClassifier(CLINumberOfChunks())
|
||||
var newIter obiseq.IBioSequenceBatch
|
||||
var err error
|
||||
|
||||
if CLIUniqueInMemory() {
|
||||
log.Printf("Running dereplication in memory on %d chunks", CLINumberOfChunks())
|
||||
newIter, err = obichunk.ISequenceChunk(sequences, classifier, 2)
|
||||
} else {
|
||||
log.Printf("Running dereplication on disk with %d chunks", CLINumberOfChunks())
|
||||
newIter, err = obichunk.ISequenceChunkOnDisk(sequences, classifier, 2)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("error in spliting the dataset : %v", err)
|
||||
|
Reference in New Issue
Block a user