mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
big bug on subschunk
This commit is contained in:
96
pkg/obichunk/chunk_on_disk.go
Normal file
96
pkg/obichunk/chunk_on_disk.go
Normal file
@ -0,0 +1,96 @@
|
||||
package obichunk
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
// tempDir creates a new, uniquely named directory inside the system
// temporary directory. The directory name starts with "obiseq_chunks_".
//
// It returns the path of the created directory, or an empty string
// together with the error reported by ioutil.TempDir when the directory
// cannot be created. Removing the directory is the caller's job.
func tempDir() (string, error) {
	path, err := ioutil.TempDir(os.TempDir(), "obiseq_chunks_")
	if err == nil {
		return path, nil
	}
	return "", err
}
|
||||
|
||||
// find walks the file tree rooted at root and returns the path of every
// non-directory entry whose name has the extension ext. The extension
// is compared against filepath.Ext, so it must include the leading dot
// (for example ".fastx").
//
// Paths are returned in the order filepath.WalkDir visits them, that is
// in lexical order within each directory. Directories are never
// returned, even when their own name carries the extension, but they
// are still descended into.
//
// The walk stops at the first error it meets (for instance when root
// does not exist). That error is logged rather than silently dropped,
// and the matches collected so far are returned.
func find(root, ext string) []string {
	var matches []string

	err := filepath.WalkDir(root, func(path string, entry fs.DirEntry, walkErr error) error {
		// entry may be nil when walkErr is set, so check the error first.
		if walkErr != nil {
			return walkErr
		}
		if !entry.IsDir() && filepath.Ext(entry.Name()) == ext {
			matches = append(matches, path)
		}
		return nil
	})
	if err != nil {
		log.Printf("find: walking %q: %v", root, err)
	}

	return matches
}
|
||||
|
||||
func ISequenceChunkOnDisk(iterator obiseq.IBioSequenceBatch,
|
||||
classifier obiseq.BioSequenceClassifier,
|
||||
sizes ...int) (obiseq.IBioSequenceBatch, error) {
|
||||
dir, err := tempDir()
|
||||
if err != nil {
|
||||
return obiseq.NilIBioSequenceBatch, err
|
||||
}
|
||||
|
||||
bufferSize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
bufferSize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := obiseq.MakeIBioSequenceBatch(bufferSize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
defer func() {
|
||||
os.RemoveAll(dir)
|
||||
log.Println("Clear the cache directory")
|
||||
}()
|
||||
|
||||
newIter.Wait()
|
||||
close(newIter.Channel())
|
||||
}()
|
||||
|
||||
obiformats.WriterDispatcher(dir+"/chunk_%s.fastx",
|
||||
iterator.Distribute(classifier),
|
||||
obiformats.WriteSequencesBatchToFile,
|
||||
)
|
||||
|
||||
fileNames := find(dir, ".fastx")
|
||||
log.Println("batch count ", len(fileNames))
|
||||
|
||||
go func() {
|
||||
|
||||
for order, file := range fileNames {
|
||||
iseq, err := obiformats.ReadSequencesBatchFromFile(file)
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
chunck := make(obiseq.BioSequenceSlice, 0, 1000)
|
||||
|
||||
for iseq.Next() {
|
||||
b := iseq.Get()
|
||||
chunck = append(chunck, b.Slice()...)
|
||||
}
|
||||
|
||||
newIter.Channel() <- obiseq.MakeBioSequenceBatch(order, chunck...)
|
||||
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter, err
|
||||
}
|
Reference in New Issue
Block a user