Files
obitools4/pkg/obichunk/chunk_on_disk.go

91 lines
1.7 KiB
Go
Raw Normal View History

2022-02-18 09:58:08 +01:00
package obichunk
import (
"io/fs"
"io/ioutil"
"os"
"path/filepath"
2022-02-24 12:14:52 +01:00
log "github.com/sirupsen/logrus"
2022-02-18 09:58:08 +01:00
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
2022-02-18 09:58:08 +01:00
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
// tempDir creates a new, uniquely named directory whose name starts with
// "obiseq_chunks_" inside the system temporary directory (os.TempDir) and
// returns its path. On failure the returned path is empty and the error
// reported by ioutil.TempDir is passed through unchanged.
func tempDir() (string, error) {
	base := os.TempDir()

	path, err := ioutil.TempDir(base, "obiseq_chunks_")
	if err == nil {
		return path, nil
	}

	return "", err
}
// find walks the file tree rooted at root and returns the paths of all
// non-directory entries whose name has the extension ext (as reported by
// filepath.Ext, i.e. including the leading dot, e.g. ".fastx").
//
// Paths are returned in the order filepath.WalkDir visits them. Directories
// are never returned, even when their name ends with ext. If the walk hits an
// error (for instance root does not exist) it stops, and whatever was
// collected up to that point, possibly nothing, is returned.
func find(root, ext string) []string {
	var matches []string

	// The walk error is deliberately discarded: the signature has no error
	// result, so a failed walk surfaces as a shorter (or empty) result.
	_ = filepath.WalkDir(root, func(path string, entry fs.DirEntry, err error) error {
		if err != nil {
			return err
		}

		// Only files are wanted; a directory named like "x<ext>" must not
		// be reported as a match.
		if !entry.IsDir() && filepath.Ext(entry.Name()) == ext {
			matches = append(matches, path)
		}

		return nil
	})

	return matches
}
2023-01-22 22:04:17 +01:00
func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) {
2022-02-18 09:58:08 +01:00
dir, err := tempDir()
if err != nil {
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-02-18 09:58:08 +01:00
}
newIter := obiiter.MakeIBioSequence()
2022-02-18 09:58:08 +01:00
newIter.Add(1)
go func() {
defer func() {
os.RemoveAll(dir)
2022-02-24 12:14:52 +01:00
log.Debugln("Clear the cache directory")
2022-02-18 09:58:08 +01:00
}()
newIter.Wait()
newIter.Close()
2022-02-18 09:58:08 +01:00
}()
obiformats.WriterDispatcher(dir+"/chunk_%s.fastx",
iterator.Distribute(classifier),
2022-11-16 17:13:40 +01:00
obiformats.WriteSequencesToFile,
2022-02-18 09:58:08 +01:00
)
fileNames := find(dir, ".fastx")
2022-02-24 12:14:52 +01:00
nbatch := len(fileNames)
log.Infof("Data splitted over %d batches", nbatch)
2022-02-18 09:58:08 +01:00
go func() {
for order, file := range fileNames {
2022-11-16 17:13:40 +01:00
iseq, err := obiformats.ReadSequencesFromFile(file)
2022-02-18 09:58:08 +01:00
if err != nil {
panic(err)
}
2022-05-27 11:53:29 +03:00
chunck := iseq.Load()
2022-02-18 09:58:08 +01:00
newIter.Push(obiiter.MakeBioSequenceBatch(order, chunck))
2022-02-24 12:14:52 +01:00
log.Infof("Start processing of batch %d/%d : %d sequences",
order, nbatch, len(chunck))
2022-02-18 09:58:08 +01:00
}
newIter.Done()
}()
return newIter, err
}