mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 16:50:27 +00:00
refactoring of the file chunck writing
This commit is contained in:
@@ -159,7 +159,7 @@ func EmblChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.BioS
|
||||
}
|
||||
|
||||
func _ParseEmblFile(
|
||||
input ChannelSeqFileChunk,
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
withFeatureTable bool,
|
||||
) {
|
||||
@@ -189,7 +189,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
||||
|
||||
buff := make([]byte, 1024*1024*128) // 128 MB
|
||||
|
||||
entry_channel := ReadSeqFileChunk(
|
||||
entry_channel := ReadFileChunk(
|
||||
opt.Source(),
|
||||
reader,
|
||||
buff,
|
||||
|
||||
@@ -205,7 +205,7 @@ func FastaChunkParser() func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
||||
}
|
||||
|
||||
func _ParseFastaFile(
|
||||
input ChannelSeqFileChunk,
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
) {
|
||||
|
||||
@@ -213,6 +213,7 @@ func _ParseFastaFile(
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
// log.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||
@@ -234,7 +235,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
|
||||
buff := make([]byte, 1024*1024)
|
||||
|
||||
chkchan := ReadSeqFileChunk(
|
||||
chkchan := ReadFileChunk(
|
||||
opt.Source(),
|
||||
reader,
|
||||
buff,
|
||||
|
||||
@@ -296,7 +296,7 @@ func FastqChunkParser(quality_shift byte) func(string, io.Reader) (obiseq.BioSeq
|
||||
}
|
||||
|
||||
func _ParseFastqFile(
|
||||
input ChannelSeqFileChunk,
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
quality_shift byte,
|
||||
) {
|
||||
@@ -326,7 +326,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
|
||||
buff := make([]byte, 1024*1024)
|
||||
|
||||
chkchan := ReadSeqFileChunk(
|
||||
chkchan := ReadFileChunk(
|
||||
opt.Source(),
|
||||
reader,
|
||||
buff,
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
@@ -132,7 +133,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
chunkchan := WriteSeqFileChunk(file, opt.CloseFile())
|
||||
chunkchan := WriteFileChunk(file, opt.CloseFile())
|
||||
|
||||
header_format := opt.FormatFastSeqHeader()
|
||||
|
||||
@@ -140,6 +141,9 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
for len(chunkchan) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
log.Debugf("Writing fasta file done")
|
||||
}()
|
||||
@@ -151,7 +155,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
log.Debugf("Formating fasta chunk %d", batch.Order())
|
||||
|
||||
chunkchan <- SeqFileChunk{
|
||||
chunkchan <- FileChunk{
|
||||
Source: batch.Source(),
|
||||
Raw: FormatFastaBatch(batch, header_format, opt.SkipEmptySequence()),
|
||||
Order: batch.Order(),
|
||||
@@ -166,7 +170,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
log.Debugln("Start of the fasta file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
for i := 1; i < nwriters; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ import (
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
@@ -87,11 +86,6 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch,
|
||||
return &bs
|
||||
}
|
||||
|
||||
type FileChunk struct {
|
||||
text []byte
|
||||
order int
|
||||
}
|
||||
|
||||
func WriteFastq(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
@@ -104,27 +98,25 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
chunkchan := WriteSeqFileChunk(file, opt.CloseFile())
|
||||
chunkchan := WriteFileChunk(file, opt.CloseFile())
|
||||
|
||||
header_format := opt.FormatFastSeqHeader()
|
||||
|
||||
newIter.Add(nwriters)
|
||||
|
||||
var waitWriter sync.WaitGroup
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
for len(chunkchan) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
waitWriter.Wait()
|
||||
log.Debugf("Writing fastq file done")
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
chunk := SeqFileChunk{
|
||||
chunk := FileChunk{
|
||||
Source: batch.Source(),
|
||||
Raw: FormatFastqBatch(batch, header_format, opt.SkipEmptySequence()),
|
||||
Order: batch.Order(),
|
||||
@@ -137,7 +129,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
|
||||
log.Debugln("Start of the fastq file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
for i := 1; i < nwriters; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
|
||||
@@ -11,13 +11,13 @@ import (
|
||||
|
||||
type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
||||
|
||||
type SeqFileChunk struct {
|
||||
type FileChunk struct {
|
||||
Source string
|
||||
Raw *bytes.Buffer
|
||||
Order int
|
||||
}
|
||||
|
||||
type ChannelSeqFileChunk chan SeqFileChunk
|
||||
type ChannelFileChunk chan FileChunk
|
||||
|
||||
type LastSeqRecord func([]byte) int
|
||||
|
||||
@@ -34,15 +34,15 @@ type LastSeqRecord func([]byte) int
|
||||
//
|
||||
// Returns:
|
||||
// None
|
||||
func ReadSeqFileChunk(
|
||||
func ReadFileChunk(
|
||||
source string,
|
||||
reader io.Reader,
|
||||
buff []byte,
|
||||
splitter LastSeqRecord) ChannelSeqFileChunk {
|
||||
splitter LastSeqRecord) ChannelFileChunk {
|
||||
var err error
|
||||
var fullbuff []byte
|
||||
|
||||
chunk_channel := make(ChannelSeqFileChunk)
|
||||
chunk_channel := make(ChannelFileChunk)
|
||||
|
||||
fileChunkSize := len(buff)
|
||||
|
||||
@@ -95,8 +95,10 @@ func ReadSeqFileChunk(
|
||||
}
|
||||
|
||||
if len(buff) > 0 {
|
||||
io := bytes.NewBuffer(slices.Clone(buff))
|
||||
chunk_channel <- SeqFileChunk{source, io, i}
|
||||
cbuff := slices.Clone(buff)
|
||||
io := bytes.NewBuffer(cbuff)
|
||||
// log.Warnf("chuck %d :Read %d bytes from file %s", i, io.Len(), source)
|
||||
chunk_channel <- FileChunk{source, io, i}
|
||||
i++
|
||||
}
|
||||
|
||||
@@ -120,7 +122,7 @@ func ReadSeqFileChunk(
|
||||
// Send the last chunk to the channel
|
||||
if len(buff) > 0 {
|
||||
io := bytes.NewBuffer(slices.Clone(buff))
|
||||
chunk_channel <- SeqFileChunk{source, io, i}
|
||||
chunk_channel <- FileChunk{source, io, i}
|
||||
}
|
||||
|
||||
// Close the readers channel when the end of the file is reached
|
||||
@@ -8,16 +8,16 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func WriteSeqFileChunk(
|
||||
func WriteFileChunk(
|
||||
writer io.WriteCloser,
|
||||
toBeClosed bool) ChannelSeqFileChunk {
|
||||
toBeClosed bool) ChannelFileChunk {
|
||||
|
||||
obiiter.RegisterAPipe()
|
||||
chunk_channel := make(ChannelSeqFileChunk)
|
||||
chunk_channel := make(ChannelFileChunk)
|
||||
|
||||
go func() {
|
||||
nextToPrint := 0
|
||||
toBePrinted := make(map[int]SeqFileChunk)
|
||||
toBePrinted := make(map[int]FileChunk)
|
||||
for chunk := range chunk_channel {
|
||||
if chunk.Order == nextToPrint {
|
||||
log.Debugf("Writing chunk: %d of length %d bytes",
|
||||
@@ -198,7 +198,7 @@ func GenbankChunkParser(withFeatureTable bool) func(string, io.Reader) (obiseq.B
|
||||
}
|
||||
}
|
||||
|
||||
func _ParseGenbankFile(input ChannelSeqFileChunk,
|
||||
func _ParseGenbankFile(input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
withFeatureTable bool) {
|
||||
|
||||
@@ -225,7 +225,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
||||
|
||||
buff := make([]byte, 1024*1024*128) // 128 MB
|
||||
|
||||
entry_channel := ReadSeqFileChunk(
|
||||
entry_channel := ReadFileChunk(
|
||||
opt.Source(),
|
||||
reader,
|
||||
buff,
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/goccy/go-json"
|
||||
@@ -58,9 +57,17 @@ func JSONRecord(sequence *obiseq.BioSequence) []byte {
|
||||
return text
|
||||
}
|
||||
|
||||
func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
|
||||
func FormatJSONBatch(batch obiiter.BioSequenceBatch) *bytes.Buffer {
|
||||
buff := new(bytes.Buffer)
|
||||
|
||||
json := bufio.NewWriter(buff)
|
||||
|
||||
if batch.Order() == 0 {
|
||||
json.WriteString("[\n")
|
||||
} else {
|
||||
json.WriteString(",\n")
|
||||
}
|
||||
|
||||
n := batch.Slice().Len() - 1
|
||||
for i, s := range batch.Slice() {
|
||||
json.WriteString(" ")
|
||||
@@ -71,8 +78,7 @@ func FormatJSONBatch(batch obiiter.BioSequenceBatch) []byte {
|
||||
}
|
||||
|
||||
json.Flush()
|
||||
|
||||
return buff.Bytes()
|
||||
return buff
|
||||
}
|
||||
|
||||
func WriteJSON(iterator obiiter.IBioSequence,
|
||||
@@ -84,14 +90,10 @@ func WriteJSON(iterator obiiter.IBioSequence,
|
||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
obiiter.RegisterAPipe()
|
||||
chunkchan := make(chan FileChunk)
|
||||
|
||||
chunkchan := WriteFileChunk(file, opt.CloseFile())
|
||||
newIter.Add(nwriters)
|
||||
var waitWriter sync.WaitGroup
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
@@ -99,7 +101,6 @@ func WriteJSON(iterator obiiter.IBioSequence,
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
waitWriter.Wait()
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
@@ -107,62 +108,31 @@ func WriteJSON(iterator obiiter.IBioSequence,
|
||||
|
||||
batch := iterator.Get()
|
||||
|
||||
chunkchan <- FileChunk{
|
||||
FormatJSONBatch(batch),
|
||||
batch.Order(),
|
||||
ss := FileChunk{
|
||||
Source: batch.Source(),
|
||||
Raw: FormatJSONBatch(batch),
|
||||
Order: batch.Order(),
|
||||
}
|
||||
|
||||
chunkchan <- ss
|
||||
newIter.Push(batch)
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]FileChunk, 100)
|
||||
|
||||
waitWriter.Add(1)
|
||||
go func() {
|
||||
for chunk := range chunkchan {
|
||||
if chunk.order == next_to_send {
|
||||
if next_to_send > 0 {
|
||||
file.Write([]byte(",\n"))
|
||||
}
|
||||
file.Write(chunk.text)
|
||||
next_to_send++
|
||||
chunk, ok := received[next_to_send]
|
||||
for ok {
|
||||
file.Write(chunk.text)
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
chunk, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[chunk.order] = chunk
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
file.Write([]byte("\n]\n"))
|
||||
file.Close()
|
||||
|
||||
log.Debugln("End of the JSON file writing")
|
||||
obiiter.UnregisterPipe()
|
||||
waitWriter.Done()
|
||||
|
||||
}()
|
||||
|
||||
log.Debugln("Start of the JSON file writing")
|
||||
file.Write([]byte("[\n"))
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
for i := 1; i < nwriters; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
go ff(iterator)
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
func WriteJSONToStdout(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionDontCloseFile())
|
||||
options = append(options, OptionCloseFile())
|
||||
|
||||
return WriteJSON(iterator, os.Stdout, options...)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user