Improved performance and ability to read very long sequences.

This commit is contained in:
2022-08-21 13:38:13 +02:00
parent 5dd835d3e7
commit 9677f531c4

View File

@ -3,24 +3,27 @@ package obiformats
import ( import (
"bufio" "bufio"
"bytes" "bytes"
"compress/gzip"
"io" "io"
log "github.com/sirupsen/logrus"
"os" "os"
"strconv" "strconv"
"strings" "strings"
gzip "github.com/klauspost/pgzip"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
var _FileChunkSize = 1 << 20 var _FileChunkSize = 1 << 26
type _FileChunk struct { type _FileChunk struct {
raw io.Reader raw io.Reader
order int order int
} }
// _EndOfLastEntry looks for the last occurrence of the pattern `<CR>?<LF>//<CR>?<LF>` in the buffer.
func _EndOfLastEntry(buff []byte) int { func _EndOfLastEntry(buff []byte) int {
// 6 5 43 2 1 // 6 5 43 2 1
// <CR>?<LF>//<CR>?<LF> // <CR>?<LF>//<CR>?<LF>
@ -155,15 +158,35 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
l := 0 l := 0
i := 0 i := 0
buff = make([]byte, 1<<20) buff = make([]byte, _FileChunkSize)
for err == nil { for err == nil {
for ; err == nil && l < len(buff); l += size { for ; err == nil && l < len(buff); l += size {
size, err = reader.Read(buff[l:]) size, err = reader.Read(buff[l:])
} }
extbuff := make([]byte, 1<<20)
buff = buff[:l] buff = buff[:l]
end := _EndOfLastEntry(buff) end := 0
ic := 0
for end = _EndOfLastEntry(buff); err == nil && end < 0; end = _EndOfLastEntry(extbuff[:size]) {
ic++
size, err = reader.Read(extbuff)
buff = append(buff, extbuff[:size]...)
}
if ic > 0 {
end = _EndOfLastEntry(buff)
}
remains := buff[end:] remains := buff[end:]
buff = buff[:end] buff = buff[:end]
if ic > 0 {
log.Debugf("EMBL File chunck : final buff size %d bytes (%d extensions)\n",
len(buff),
ic)
}
io := bytes.NewBuffer(buff) io := bytes.NewBuffer(buff)
readers <- _FileChunk{io, i} readers <- _FileChunk{io, i}
i++ i++
@ -175,7 +198,8 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
close(readers) close(readers)
} }
// 6 5 43 2 1 // 6 5 43 2 1
//
// <CR>?<LF>//<CR>?<LF> // <CR>?<LF>//<CR>?<LF>
func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiiter.IBioSequenceBatch { func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiiter.IBioSequenceBatch {
opt := MakeOptions(options) opt := MakeOptions(options)
@ -217,7 +241,8 @@ func ReadEMBLBatchFromFile(filename string, options ...WithOption) (obiiter.IBio
} }
// Test if the flux is compressed by gzip // Test if the flux is compressed by gzip
greader, err = gzip.NewReader(reader) //greader, err = gzip.NewReader(reader)
greader, err = gzip.NewReaderN(reader, 1<<24, 2)
if err == nil { if err == nil {
reader = greader reader = greader
} }