Update of obipcr and homogenization of logging

Former-commit-id: 46abf47c19ace5248042c02cf1f81d9f6c12eb10
This commit is contained in:
Eric Coissac
2024-05-16 15:18:30 +02:00
parent 61be8a55b1
commit 55ce36f329
27 changed files with 345 additions and 58 deletions

View File

@@ -1,7 +1,7 @@
package obiformats
import (
"log"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

View File

@@ -179,7 +179,8 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
newIter.Add(1)
go _ParseEmblFile(opt.Source(), entry_channel, newIter,
opt.WithFeatureTable(),
opt.BatchSize(), opt.TotalSeqSize())
opt.BatchSize(),
opt.TotalSeqSize())
}
go func() {

View File

@@ -9,7 +9,6 @@ import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
@@ -43,7 +42,11 @@ func _EndOfLastFastaEntry(buffer []byte) int {
func _ParseFastaFile(source string,
input ChannelSeqFileChunk,
out obiiter.IBioSequence) {
out obiiter.IBioSequence,
no_order bool,
batch_size int,
chunck_order func() int,
) {
var identifier string
var definition string
@@ -56,7 +59,7 @@ func _ParseFastaFile(source string,
for chunks := range input {
scanner := bufio.NewReader(chunks.raw)
sequences := make(obiseq.BioSequenceSlice, 0, 100)
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
is_end_of_line := C == '\r' || C == '\n'
@@ -130,6 +133,12 @@ func _ParseFastaFile(source string,
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
s.SetSource(source)
sequences = append(sequences, s)
if no_order {
if len(sequences) == batch_size {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
}
}
state = 1
} else if !is_sep {
@@ -145,8 +154,11 @@ func _ParseFastaFile(source string,
}
if len(sequences) > 0 {
log.Debugln("Pushing sequences")
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
if no_order {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
} else {
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
}
}
}
@@ -158,14 +170,19 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
opt := MakeOptions(options)
out := obiiter.MakeIBioSequence()
source := opt.Source()
nworker := opt.ParallelWorkers()
nworker := obioptions.CLIReadParallelWorkers()
chkchan := ReadSeqFileChunk(reader, _EndOfLastFastaEntry)
chunck_order := obiutils.AtomicCounter()
for i := 0; i < nworker; i++ {
out.Add(1)
go _ParseFastaFile(source, chkchan, out)
go _ParseFastaFile(opt.Source(),
chkchan,
out,
opt.NoOrder(),
opt.BatchSize(),
chunck_order)
}
go func() {

View File

@@ -184,7 +184,11 @@ func lastFastqCut(buffer []byte) ([]byte, []byte) {
func _ParseFastqFile(source string,
input ChannelSeqFileChunk,
out obiiter.IBioSequence,
quality_shift byte) {
quality_shift byte,
no_order bool,
batch_size int,
chunck_order func() int,
) {
var identifier string
var definition string
@@ -311,6 +315,14 @@ func _ParseFastqFile(source string,
q[i] = q[i] - quality_shift
}
sequences[len(sequences)-1].SetQualities(q)
if no_order {
if len(sequences) == batch_size {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
}
}
state = 11
} else {
qualBytes.WriteByte(C)
@@ -328,9 +340,13 @@ func _ParseFastqFile(source string,
}
if len(sequences) > 0 {
log.Debugln("Pushing sequences")
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
if no_order {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
} else {
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
}
}
}
out.Done()
@@ -341,15 +357,20 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
opt := MakeOptions(options)
out := obiiter.MakeIBioSequence()
source := opt.Source()
nworker := opt.ParallelWorkers()
chunkorder := obiutils.AtomicCounter()
nworker := obioptions.CLIReadParallelWorkers()
chkchan := ReadSeqFileChunk(reader, _EndOfLastFastqEntry)
for i := 0; i < nworker; i++ {
out.Add(1)
go _ParseFastqFile(source, chkchan, out,
byte(obioptions.InputQualityShift()))
go _ParseFastqFile(opt.Source(),
chkchan,
out,
byte(obioptions.InputQualityShift()),
opt.NoOrder(),
opt.BatchSize(),
chunkorder)
}
go func() {

View File

@@ -3,12 +3,13 @@ package obiformats
import (
"bytes"
"fmt"
"log"
"math"
"regexp"
"strconv"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/goccy/go-json"

View File

@@ -115,7 +115,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
options ...WithOption) (obiiter.IBioSequence, error) {
opt := MakeOptions(options)
iterator = iterator.Rebatch(1000)
iterator = iterator.Rebatch(opt.BatchSize())
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
newIter := obiiter.MakeIBioSequence()

View File

@@ -62,9 +62,8 @@ func WriteFastq(iterator obiiter.IBioSequence,
file io.WriteCloser,
options ...WithOption) (obiiter.IBioSequence, error) {
iterator = iterator.Rebatch(1000)
opt := MakeOptions(options)
iterator = iterator.Rebatch(opt.BatchSize())
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())

View File

@@ -14,6 +14,7 @@ type __options__ struct {
total_seq_size int
full_file_batch bool
parallel_workers int
no_order bool
closefile bool
appendfile bool
compressed bool
@@ -48,6 +49,7 @@ func MakeOptions(setters []WithOption) Options {
parallel_workers: obioptions.CLIReadParallelWorkers(),
batch_size: obioptions.CLIBatchSize(),
total_seq_size: 1024 * 1024 * 100, // 100 MB by default
no_order: false,
full_file_batch: false,
closefile: false,
appendfile: false,
@@ -101,6 +103,10 @@ func (opt Options) FormatFastSeqHeader() func(*obiseq.BioSequence) string {
return opt.pointer.fastseq_header_writer
}
// NoOrder returns the current value of the no_order field held by the
// options. The field is set through OptionNoOrder.
func (opt Options) NoOrder() bool {
	unordered := opt.pointer.no_order
	return unordered
}
// ProgressBar returns the current value of the with_progress_bar field held
// by the options.
func (opt Options) ProgressBar() bool {
	enabled := opt.pointer.with_progress_bar
	return enabled
}
@@ -205,6 +211,16 @@ func OptionsAppendFile(append bool) WithOption {
return f
}
// OptionNoOrder builds a WithOption which, when applied to an Options value,
// stores no_order in that value's no_order field.
func OptionNoOrder(no_order bool) WithOption {
	return WithOption(func(opt Options) {
		opt.pointer.no_order = no_order
	})
}
func OptionsCompressed(compressed bool) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.compressed = compressed