mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 08:40:26 +00:00
Update of obipcr and homogenization of logging
Former-commit-id: 46abf47c19ace5248042c02cf1f81d9f6c12eb10
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"log"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
@@ -179,7 +179,8 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
newIter.Add(1)
|
||||
go _ParseEmblFile(opt.Source(), entry_channel, newIter,
|
||||
opt.WithFeatureTable(),
|
||||
opt.BatchSize(), opt.TotalSeqSize())
|
||||
opt.BatchSize(),
|
||||
opt.TotalSeqSize())
|
||||
}
|
||||
|
||||
go func() {
|
||||
|
||||
@@ -9,7 +9,6 @@ import (
|
||||
"slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
@@ -43,7 +42,11 @@ func _EndOfLastFastaEntry(buffer []byte) int {
|
||||
|
||||
func _ParseFastaFile(source string,
|
||||
input ChannelSeqFileChunk,
|
||||
out obiiter.IBioSequence) {
|
||||
out obiiter.IBioSequence,
|
||||
no_order bool,
|
||||
batch_size int,
|
||||
chunck_order func() int,
|
||||
) {
|
||||
|
||||
var identifier string
|
||||
var definition string
|
||||
@@ -56,7 +59,7 @@ func _ParseFastaFile(source string,
|
||||
|
||||
for chunks := range input {
|
||||
scanner := bufio.NewReader(chunks.raw)
|
||||
sequences := make(obiseq.BioSequenceSlice, 0, 100)
|
||||
sequences := make(obiseq.BioSequenceSlice, 0, batch_size)
|
||||
for C, err := scanner.ReadByte(); err != io.EOF; C, err = scanner.ReadByte() {
|
||||
|
||||
is_end_of_line := C == '\r' || C == '\n'
|
||||
@@ -130,6 +133,12 @@ func _ParseFastaFile(source string,
|
||||
s := obiseq.NewBioSequence(identifier, slices.Clone(seqBytes.Bytes()), definition)
|
||||
s.SetSource(source)
|
||||
sequences = append(sequences, s)
|
||||
if no_order {
|
||||
if len(sequences) == batch_size {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
|
||||
}
|
||||
}
|
||||
state = 1
|
||||
|
||||
} else if !is_sep {
|
||||
@@ -145,8 +154,11 @@ func _ParseFastaFile(source string,
|
||||
}
|
||||
|
||||
if len(sequences) > 0 {
|
||||
log.Debugln("Pushing sequences")
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
||||
if no_order {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||
} else {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,14 +170,19 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
opt := MakeOptions(options)
|
||||
out := obiiter.MakeIBioSequence()
|
||||
|
||||
source := opt.Source()
|
||||
nworker := opt.ParallelWorkers()
|
||||
|
||||
nworker := obioptions.CLIReadParallelWorkers()
|
||||
chkchan := ReadSeqFileChunk(reader, _EndOfLastFastaEntry)
|
||||
chunck_order := obiutils.AtomicCounter()
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
out.Add(1)
|
||||
go _ParseFastaFile(source, chkchan, out)
|
||||
go _ParseFastaFile(opt.Source(),
|
||||
chkchan,
|
||||
out,
|
||||
opt.NoOrder(),
|
||||
opt.BatchSize(),
|
||||
chunck_order)
|
||||
}
|
||||
|
||||
go func() {
|
||||
|
||||
@@ -184,7 +184,11 @@ func lastFastqCut(buffer []byte) ([]byte, []byte) {
|
||||
func _ParseFastqFile(source string,
|
||||
input ChannelSeqFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
quality_shift byte) {
|
||||
quality_shift byte,
|
||||
no_order bool,
|
||||
batch_size int,
|
||||
chunck_order func() int,
|
||||
) {
|
||||
|
||||
var identifier string
|
||||
var definition string
|
||||
@@ -311,6 +315,14 @@ func _ParseFastqFile(source string,
|
||||
q[i] = q[i] - quality_shift
|
||||
}
|
||||
sequences[len(sequences)-1].SetQualities(q)
|
||||
|
||||
if no_order {
|
||||
if len(sequences) == batch_size {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||
sequences = make(obiseq.BioSequenceSlice, 0, batch_size)
|
||||
}
|
||||
}
|
||||
|
||||
state = 11
|
||||
} else {
|
||||
qualBytes.WriteByte(C)
|
||||
@@ -328,9 +340,13 @@ func _ParseFastqFile(source string,
|
||||
}
|
||||
|
||||
if len(sequences) > 0 {
|
||||
log.Debugln("Pushing sequences")
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
||||
if no_order {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||
} else {
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.order, sequences))
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
out.Done()
|
||||
@@ -341,15 +357,20 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
opt := MakeOptions(options)
|
||||
out := obiiter.MakeIBioSequence()
|
||||
|
||||
source := opt.Source()
|
||||
nworker := opt.ParallelWorkers()
|
||||
chunkorder := obiutils.AtomicCounter()
|
||||
|
||||
nworker := obioptions.CLIReadParallelWorkers()
|
||||
chkchan := ReadSeqFileChunk(reader, _EndOfLastFastqEntry)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
out.Add(1)
|
||||
go _ParseFastqFile(source, chkchan, out,
|
||||
byte(obioptions.InputQualityShift()))
|
||||
go _ParseFastqFile(opt.Source(),
|
||||
chkchan,
|
||||
out,
|
||||
byte(obioptions.InputQualityShift()),
|
||||
opt.NoOrder(),
|
||||
opt.BatchSize(),
|
||||
chunkorder)
|
||||
}
|
||||
|
||||
go func() {
|
||||
|
||||
@@ -3,12 +3,13 @@ package obiformats
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"log"
|
||||
"math"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/goccy/go-json"
|
||||
|
||||
@@ -115,7 +115,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
opt := MakeOptions(options)
|
||||
|
||||
iterator = iterator.Rebatch(1000)
|
||||
iterator = iterator.Rebatch(opt.BatchSize())
|
||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
@@ -62,9 +62,8 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
iterator = iterator.Rebatch(1000)
|
||||
|
||||
opt := MakeOptions(options)
|
||||
iterator = iterator.Rebatch(opt.BatchSize())
|
||||
|
||||
file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ type __options__ struct {
|
||||
total_seq_size int
|
||||
full_file_batch bool
|
||||
parallel_workers int
|
||||
no_order bool
|
||||
closefile bool
|
||||
appendfile bool
|
||||
compressed bool
|
||||
@@ -48,6 +49,7 @@ func MakeOptions(setters []WithOption) Options {
|
||||
parallel_workers: obioptions.CLIReadParallelWorkers(),
|
||||
batch_size: obioptions.CLIBatchSize(),
|
||||
total_seq_size: 1024 * 1024 * 100, // 100 MB by default
|
||||
no_order: false,
|
||||
full_file_batch: false,
|
||||
closefile: false,
|
||||
appendfile: false,
|
||||
@@ -101,6 +103,10 @@ func (opt Options) FormatFastSeqHeader() func(*obiseq.BioSequence) string {
|
||||
return opt.pointer.fastseq_header_writer
|
||||
}
|
||||
|
||||
// NoOrder returns the value of the no_order option stored in opt.
// MakeOptions initializes this option to false.
func (opt Options) NoOrder() bool {
|
||||
return opt.pointer.no_order
|
||||
}
|
||||
|
||||
// ProgressBar returns the value of the with_progress_bar option stored in opt.
func (opt Options) ProgressBar() bool {
|
||||
return opt.pointer.with_progress_bar
|
||||
}
|
||||
@@ -205,6 +211,16 @@ func OptionsAppendFile(append bool) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionNoOrder returns a WithOption that sets the no_order option
// to the given value when applied to an Options instance.
func OptionNoOrder(no_order bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.no_order = no_order
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
|
||||
|
||||
func OptionsCompressed(compressed bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.compressed = compressed
|
||||
|
||||
Reference in New Issue
Block a user