mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Work on EMBL and Genbank parser efficiency
Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07
This commit is contained in:
@ -2,6 +2,14 @@
|
||||
|
||||
## Latest changes
|
||||
|
||||
### Enhancement
|
||||
- The bug fix in the parsing of EMBL and Genbank files implemented in version 4.1.2 of OBITools4,
|
||||
potentially reduced the performance of the parsing. This has now been fixed.
|
||||
- Similarly, the parsing of Genbank and EMBL files was reading and storing in memory not only the sequence
|
||||
but also the annotations (features table). Up to now, none of the OBITools use this information, but
|
||||
with large complete genomes, it occupies a lot of memory. To reduce this impact, the new version of the
|
||||
parser no longer stores the annotations in memory by default.
|
||||
|
||||
## February 16th, 2024. Release 4.1.2
|
||||
|
||||
### Bug fixes
|
||||
|
@ -94,7 +94,11 @@ func _EndOfLastEntry(buff []byte) int {
|
||||
return -1
|
||||
}
|
||||
|
||||
func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
|
||||
func _ParseEmblFile(source string, input <-chan _FileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
withFeatureTable bool,
|
||||
batch_size int,
|
||||
total_seq_size int) {
|
||||
|
||||
for chunks := range input {
|
||||
scanner := bufio.NewScanner(chunks.raw)
|
||||
@ -120,14 +124,16 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
|
||||
defBytes.WriteByte(' ')
|
||||
}
|
||||
defBytes.WriteString(strings.TrimSpace(line[5:]))
|
||||
case strings.HasPrefix(line, "FH "):
|
||||
case withFeatureTable && strings.HasPrefix(line, "FH "):
|
||||
featBytes.WriteString(line)
|
||||
case line == "FH":
|
||||
case withFeatureTable && line == "FH":
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.WriteString(line)
|
||||
case strings.HasPrefix(line, "FT "):
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.WriteString(line)
|
||||
if withFeatureTable {
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.WriteString(line)
|
||||
}
|
||||
if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
|
||||
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
||||
}
|
||||
@ -143,7 +149,9 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
|
||||
defBytes.String())
|
||||
sequence.SetSource(source)
|
||||
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
if withFeatureTable {
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
}
|
||||
|
||||
annot := sequence.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
@ -198,7 +206,7 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
|
||||
for err == nil {
|
||||
|
||||
// Create an extended buffer to read from if the end of the last entry is not found in the current buffer
|
||||
extbuff := make([]byte, 1<<22)
|
||||
extbuff := make([]byte, _FileChunkSize)
|
||||
end := 0
|
||||
ic := 0
|
||||
|
||||
@ -278,7 +286,9 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
|
||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||
for j := 0; j < nworkers; j++ {
|
||||
go _ParseEmblFile(opt.Source(), entry_channel, newIter)
|
||||
go _ParseEmblFile(opt.Source(), entry_channel, newIter,
|
||||
opt.WithFeatureTable(),
|
||||
opt.BatchSize(), opt.TotalSeqSize())
|
||||
}
|
||||
|
||||
go _ReadFlatFileChunk(reader, entry_channel)
|
||||
|
@ -30,7 +30,10 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||
|
||||
func _ParseGenbankFile(source string,
|
||||
input <-chan _FileChunk, out obiiter.IBioSequence,
|
||||
chunck_order func() int) {
|
||||
chunck_order func() int,
|
||||
withFeatureTable bool,
|
||||
batch_size int,
|
||||
total_seq_size int) {
|
||||
state := inHeader
|
||||
previous_chunk := -1
|
||||
|
||||
@ -143,7 +146,10 @@ func _ParseGenbankFile(source string,
|
||||
seqBytes.Bytes(),
|
||||
defBytes.String())
|
||||
sequence.SetSource(source)
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
|
||||
if withFeatureTable {
|
||||
sequence.SetFeatures(featBytes.Bytes())
|
||||
}
|
||||
|
||||
annot := sequence.Annotations()
|
||||
annot["scientific_name"] = scientificName
|
||||
@ -155,7 +161,7 @@ func _ParseGenbankFile(source string,
|
||||
sequences = append(sequences, sequence)
|
||||
sumlength += sequence.Len()
|
||||
|
||||
if len(sequences) == 100 || sumlength > 1e7 {
|
||||
if len(sequences) == batch_size || sumlength > total_seq_size {
|
||||
log.Debugln("Pushing sequences")
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||
sequences = make(obiseq.BioSequenceSlice, 0, 100)
|
||||
@ -184,8 +190,10 @@ func _ParseGenbankFile(source string,
|
||||
default:
|
||||
switch state {
|
||||
case inFeature:
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.WriteString(line)
|
||||
if withFeatureTable {
|
||||
featBytes.WriteByte('\n')
|
||||
featBytes.WriteString(line)
|
||||
}
|
||||
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
|
||||
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
||||
}
|
||||
@ -227,7 +235,8 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
|
||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||
for j := 0; j < nworkers; j++ {
|
||||
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order)
|
||||
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order,
|
||||
opt.WithFeatureTable(), opt.BatchSize(), opt.TotalSeqSize())
|
||||
}
|
||||
|
||||
go _ReadFlatFileChunk(reader, entry_channel)
|
||||
|
@ -11,6 +11,7 @@ type __options__ struct {
|
||||
with_progress_bar bool
|
||||
buffer_size int
|
||||
batch_size int
|
||||
total_seq_size int
|
||||
full_file_batch bool
|
||||
parallel_workers int
|
||||
closefile bool
|
||||
@ -29,6 +30,7 @@ type __options__ struct {
|
||||
csv_auto bool
|
||||
paired_filename string
|
||||
source string
|
||||
with_feature_table bool
|
||||
}
|
||||
|
||||
type Options struct {
|
||||
@ -45,6 +47,7 @@ func MakeOptions(setters []WithOption) Options {
|
||||
buffer_size: 2,
|
||||
parallel_workers: obioptions.CLIReadParallelWorkers(),
|
||||
batch_size: obioptions.CLIBatchSize(),
|
||||
total_seq_size: 1024 * 1024 * 100, // 100 MB by default
|
||||
full_file_batch: false,
|
||||
closefile: false,
|
||||
appendfile: false,
|
||||
@ -62,6 +65,7 @@ func MakeOptions(setters []WithOption) Options {
|
||||
csv_auto: false,
|
||||
paired_filename: "",
|
||||
source: "",
|
||||
with_feature_table: false,
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
@ -77,6 +81,10 @@ func (opt Options) BatchSize() int {
|
||||
return opt.pointer.batch_size
|
||||
}
|
||||
|
||||
func (opt Options) TotalSeqSize() int {
|
||||
return opt.pointer.total_seq_size
|
||||
}
|
||||
|
||||
func (opt Options) FullFileBatch() bool {
|
||||
return opt.pointer.full_file_batch
|
||||
}
|
||||
@ -169,6 +177,10 @@ func (opt Options) Source() string {
|
||||
return opt.pointer.source
|
||||
}
|
||||
|
||||
func (opt Options) WithFeatureTable() bool {
|
||||
return opt.pointer.with_feature_table
|
||||
}
|
||||
|
||||
func OptionCloseFile() WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.closefile = true
|
||||
@ -259,6 +271,14 @@ func OptionsBatchSize(size int) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionsBatchSizeDefault(bp int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.batch_size = bp
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionsFullFileBatch(full bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.full_file_batch = full
|
||||
@ -386,3 +406,11 @@ func CSVAutoColumn(auto bool) WithOption {
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func WithFeatureTable(with bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.with_feature_table = with
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
Reference in New Issue
Block a user