mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Work on EMBL and Genbank parser efficiency
Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07
This commit is contained in:
@ -2,6 +2,14 @@
|
|||||||
|
|
||||||
## Latest changes
|
## Latest changes
|
||||||
|
|
||||||
|
### Enhancement
|
||||||
|
- The fix for the bug in the parsing of EMBL and Genbank files, as implemented in version 4.1.2 of OBITools4,
|
||||||
|
potentially induced some reduction in parsing performance. This should now be fixed.
|
||||||
|
- In the same vein, the parsing of Genbank and EMBL files was reading and storing in memory not only the sequence
|
||||||
|
but also the annotations (feature table). Up to now, none of the obitools use this information, but
|
||||||
|
with large complete genomes, it occupies a lot of memory. To reduce this impact, the new version of the
|
||||||
|
parser no longer stores the annotations in memory by default.
|
||||||
|
|
||||||
## February 16th, 2024. Release 4.1.2
|
## February 16th, 2024. Release 4.1.2
|
||||||
|
|
||||||
### Bug fixes
|
### Bug fixes
|
||||||
|
@ -94,7 +94,11 @@ func _EndOfLastEntry(buff []byte) int {
|
|||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
|
func _ParseEmblFile(source string, input <-chan _FileChunk,
|
||||||
|
out obiiter.IBioSequence,
|
||||||
|
withFeatureTable bool,
|
||||||
|
batch_size int,
|
||||||
|
total_seq_size int) {
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
scanner := bufio.NewScanner(chunks.raw)
|
scanner := bufio.NewScanner(chunks.raw)
|
||||||
@ -120,14 +124,16 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
|
|||||||
defBytes.WriteByte(' ')
|
defBytes.WriteByte(' ')
|
||||||
}
|
}
|
||||||
defBytes.WriteString(strings.TrimSpace(line[5:]))
|
defBytes.WriteString(strings.TrimSpace(line[5:]))
|
||||||
case strings.HasPrefix(line, "FH "):
|
case withFeatureTable && strings.HasPrefix(line, "FH "):
|
||||||
featBytes.WriteString(line)
|
featBytes.WriteString(line)
|
||||||
case line == "FH":
|
case withFeatureTable && line == "FH":
|
||||||
featBytes.WriteByte('\n')
|
featBytes.WriteByte('\n')
|
||||||
featBytes.WriteString(line)
|
featBytes.WriteString(line)
|
||||||
case strings.HasPrefix(line, "FT "):
|
case strings.HasPrefix(line, "FT "):
|
||||||
|
if withFeatureTable {
|
||||||
featBytes.WriteByte('\n')
|
featBytes.WriteByte('\n')
|
||||||
featBytes.WriteString(line)
|
featBytes.WriteString(line)
|
||||||
|
}
|
||||||
if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
|
if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
|
||||||
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
||||||
}
|
}
|
||||||
@ -143,7 +149,9 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
|
|||||||
defBytes.String())
|
defBytes.String())
|
||||||
sequence.SetSource(source)
|
sequence.SetSource(source)
|
||||||
|
|
||||||
|
if withFeatureTable {
|
||||||
sequence.SetFeatures(featBytes.Bytes())
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
|
||||||
annot := sequence.Annotations()
|
annot := sequence.Annotations()
|
||||||
annot["scientific_name"] = scientificName
|
annot["scientific_name"] = scientificName
|
||||||
@ -198,7 +206,7 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
|
|||||||
for err == nil {
|
for err == nil {
|
||||||
|
|
||||||
// Create an extended buffer to read from if the end of the last entry is not found in the current buffer
|
// Create an extended buffer to read from if the end of the last entry is not found in the current buffer
|
||||||
extbuff := make([]byte, 1<<22)
|
extbuff := make([]byte, _FileChunkSize)
|
||||||
end := 0
|
end := 0
|
||||||
ic := 0
|
ic := 0
|
||||||
|
|
||||||
@ -278,7 +286,9 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
go _ParseEmblFile(opt.Source(), entry_channel, newIter)
|
go _ParseEmblFile(opt.Source(), entry_channel, newIter,
|
||||||
|
opt.WithFeatureTable(),
|
||||||
|
opt.BatchSize(), opt.TotalSeqSize())
|
||||||
}
|
}
|
||||||
|
|
||||||
go _ReadFlatFileChunk(reader, entry_channel)
|
go _ReadFlatFileChunk(reader, entry_channel)
|
||||||
|
@ -30,7 +30,10 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
|||||||
|
|
||||||
func _ParseGenbankFile(source string,
|
func _ParseGenbankFile(source string,
|
||||||
input <-chan _FileChunk, out obiiter.IBioSequence,
|
input <-chan _FileChunk, out obiiter.IBioSequence,
|
||||||
chunck_order func() int) {
|
chunck_order func() int,
|
||||||
|
withFeatureTable bool,
|
||||||
|
batch_size int,
|
||||||
|
total_seq_size int) {
|
||||||
state := inHeader
|
state := inHeader
|
||||||
previous_chunk := -1
|
previous_chunk := -1
|
||||||
|
|
||||||
@ -143,7 +146,10 @@ func _ParseGenbankFile(source string,
|
|||||||
seqBytes.Bytes(),
|
seqBytes.Bytes(),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
sequence.SetSource(source)
|
sequence.SetSource(source)
|
||||||
|
|
||||||
|
if withFeatureTable {
|
||||||
sequence.SetFeatures(featBytes.Bytes())
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
}
|
||||||
|
|
||||||
annot := sequence.Annotations()
|
annot := sequence.Annotations()
|
||||||
annot["scientific_name"] = scientificName
|
annot["scientific_name"] = scientificName
|
||||||
@ -155,7 +161,7 @@ func _ParseGenbankFile(source string,
|
|||||||
sequences = append(sequences, sequence)
|
sequences = append(sequences, sequence)
|
||||||
sumlength += sequence.Len()
|
sumlength += sequence.Len()
|
||||||
|
|
||||||
if len(sequences) == 100 || sumlength > 1e7 {
|
if len(sequences) == batch_size || sumlength > total_seq_size {
|
||||||
log.Debugln("Pushing sequences")
|
log.Debugln("Pushing sequences")
|
||||||
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
|
||||||
sequences = make(obiseq.BioSequenceSlice, 0, 100)
|
sequences = make(obiseq.BioSequenceSlice, 0, 100)
|
||||||
@ -184,8 +190,10 @@ func _ParseGenbankFile(source string,
|
|||||||
default:
|
default:
|
||||||
switch state {
|
switch state {
|
||||||
case inFeature:
|
case inFeature:
|
||||||
|
if withFeatureTable {
|
||||||
featBytes.WriteByte('\n')
|
featBytes.WriteByte('\n')
|
||||||
featBytes.WriteString(line)
|
featBytes.WriteString(line)
|
||||||
|
}
|
||||||
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
|
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
|
||||||
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
|
||||||
}
|
}
|
||||||
@ -227,7 +235,8 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order)
|
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order,
|
||||||
|
opt.WithFeatureTable(), opt.BatchSize(), opt.TotalSeqSize())
|
||||||
}
|
}
|
||||||
|
|
||||||
go _ReadFlatFileChunk(reader, entry_channel)
|
go _ReadFlatFileChunk(reader, entry_channel)
|
||||||
|
@ -11,6 +11,7 @@ type __options__ struct {
|
|||||||
with_progress_bar bool
|
with_progress_bar bool
|
||||||
buffer_size int
|
buffer_size int
|
||||||
batch_size int
|
batch_size int
|
||||||
|
total_seq_size int
|
||||||
full_file_batch bool
|
full_file_batch bool
|
||||||
parallel_workers int
|
parallel_workers int
|
||||||
closefile bool
|
closefile bool
|
||||||
@ -29,6 +30,7 @@ type __options__ struct {
|
|||||||
csv_auto bool
|
csv_auto bool
|
||||||
paired_filename string
|
paired_filename string
|
||||||
source string
|
source string
|
||||||
|
with_feature_table bool
|
||||||
}
|
}
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
@ -45,6 +47,7 @@ func MakeOptions(setters []WithOption) Options {
|
|||||||
buffer_size: 2,
|
buffer_size: 2,
|
||||||
parallel_workers: obioptions.CLIReadParallelWorkers(),
|
parallel_workers: obioptions.CLIReadParallelWorkers(),
|
||||||
batch_size: obioptions.CLIBatchSize(),
|
batch_size: obioptions.CLIBatchSize(),
|
||||||
|
total_seq_size: 1024 * 1024 * 100, // 100 MB by default
|
||||||
full_file_batch: false,
|
full_file_batch: false,
|
||||||
closefile: false,
|
closefile: false,
|
||||||
appendfile: false,
|
appendfile: false,
|
||||||
@ -62,6 +65,7 @@ func MakeOptions(setters []WithOption) Options {
|
|||||||
csv_auto: false,
|
csv_auto: false,
|
||||||
paired_filename: "",
|
paired_filename: "",
|
||||||
source: "",
|
source: "",
|
||||||
|
with_feature_table: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
opt := Options{&o}
|
opt := Options{&o}
|
||||||
@ -77,6 +81,10 @@ func (opt Options) BatchSize() int {
|
|||||||
return opt.pointer.batch_size
|
return opt.pointer.batch_size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (opt Options) TotalSeqSize() int {
|
||||||
|
return opt.pointer.total_seq_size
|
||||||
|
}
|
||||||
|
|
||||||
func (opt Options) FullFileBatch() bool {
|
func (opt Options) FullFileBatch() bool {
|
||||||
return opt.pointer.full_file_batch
|
return opt.pointer.full_file_batch
|
||||||
}
|
}
|
||||||
@ -169,6 +177,10 @@ func (opt Options) Source() string {
|
|||||||
return opt.pointer.source
|
return opt.pointer.source
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (opt Options) WithFeatureTable() bool {
|
||||||
|
return opt.pointer.with_feature_table
|
||||||
|
}
|
||||||
|
|
||||||
func OptionCloseFile() WithOption {
|
func OptionCloseFile() WithOption {
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
opt.pointer.closefile = true
|
opt.pointer.closefile = true
|
||||||
@ -259,6 +271,14 @@ func OptionsBatchSize(size int) WithOption {
|
|||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func OptionsBatchSizeDefault(bp int) WithOption {
|
||||||
|
f := WithOption(func(opt Options) {
|
||||||
|
opt.pointer.batch_size = bp
|
||||||
|
})
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
func OptionsFullFileBatch(full bool) WithOption {
|
func OptionsFullFileBatch(full bool) WithOption {
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
opt.pointer.full_file_batch = full
|
opt.pointer.full_file_batch = full
|
||||||
@ -386,3 +406,11 @@ func CSVAutoColumn(auto bool) WithOption {
|
|||||||
|
|
||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func WithFeatureTable(with bool) WithOption {
|
||||||
|
f := WithOption(func(opt Options) {
|
||||||
|
opt.pointer.with_feature_table = with
|
||||||
|
})
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user