Patch a bug in the GenBank reader affecting sequences longer than 10 kb.

This commit is contained in:
2023-02-17 10:54:03 +01:00
parent ed87f821eb
commit 8458c0cd8b
2 changed files with 41 additions and 34 deletions

View File

@ -209,11 +209,6 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
end = _EndOfLastEntry(buff)
// If an extension was read, log the size and number of extensions
log.Debugf("Flat File chunck : final buff size %d bytes (%d extensions) -> end = %d\n",
len(buff),
ic,
end,
)
if len(buff) > 0 {
remains := buff[end:]
@ -221,6 +216,14 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
// Send the chunk of data as a _FileChunk struct to the readers channel
io := bytes.NewBuffer(buff)
log.Debugf("Flat File chunck : final buff size %d bytes (%d) (%d extensions) -> end = %d\n",
len(buff),
io.Cap(),
ic,
end,
)
readers <- _FileChunk{io, i}
i++

View File

@ -31,6 +31,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
state := inHeader
for chunks := range input {
log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len())
scanner := bufio.NewScanner(chunks.raw)
order := chunks.order
sequences := make(obiseq.BioSequenceSlice, 0, 100)
@ -40,15 +41,15 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
featBytes := new(bytes.Buffer)
seqBytes := new(bytes.Buffer)
taxid := 1
nl := 0
sl := 0
for scanner.Scan() {
nl++
line := scanner.Text()
if !strings.HasPrefix(line, " ") && state != inHeader {
state = inEntry
}
switch {
case state==inDefinition && ! strings.HasPrefix(line, " "):
state = inEntry
fallthrough
case strings.HasPrefix(line, "LOCUS "):
state = inEntry
id = strings.SplitN(line[12:], " ", 2)[0]
@ -62,27 +63,8 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
state = inFeature
case strings.HasPrefix(line, "ORIGIN "):
state = inSequence
case strings.HasPrefix(line, " "):
switch state {
case inDefinition:
defBytes.WriteByte(' ')
defBytes.WriteString(strings.TrimSpace(line[5:]))
case inFeature:
featBytes.WriteByte('\n')
featBytes.WriteString(line)
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
}
case inSequence:
parts := strings.SplitN(line[10:], " ", 7)
lparts := len(parts)
for i := 0; i < lparts; i++ {
seqBytes.WriteString(parts[i])
}
default: // Do nothing
}
case line == "//":
log.Debugln("Total lines := ", nl)
sequence := obiseq.NewBioSequence(id,
bytes.ToLower(seqBytes.Bytes()),
defBytes.String())
@ -98,7 +80,29 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
defBytes = new(bytes.Buffer)
featBytes = new(bytes.Buffer)
seqBytes = new(bytes.Buffer)
nl=0
sl=0
default:
switch state {
case inDefinition:
defBytes.WriteByte(' ')
defBytes.WriteString(strings.TrimSpace(line[5:]))
case inFeature:
featBytes.WriteByte('\n')
featBytes.WriteString(line)
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
}
case inSequence:
sl++
parts := strings.SplitN(line[10:], " ", 7)
lparts := len(parts)
for i := 0; i < lparts; i++ {
seqBytes.WriteString(parts[i])
}
}
}
}
out.Push(obiiter.MakeBioSequenceBatch(order, sequences))
}