Make some corrections to the genbank/embl parser

Former-commit-id: fb2ebb351f61d78432bb9648d0a509b6557651a2
This commit is contained in:
2024-02-27 07:28:56 +01:00
parent 38c49e9f38
commit c9fe6f6ebf
2 changed files with 25 additions and 33 deletions

View File

@ -218,8 +218,12 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
}
if len(buff) > 0 {
if end < 0 {
end = len(buff)
}
lremain := len(buff) - end
remains := make([]byte, max(lremain, _FileChunkSize))
lcp := copy(remains, buff[end:])
remains = remains[:lcp]
if lcp < lremain {
@ -228,38 +232,21 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
buff = buff[:end]
// Send the chunk of data as a _FileChunk struct to the readers channel
io := bytes.NewBuffer(buff)
for len(buff) > 0 && (buff[len(buff)-1] == '\n' || buff[len(buff)-1] == '\r') {
buff = buff[:len(buff)-1]
}
nzero := 0
for j := 0; j < len(buff); j++ {
if buff[j] == 0 {
nzero++
if len(buff) > 0 {
io := bytes.NewBuffer(buff)
if string(buff[io.Len()-2:]) != "//" {
log.Fatalf("File chunck ends with 3 bytes : %s", io.Bytes()[io.Len()-3:])
}
readers <- _FileChunk{io, i}
i++
buff = remains
}
if nzero > 0 {
log.Fatalf("File chunck %d contains %d zero bytes", i, nzero)
}
log.Debugf("Flat File chunck %d : final buff size %d bytes (%d) (%d extensions count) -> end = %d starting by = %s, ending by = %s, remaining = %s",
i,
len(buff),
io.Cap(),
ic,
end,
io.Bytes()[0:30],
io.Bytes()[io.Len()-3:],
remains[0:30],
)
if string(buff[io.Len()-3:]) != "//\n" {
log.Fatalf("File chunck ends with 3 bytes : %s", io.Bytes()[io.Len()-3:])
}
readers <- _FileChunk{io, i}
i++
buff = remains
}
}

View File

@ -24,6 +24,7 @@ const (
inDefinition gbstate = 2
inFeature gbstate = 3
inSequence gbstate = 4
inContig gbstate = 5
)
// _seqlenght_rx matches one or more spaces, then a run of decimal digits
// (captured as group 1), then the literal " bp". It is compiled once at
// package scope.
// NOTE(review): presumably used to pull the sequence length out of a record
// header line — confirm against its call sites.
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
@ -127,18 +128,22 @@ func _ParseGenbankFile(source string,
state = inSequence
processed = true
case strings.HasPrefix(line, "CONTIG"):
if state != inFeature && state != inContig {
log.Fatalf("Unexpected state %d while reading ORIGIN: %s", state, line)
}
state = inContig
processed = true
case line == "//":
if state != inSequence {
if state != inSequence && state != inContig {
log.Fatalf("Unexpected state %d while reading end of record %s", state, id)
}
// log.Debugln("Total lines := ", nl)
if id == "" {
log.Warn("Empty id when parsing genbank file")
}
if seqBytes.Len() == 0 {
log.Warn("Empty sequence when parsing genbank file")
}
log.Debugf("End of sequence %s: %dbp ", id, seqBytes.Len())