diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index 9e5cd69..b72ff1b 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -209,25 +209,28 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) { end = _EndOfLastEntry(buff) // If an extension was read, log the size and number of extensions - log.Debugf("Flat File chunck : final buff size %d bytes (%d extensions) -> end = %d\n", - len(buff), - ic, - end, - ) if len(buff) > 0 { remains := buff[end:] buff = buff[:end] - + // Send the chunk of data as a _FileChunk struct to the readers channel io := bytes.NewBuffer(buff) + + log.Debugf("Flat File chunck : final buff size %d bytes (%d) (%d extensions) -> end = %d\n", + len(buff), + io.Cap(), + ic, + end, + ) + readers <- _FileChunk{io, i} i++ - + // Set the buffer to the size of a chunk of data and copy any remaining data to the new buffer buff = make([]byte, _FileChunkSize) copy(buff, remains) - l = len(remains) + l = len(remains) } } diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index 28e2d66..53cb271 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -31,6 +31,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) { state := inHeader for chunks := range input { + log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len()) scanner := bufio.NewScanner(chunks.raw) order := chunks.order sequences := make(obiseq.BioSequenceSlice, 0, 100) @@ -40,15 +41,15 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) { featBytes := new(bytes.Buffer) seqBytes := new(bytes.Buffer) taxid := 1 + nl := 0 + sl := 0 for scanner.Scan() { - + nl++ line := scanner.Text() - - if !strings.HasPrefix(line, " ") && state != inHeader { - state = inEntry - } - switch { + case state==inDefinition && ! strings.HasPrefix(line, " "): + state = inEntry + fallthrough case strings.HasPrefix(line, "LOCUS "): state = inEntry id = strings.SplitN(line[12:], " ", 2)[0] @@ -62,27 +63,8 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) { state = inFeature case strings.HasPrefix(line, "ORIGIN "): state = inSequence - case strings.HasPrefix(line, " "): - switch state { - case inDefinition: - defBytes.WriteByte(' ') - defBytes.WriteString(strings.TrimSpace(line[5:])) - case inFeature: - featBytes.WriteByte('\n') - featBytes.WriteString(line) - if strings.HasPrefix(line, ` /db_xref="taxon:`) { - taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]) - } - case inSequence: - parts := strings.SplitN(line[10:], " ", 7) - lparts := len(parts) - for i := 0; i < lparts; i++ { - seqBytes.WriteString(parts[i]) - } - default: // Do nothing - } - case line == "//": + log.Debugln("Total lines := ", nl) sequence := obiseq.NewBioSequence(id, bytes.ToLower(seqBytes.Bytes()), defBytes.String()) @@ -98,7 +80,29 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) { defBytes = new(bytes.Buffer) featBytes = new(bytes.Buffer) seqBytes = new(bytes.Buffer) + nl=0 + sl=0 + default: + switch state { + case inDefinition: + defBytes.WriteByte(' ') + defBytes.WriteString(strings.TrimSpace(line[5:])) + case inFeature: + featBytes.WriteByte('\n') + featBytes.WriteString(line) + if strings.HasPrefix(line, ` /db_xref="taxon:`) { + taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]) + } + case inSequence: + sl++ + parts := strings.SplitN(line[10:], " ", 7) + lparts := len(parts) + for i := 0; i < lparts; i++ { + seqBytes.WriteString(parts[i]) + } + } } + } out.Push(obiiter.MakeBioSequenceBatch(order, sequences)) }