Files
obitools4/pkg/obiformats/embl_read.go

238 lines
5.0 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obiformats
import (
"bufio"
"bytes"
"compress/gzip"
"io"
"log"
"os"
"strconv"
"strings"
"time"
2022-01-13 23:43:01 +01:00
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
2022-01-13 23:27:39 +01:00
)
2022-01-14 17:32:12 +01:00
// _FileChunkSize is the size, in bytes, of the read buffers allocated by
// _ReadFlatFileChunk when it slices a flat file into chunks (1 << 20 = 1 MiB).
var _FileChunkSize = 1 << 20
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
// _FileChunk is one slice of a flat file handed from the chunking goroutine
// (_ReadFlatFileChunk) to the parsing workers (_ParseEmblFile).
//
// The field order matters: the struct is built with a positional literal.
type _FileChunk struct {
	// raw gives access to the bytes of the chunk.
	raw io.Reader
	// order is the rank of the chunk in the input; it is forwarded as the
	// order of the sequence batch produced from this chunk.
	order int
}
2022-01-14 17:32:12 +01:00
// _EndOfLastEntry locates the end of the last complete entry held in buff.
//
// An entry terminator is a line made only of "//". The buffer is scanned
// backward for the pattern below (the digits are the order in which the
// bytes are met during the backward scan):
//
//	   5  4 3   2    1
//	 <LF> / / <CR>? <LF>
//
// The function returns the index of the first byte following the terminator
// line (so that buff[:index] ends with a complete entry), or -1 when buff
// contains no complete terminator line preceded by a line feed.
func _EndOfLastEntry(buff []byte) int {
	const (
		outside     = iota // not inside the pattern
		afterLF            // the trailing <LF> has been matched
		afterCR            // the optional <CR> has been matched
		firstSlash         // one '/' has been matched
		secondSlash        // both '/' have been matched
		matched            // the <LF> preceding "//" has been matched
	)

	state := outside
	start := 0

	for i := len(buff) - 1; i >= 0 && state != matched; i-- {
		c := buff[i]

		switch state {
		case outside:
			if c == '\n' {
				state = afterLF
			}
		case afterLF:
			// The <LF> just matched sits at i+1, so the byte following the
			// candidate terminator line is at i+2.
			start = i + 2
			switch c {
			case '\r':
				state = afterCR
			case '/':
				state = firstSlash
			case '\n':
				state = afterLF
			default:
				state = outside
			}
		case afterCR:
			switch c {
			case '/':
				state = firstSlash
			case '\n':
				state = afterLF
			default:
				state = outside
			}
		case firstSlash:
			switch c {
			case '/':
				state = secondSlash
			case '\n':
				state = afterLF
			default:
				state = outside
			}
		case secondSlash:
			if c == '\n' {
				state = matched
			} else {
				state = outside
			}
		}
	}

	// Success is decided on the automaton state, not on the loop index: the
	// pattern may legitimately complete on the very first bytes of buff.
	if state == matched {
		return start
	}

	return -1
}
2022-01-14 17:32:12 +01:00
func _ParseEmblFile(input <-chan _FileChunk, out obiseq.IBioSequenceBatch) {
2022-01-13 23:27:39 +01:00
for chunks := range input {
scanner := bufio.NewScanner(chunks.raw)
order := chunks.order
sequences := make(obiseq.BioSequenceSlice, 0, 100)
id := ""
2022-01-14 17:32:12 +01:00
scientificName := ""
defBytes := new(bytes.Buffer)
featBytes := new(bytes.Buffer)
seqBytes := new(bytes.Buffer)
2022-01-13 23:27:39 +01:00
taxid := 1
for scanner.Scan() {
line := scanner.Text()
switch {
case strings.HasPrefix(line, "ID "):
id = strings.SplitN(line[5:], ";", 2)[0]
case strings.HasPrefix(line, "OS "):
2022-01-14 17:32:12 +01:00
scientificName = strings.TrimSpace(line[5:])
2022-01-13 23:27:39 +01:00
case strings.HasPrefix(line, "DE "):
2022-01-14 17:32:12 +01:00
if defBytes.Len() > 0 {
defBytes.WriteByte(' ')
2022-01-13 23:27:39 +01:00
}
2022-01-14 17:32:12 +01:00
defBytes.WriteString(strings.TrimSpace(line[5:]))
2022-01-13 23:27:39 +01:00
case strings.HasPrefix(line, "FH "):
2022-01-14 17:32:12 +01:00
featBytes.WriteString(line)
2022-01-13 23:27:39 +01:00
case line == "FH":
2022-01-14 17:32:12 +01:00
featBytes.WriteByte('\n')
featBytes.WriteString(line)
2022-01-13 23:27:39 +01:00
case strings.HasPrefix(line, "FT "):
2022-01-14 17:32:12 +01:00
featBytes.WriteByte('\n')
featBytes.WriteString(line)
2022-01-13 23:27:39 +01:00
if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
}
case strings.HasPrefix(line, " "):
parts := strings.SplitN(line[5:], " ", 7)
for i := 0; i < 6; i++ {
2022-01-14 17:32:12 +01:00
seqBytes.WriteString(parts[i])
2022-01-13 23:27:39 +01:00
}
case line == "//":
sequence := obiseq.MakeBioSequence(id,
2022-01-14 17:32:12 +01:00
seqBytes.Bytes(),
defBytes.String())
2022-01-13 23:27:39 +01:00
2022-01-16 00:21:42 +01:00
sequence.SetFeatures(featBytes.Bytes())
2022-01-13 23:27:39 +01:00
annot := sequence.Annotations()
2022-01-14 17:32:12 +01:00
annot["scientific_name"] = scientificName
2022-01-13 23:27:39 +01:00
annot["taxid"] = taxid
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
sequences = append(sequences, sequence)
2022-01-14 17:32:12 +01:00
defBytes = new(bytes.Buffer)
featBytes = new(bytes.Buffer)
seqBytes = new(bytes.Buffer)
2022-01-13 23:27:39 +01:00
}
}
out.Channel() <- obiseq.MakeBioSequenceBatch(order, sequences...)
}
out.Done()
}
2022-01-14 17:32:12 +01:00
func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
2022-01-13 23:27:39 +01:00
var err error
var buff []byte
size := 0
l := 0
i := 0
buff = make([]byte, 1<<20)
for err == nil {
for ; err == nil && l < len(buff); l += size {
size, err = reader.Read(buff[l:])
}
buff = buff[:l]
2022-01-14 17:32:12 +01:00
end := _EndOfLastEntry(buff)
2022-01-13 23:27:39 +01:00
remains := buff[end:]
buff = buff[:end]
io := bytes.NewBuffer(buff)
2022-01-14 17:32:12 +01:00
readers <- _FileChunk{io, i}
2022-01-13 23:27:39 +01:00
i++
2022-01-14 17:32:12 +01:00
buff = make([]byte, _FileChunkSize)
2022-01-13 23:27:39 +01:00
copy(buff, remains)
l = len(remains)
}
close(readers)
}
// 6 5 43 2 1
// <CR>?<LF>//<CR>?<LF>
func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceBatch {
opt := MakeOptions(options)
2022-01-14 17:32:12 +01:00
entry_channel := make(chan _FileChunk, opt.BufferSize())
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
2022-01-13 23:27:39 +01:00
nworkers := opt.ParallelWorkers()
newIter.Add(nworkers)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
for len(newIter.Channel()) > 0 {
2022-01-13 23:27:39 +01:00
time.Sleep(time.Millisecond)
}
2022-01-14 17:32:12 +01:00
close(newIter.Channel())
2022-01-13 23:27:39 +01:00
}()
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ {
2022-01-14 17:32:12 +01:00
go _ParseEmblFile(entry_channel, newIter)
2022-01-13 23:27:39 +01:00
}
2022-01-14 17:32:12 +01:00
go _ReadFlatFileChunk(reader, entry_channel)
2022-01-13 23:27:39 +01:00
2022-01-14 17:32:12 +01:00
return newIter
2022-01-13 23:27:39 +01:00
}
// ReadEMBL reads EMBL formatted data from reader and returns an iterator
// over individual sequences. It is built on ReadEMBLBatch: the batches are
// passed through SortBatches and then converted with IBioSequence.
func ReadEMBL(reader io.Reader, options ...WithOption) obiseq.IBioSequence {
	batches := ReadEMBLBatch(reader, options...)
	sorted := batches.SortBatches()

	return sorted.IBioSequence()
}
// ReadEMBLBatchFromFile opens filename and returns an iterator over the
// batches of sequences it contains (see ReadEMBLBatch). Gzip compressed
// files are detected and decompressed on the fly.
//
// An error is returned when the file cannot be opened, or when it starts
// with the gzip magic number but its gzip header is invalid.
//
// Compression is detected by peeking at the first two bytes through a
// buffered reader. Probing with gzip.NewReader directly on the file would
// consume its first bytes, which would then be missing from the data parsed
// when the file is not compressed.
//
// NOTE(review): the file is read asynchronously by ReadEMBLBatch and is not
// explicitly closed once the reading is over.
func ReadEMBLBatchFromFile(filename string, options ...WithOption) (obiseq.IBioSequenceBatch, error) {
	file, err := os.Open(filename)
	if err != nil {
		log.Printf("open file error: %+v", err)
		return obiseq.NilIBioSequenceBatch, err
	}

	buffered := bufio.NewReader(file)
	var reader io.Reader = buffered

	// Test if the flux is compressed by gzip (magic number 0x1f 0x8b).
	// A Peek failure (e.g. a file shorter than two bytes) means plain data.
	magic, peekErr := buffered.Peek(2)
	if peekErr == nil && magic[0] == 0x1f && magic[1] == 0x8b {
		greader, gzErr := gzip.NewReader(buffered)
		if gzErr != nil {
			file.Close()
			return obiseq.NilIBioSequenceBatch, gzErr
		}
		reader = greader
	}

	return ReadEMBLBatch(reader, options...), nil
}
// ReadEMBLFromFile opens filename and returns an iterator over the
// individual sequences it contains. It is the file based counterpart of
// ReadEMBL and relies on ReadEMBLBatchFromFile for opening the file.
//
// When the file cannot be read, the error is returned together with the
// zero value of obiseq.IBioSequence; no method is called on the batch
// iterator returned alongside the error.
func ReadEMBLFromFile(filename string, options ...WithOption) (obiseq.IBioSequence, error) {
	ib, err := ReadEMBLBatchFromFile(filename, options...)
	if err != nil {
		var none obiseq.IBioSequence
		return none, err
	}

	return ib.SortBatches().IBioSequence(), nil
}