Files
obitools4/pkg/obiformats/genbank_read.go

191 lines
4.7 KiB
Go
Raw Normal View History

2022-08-23 11:04:57 +02:00
package obiformats
import (
"bufio"
"bytes"
"io"
"path"
"regexp"
2022-08-23 11:04:57 +02:00
"strconv"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
2022-08-23 11:04:57 +02:00
)
type gbstate int
const (
inHeader gbstate = 0
2022-11-16 17:13:03 +01:00
inEntry gbstate = 1
inDefinition gbstate = 2
inFeature gbstate = 3
inSequence gbstate = 4
2022-08-23 11:04:57 +02:00
)
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
func _ParseGenbankFile(source string,
input <-chan _FileChunk, out obiiter.IBioSequence,
chunck_order func() int) {
var err error
2022-08-23 11:04:57 +02:00
state := inHeader
for chunks := range input {
// log.Debugln("Chunk size", (chunks.raw.(*bytes.Buffer)).Len())
2022-08-23 11:04:57 +02:00
scanner := bufio.NewScanner(chunks.raw)
sequences := make(obiseq.BioSequenceSlice, 0, 100)
sumlength := 0
2022-08-23 11:04:57 +02:00
id := ""
lseq := -1
2022-08-23 11:04:57 +02:00
scientificName := ""
defBytes := new(bytes.Buffer)
featBytes := new(bytes.Buffer)
seqBytes := new(bytes.Buffer)
taxid := 1
nl := 0
sl := 0
2022-08-23 11:04:57 +02:00
for scanner.Scan() {
nl++
2022-08-23 11:04:57 +02:00
line := scanner.Text()
switch {
case state == inDefinition && !strings.HasPrefix(line, " "):
state = inEntry
fallthrough
2022-08-23 11:04:57 +02:00
case strings.HasPrefix(line, "LOCUS "):
state = inEntry
id = strings.SplitN(line[12:], " ", 2)[0]
match_length := _seqlenght_rx.FindStringSubmatch(line)
if len(match_length) > 0 {
lseq, err = strconv.Atoi(match_length[1])
if err != nil {
lseq = -1
}
}
if lseq > 0 {
seqBytes = bytes.NewBuffer(obiseq.GetSlice(lseq + 20))
} else {
seqBytes = new(bytes.Buffer)
}
2022-08-23 11:04:57 +02:00
case strings.HasPrefix(line, "SOURCE "):
scientificName = strings.TrimSpace(line[12:])
case strings.HasPrefix(line, "DEFINITION "):
defBytes.WriteString(strings.TrimSpace(line[12:]))
state = inDefinition
case strings.HasPrefix(line, "FEATURES "):
featBytes.WriteString(line)
state = inFeature
case strings.HasPrefix(line, "ORIGIN"):
2022-08-23 11:04:57 +02:00
state = inSequence
case line == "//":
// log.Debugln("Total lines := ", nl)
sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(),
defBytes.String())
sequence.SetSource(source)
state = inHeader
sequence.SetFeatures(featBytes.Bytes())
annot := sequence.Annotations()
annot["scientific_name"] = scientificName
annot["taxid"] = taxid
// log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader))
// log.Debugf("Read sequences %s: %dbp (%d)", sequence.Id(),
// sequence.Len(), seqBytes.Len())
sequences = append(sequences, sequence)
sumlength += sequence.Len()
if len(sequences) == 100 || sumlength > 1e7 {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, 100)
sumlength = 0
}
defBytes = bytes.NewBuffer(obiseq.GetSlice(200))
featBytes = new(bytes.Buffer)
nl = 0
sl = 0
default:
2022-08-23 11:04:57 +02:00
switch state {
case inDefinition:
defBytes.WriteByte(' ')
defBytes.WriteString(strings.TrimSpace(line[5:]))
case inFeature:
featBytes.WriteByte('\n')
featBytes.WriteString(line)
if strings.HasPrefix(line, ` /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
}
case inSequence:
sl++
2022-08-23 11:04:57 +02:00
parts := strings.SplitN(line[10:], " ", 7)
lparts := len(parts)
for i := 0; i < lparts; i++ {
seqBytes.WriteString(parts[i])
}
}
}
2022-08-23 11:04:57 +02:00
}
if len(sequences) > 0 {
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
}
}
2022-08-23 11:04:57 +02:00
out.Done()
}
2023-01-22 22:04:17 +01:00
func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
2022-08-23 11:04:57 +02:00
opt := MakeOptions(options)
entry_channel := make(chan _FileChunk)
2022-08-23 11:04:57 +02:00
newIter := obiiter.MakeIBioSequence()
2022-08-23 11:04:57 +02:00
nworkers := opt.ParallelWorkers()
chunck_order := obiutils.AtomicCounter()
2022-08-23 11:04:57 +02:00
newIter.Add(nworkers)
go func() {
newIter.WaitAndClose()
}()
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ {
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order)
2022-08-23 11:04:57 +02:00
}
go _ReadFlatFileChunk(reader, entry_channel)
if opt.pointer.full_file_batch {
newIter = newIter.CompleteFileIterator()
}
2022-08-23 11:04:57 +02:00
return newIter
}
2023-01-22 22:04:17 +01:00
func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
2022-08-23 11:04:57 +02:00
var reader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = Ropen(filename)
if err == ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}
2022-08-23 11:04:57 +02:00
if err != nil {
log.Printf("open file error: %+v", err)
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-08-23 11:04:57 +02:00
}
2022-11-16 17:13:03 +01:00
return ReadGenbank(reader, options...), nil
2022-08-23 11:04:57 +02:00
}