diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go new file mode 100644 index 0000000..4649506 --- /dev/null +++ b/pkg/obiformats/genbank_read.go @@ -0,0 +1,163 @@ +package obiformats + +import ( + "bufio" + "bytes" + "io" + "os" + "strconv" + "strings" + + gzip "github.com/klauspost/pgzip" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +) + +type gbstate int + +const ( + inHeader gbstate = 0 + inEntry = 1 + inDefinition = 2 + inFeature = 3 + inSequence = 4 +) + +func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) { + + state := inHeader + + for chunks := range input { + scanner := bufio.NewScanner(chunks.raw) + order := chunks.order + sequences := make(obiseq.BioSequenceSlice, 0, 100) + id := "" + scientificName := "" + defBytes := new(bytes.Buffer) + featBytes := new(bytes.Buffer) + seqBytes := new(bytes.Buffer) + taxid := 1 + for scanner.Scan() { + + line := scanner.Text() + + if !strings.HasPrefix(line, " ") && state != inHeader { + state = inEntry + } + + switch { + case strings.HasPrefix(line, "LOCUS "): + state = inEntry + id = strings.SplitN(line[12:], " ", 2)[0] + case strings.HasPrefix(line, "SOURCE "): + scientificName = strings.TrimSpace(line[12:]) + case strings.HasPrefix(line, "DEFINITION "): + defBytes.WriteString(strings.TrimSpace(line[12:])) + state = inDefinition + case strings.HasPrefix(line, "FEATURES "): + featBytes.WriteString(line) + state = inFeature + case strings.HasPrefix(line, "ORIGIN "): + state = inSequence + case strings.HasPrefix(line, " "): + switch state { + case inDefinition: + defBytes.WriteByte(' ') + defBytes.WriteString(strings.TrimSpace(line[5:])) + case inFeature: + featBytes.WriteByte('\n') + featBytes.WriteString(line) + if strings.HasPrefix(line, ` /db_xref="taxon:`) { + taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]) + } + case inSequence: + parts := strings.SplitN(line[10:], " ", 7) + lparts := len(parts) + for i := 0; i < lparts; i++ { + seqBytes.WriteString(parts[i]) + } + default: // Do nothing + } + + case line == "//": + sequence := obiseq.NewBioSequence(id, + seqBytes.Bytes(), + defBytes.String()) + state = inHeader + + sequence.SetFeatures(featBytes.Bytes()) + + annot := sequence.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + // log.Println(FormatFasta(sequence, FormatFastSeqJsonHeader)) + sequences = append(sequences, sequence) + defBytes = new(bytes.Buffer) + featBytes = new(bytes.Buffer) + seqBytes = new(bytes.Buffer) + } + } + out.Push(obiiter.MakeBioSequenceBatch(order, sequences)) + } + + out.Done() + +} + +func ReadGenbankBatch(reader io.Reader, options ...WithOption) obiiter.IBioSequenceBatch { + opt := MakeOptions(options) + entry_channel := make(chan _FileChunk, opt.BufferSize()) + + newIter := obiiter.MakeIBioSequenceBatch(opt.BufferSize()) + + nworkers := opt.ParallelWorkers() + newIter.Add(nworkers) + + go func() { + newIter.WaitAndClose() + }() + + // for j := 0; j < opt.ParallelWorkers(); j++ { + for j := 0; j < nworkers; j++ { + go _ParseGenbankFile(entry_channel, newIter) + } + + go _ReadFlatFileChunk(reader, entry_channel) + + return newIter +} + +func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence { + ib := ReadGenbankBatch(reader, options...) + return ib.SortBatches().IBioSequence() +} + +func ReadGenbankBatchFromFile(filename string, options ...WithOption) (obiiter.IBioSequenceBatch, error) { + var reader io.Reader + var greader io.Reader + var err error + + reader, err = os.Open(filename) + if err != nil { + log.Printf("open file error: %+v", err) + return obiiter.NilIBioSequenceBatch, err + } + + // Test if the flux is compressed by gzip + //greader, err = gzip.NewReader(reader) + greader, err = gzip.NewReaderN(reader, 1<<24, 2) + if err == nil { + reader = greader + } + + return ReadGenbankBatch(reader, options...), nil +} + +func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { + ib, err := ReadGenbankBatchFromFile(filename, options...) + return ib.SortBatches().IBioSequence(), err + +} diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index ad42fb4..78a7a70 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -30,7 +30,12 @@ func GuessSeqFileType(firstline string) string { return "embl" case strings.HasPrefix(firstline, "LOCUS "): - return "genebank" + return "genbank" + + // Special case for genbank release files + // I hope it is enougth stringeant + case strings.HasSuffix(firstline, " Genetic Se"): + return "genbank" default: return "unknown" @@ -86,6 +91,8 @@ func ReadSequencesBatchFromFile(filename string, return ReadEcoPCRBatch(reader, options...), nil case "embl": return ReadEMBLBatch(reader, options...), nil + case "genbank": + return ReadGenbankBatch(reader, options...), nil default: log.Fatalf("File %s has guessed format %s which is not yet implemented", filename, filetype)