Files
obitools4/pkg/obiformats/ecopcr_read.go

243 lines
5.9 KiB
Go
Raw Normal View History

2022-01-13 23:27:39 +01:00
package obiformats
import (
	"bufio"
	"encoding/csv"
	"fmt"
	"io"
	"path"
	"strconv"
	"strings"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
// __ecopcr_file__ bundles the state needed while decoding one ecoPCR
// result stream: the input, its CSV decoder, and the values extracted
// from the file header.
type __ecopcr_file__ struct {
	// file is the stream the records come from; csv is the record
	// decoder built on top of it.
	file io.Reader
	csv  *csv.Reader

	// names counts the occurrences of each sequence name seen so far,
	// so that repeated names can be made unique within the file.
	names map[string]int

	// version is the ecoPCR format version; version 2 records carry
	// additional melting temperature columns.
	version int

	// mode is read from the "# output in" header line and is used as the
	// prefix of the "<mode>_taxid" / "<mode>_name" annotation keys.
	mode string

	// forward_primer and reverse_primer are the primer sequences read
	// from the header; they are copied onto every sequence read.
	forward_primer, reverse_primer string
}
// __readline__ reads stream one byte at a time and returns the bytes found
// before the next '\n' as a string. The newline itself is consumed but not
// returned.
//
// Reading also stops at the first read error (including io.EOF) or when the
// reader makes no progress; whatever was read up to that point is returned.
// The error itself is not reported, so at end of input every call returns "".
func __readline__(stream io.Reader) string {
	// The buffer grows on demand: lines longer than the initial capacity
	// must not overflow it.
	line := make([]byte, 0, 1024)
	char := make([]byte, 1)

	for {
		n, err := stream.Read(char)
		// A reader may return a byte together with an error; the byte
		// has to be handled before the error is considered.
		if n == 1 {
			if char[0] == '\n' {
				break
			}
			line = append(line, char[0])
		}
		if err != nil || n == 0 {
			break
		}
	}

	return string(line)
}
func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error) {
2022-01-13 23:27:39 +01:00
record, err := file.csv.Read()
if err != nil {
return nil, err
2022-01-13 23:27:39 +01:00
}
name := strings.TrimSpace(record[0])
// Ensure that sequence name is unique accross a file.
if val, ok := file.names[name]; ok {
file.names[name]++
name = fmt.Sprintf("%s_%d", name, val)
} else {
file.names[name] = 1
}
var sequence []byte
var comment string
if file.version == 2 {
sequence = []byte(strings.TrimSpace(record[20]))
comment = strings.TrimSpace(record[21])
} else {
sequence = []byte(strings.TrimSpace(record[18]))
comment = strings.TrimSpace(record[19])
}
bseq := obiseq.NewBioSequence(name, sequence, comment)
2022-01-13 23:27:39 +01:00
annotation := bseq.Annotations()
annotation["ac"] = name
annotation["seq_length"], _ = strconv.Atoi(strings.TrimSpace(record[1]))
annotation["taxid"], _ = strconv.Atoi(strings.TrimSpace(record[2]))
annotation["rank"] = strings.TrimSpace(record[3])
annotation["species_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[4]))
annotation["species_name"] = strings.TrimSpace(record[5])
annotation["genus_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[6]))
annotation["genus_name"] = strings.TrimSpace(record[7])
annotation["family_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[8]))
annotation["family_name"] = strings.TrimSpace(record[9])
k_m_taxid := file.mode + "_taxid"
k_m_name := file.mode + "_name"
annotation[k_m_taxid], _ = strconv.Atoi(strings.TrimSpace(record[10]))
annotation[k_m_name] = strings.TrimSpace(record[11])
annotation["strand"] = strings.TrimSpace(record[12])
annotation["forward_primer"] = file.forward_primer
annotation["forward_match"] = strings.TrimSpace(record[13])
annotation["forward_mismatch"], _ = strconv.Atoi(strings.TrimSpace(record[14]))
delta := 0
if file.version == 2 {
value, err := strconv.ParseFloat(strings.TrimSpace(record[15]), 64)
if err != nil {
annotation["forward_tm"] = value
} else {
annotation["forward_tm"] = -1
}
delta++
}
annotation["reverse_primer"] = file.reverse_primer
annotation["reverse_match"] = strings.TrimSpace(record[15+delta])
annotation["reverse_mismatch"], _ = strconv.Atoi(strings.TrimSpace(record[16+delta]))
if file.version == 2 {
value, err := strconv.ParseFloat(strings.TrimSpace(record[17+delta]), 64)
if err != nil {
annotation["reverse_tm"] = value
} else {
annotation["reverse_tm"] = -1
}
delta++
}
annotation["amplicon_length"], _ = strconv.Atoi(strings.TrimSpace(record[17+delta]))
return bseq, nil
}
func ReadEcoPCR(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
2022-01-13 23:27:39 +01:00
tag := make([]byte, 11)
n, _ := reader.Read(tag)
version := 1
if n == 11 && string(tag) == "#@ecopcr-v2" {
version = 2
}
line := __readline__(reader)
for !strings.HasPrefix(line, "# direct strand oligo1") {
line = __readline__(reader)
}
forward_primer := (strings.Split(line, " "))[6]
line = __readline__(reader)
for !strings.HasPrefix(line, "# reverse strand oligo2") {
line = __readline__(reader)
}
reverse_primer := (strings.Split(line, " "))[5]
line = __readline__(reader)
for !strings.HasPrefix(line, "# output in") {
line = __readline__(reader)
}
mode := (strings.Split(line, " "))[3]
file := csv.NewReader(reader)
file.Comma = '|'
file.Comment = '#'
file.TrimLeadingSpace = true
file.ReuseRecord = true
log.Printf("EcoPCR file version : %d Mode : %s\n", version, mode)
ecopcr := __ecopcr_file__{
file: reader,
csv: file,
names: make(map[string]int),
version: version,
mode: mode,
forward_primer: forward_primer,
reverse_primer: reverse_primer}
opt := MakeOptions(options)
newIter := obiiter.MakeIBioSequence()
2022-01-14 17:32:12 +01:00
newIter.Add(1)
2022-01-13 23:27:39 +01:00
go func() {
2022-01-14 17:32:12 +01:00
newIter.Wait()
newIter.Close()
2022-01-13 23:27:39 +01:00
}()
go func() {
var err error = nil
var seq *obiseq.BioSequence
2022-01-13 23:27:39 +01:00
seq, err = __read_ecopcr_bioseq__(&ecopcr)
seq.SetSource(opt.Source())
2022-01-13 23:27:39 +01:00
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
i := 0
ii := 0
for err == nil {
slice = append(slice, seq)
ii++
if ii >= opt.BatchSize() {
newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
slice = obiseq.MakeBioSequenceSlice()
2022-01-13 23:27:39 +01:00
i++
ii = 0
}
seq, err = __read_ecopcr_bioseq__(&ecopcr)
if err == nil {
seq.SetSource(opt.Source())
} else if err != io.EOF {
log.Panicf("%+v", err)
}
2022-01-13 23:27:39 +01:00
}
if len(slice) > 0 {
newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
2022-01-13 23:27:39 +01:00
}
2022-01-14 17:32:12 +01:00
newIter.Done()
2022-01-13 23:27:39 +01:00
if err != nil && err != io.EOF {
log.Panicf("%+v", err)
}
}()
if opt.pointer.full_file_batch {
newIter = newIter.CompleteFileIterator()
}
return newIter, nil
2022-01-13 23:27:39 +01:00
}
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
2022-01-13 23:27:39 +01:00
var reader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = obiutils.Ropen(filename)
if err == obiutils.ErrNoContent {
log.Infof("file %s is empty", filename)
return ReadEmptyFile(options...)
}
2022-01-13 23:27:39 +01:00
if err != nil {
log.Printf("open file error: %+v", err)
2023-01-22 22:04:17 +01:00
return obiiter.NilIBioSequence, err
2022-01-13 23:27:39 +01:00
}
return ReadEcoPCR(reader, options...)
2022-01-13 23:27:39 +01:00
}