mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00

modified: cmd/obitools/obitag/main.go modified: cmd/obitools/obitaxonomy/main.go modified: pkg/obiformats/csvtaxdump_read.go modified: pkg/obiformats/ecopcr_read.go modified: pkg/obiformats/ncbitaxdump_read.go modified: pkg/obiformats/ncbitaxdump_readtar.go modified: pkg/obiformats/newick_write.go modified: pkg/obiformats/options.go modified: pkg/obiformats/taxonomy_read.go modified: pkg/obiformats/universal_read.go modified: pkg/obiiter/extract_taxonomy.go modified: pkg/obioptions/options.go modified: pkg/obioptions/version.go new file: pkg/obiphylo/tree.go modified: pkg/obiseq/biosequenceslice.go modified: pkg/obiseq/taxonomy_methods.go modified: pkg/obitax/taxonomy.go modified: pkg/obitax/taxonset.go modified: pkg/obitools/obiconvert/sequence_reader.go modified: pkg/obitools/obitag/obitag.go modified: pkg/obitools/obitaxonomy/obitaxonomy.go modified: pkg/obitools/obitaxonomy/options.go deleted: sample/.DS_Store
243 lines
5.9 KiB
Go
243 lines
5.9 KiB
Go
package obiformats
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"fmt"
|
|
"io"
|
|
"path"
|
|
"strconv"
|
|
"strings"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
)
|
|
|
|
// __ecopcr_file__ bundles the parsing state of one ecoPCR result stream:
// the reader it comes from, the CSV decoder used for the records and the
// values extracted from the header that are copied onto every sequence.
type __ecopcr_file__ struct {
	// file is the reader the ecoPCR data are read from.
	file io.Reader
	// csv decodes the record lines of the stream.
	csv *csv.Reader
	// names counts how many times each sequence name has been met so far;
	// it is used to make duplicated names unique.
	names map[string]int
	// version is the ecoPCR format version (2 when the "#@ecopcr-v2" tag
	// is present, 1 otherwise).
	version int
	// mode is the value read from the "# output in" header line; it
	// prefixes the "<mode>_taxid" and "<mode>_name" annotation keys.
	mode string
	// forward_primer and reverse_primer are the primers read from the
	// header; they are stored in the annotations of every sequence.
	forward_primer, reverse_primer string
}
|
|
|
|
// __readline__ reads stream one byte at a time and returns the bytes met
// before the next '\n' (the newline itself is consumed but not returned).
//
// Reading also stops when the stream reports an error or returns no data,
// in which case the bytes accumulated so far are returned; at end of stream
// the result is therefore the empty string. The byte-wise read guarantees
// that nothing beyond the end of the line is consumed from stream.
//
// The line buffer grows on demand, so lines of any length are supported.
func __readline__(stream io.Reader) string {
	line := make([]byte, 0, 1024)
	char := make([]byte, 1)

	for {
		n, err := stream.Read(char)
		if n == 1 {
			if char[0] == '\n' {
				break
			}
			line = append(line, char[0])
		}
		// A reader may return its last byte together with an error:
		// the byte is kept above, then the scan stops here.
		if err != nil || n == 0 {
			break
		}
	}

	return string(line)
}
|
|
|
|
func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error) {
|
|
|
|
record, err := file.csv.Read()
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
name := strings.TrimSpace(record[0])
|
|
|
|
// Ensure that sequence name is unique accross a file.
|
|
if val, ok := file.names[name]; ok {
|
|
file.names[name]++
|
|
name = fmt.Sprintf("%s_%d", name, val)
|
|
} else {
|
|
file.names[name] = 1
|
|
}
|
|
|
|
var sequence []byte
|
|
var comment string
|
|
|
|
if file.version == 2 {
|
|
sequence = []byte(strings.TrimSpace(record[20]))
|
|
comment = strings.TrimSpace(record[21])
|
|
|
|
} else {
|
|
sequence = []byte(strings.TrimSpace(record[18]))
|
|
comment = strings.TrimSpace(record[19])
|
|
}
|
|
|
|
bseq := obiseq.NewBioSequence(name, sequence, comment)
|
|
annotation := bseq.Annotations()
|
|
|
|
annotation["ac"] = name
|
|
annotation["seq_length"], _ = strconv.Atoi(strings.TrimSpace(record[1]))
|
|
annotation["taxid"], _ = strconv.Atoi(strings.TrimSpace(record[2]))
|
|
annotation["rank"] = strings.TrimSpace(record[3])
|
|
annotation["species_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[4]))
|
|
annotation["species_name"] = strings.TrimSpace(record[5])
|
|
annotation["genus_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[6]))
|
|
annotation["genus_name"] = strings.TrimSpace(record[7])
|
|
annotation["family_taxid"], _ = strconv.Atoi(strings.TrimSpace(record[8]))
|
|
annotation["family_name"] = strings.TrimSpace(record[9])
|
|
k_m_taxid := file.mode + "_taxid"
|
|
k_m_name := file.mode + "_name"
|
|
annotation[k_m_taxid], _ = strconv.Atoi(strings.TrimSpace(record[10]))
|
|
annotation[k_m_name] = strings.TrimSpace(record[11])
|
|
annotation["strand"] = strings.TrimSpace(record[12])
|
|
annotation["forward_primer"] = file.forward_primer
|
|
annotation["forward_match"] = strings.TrimSpace(record[13])
|
|
annotation["forward_mismatch"], _ = strconv.Atoi(strings.TrimSpace(record[14]))
|
|
|
|
delta := 0
|
|
if file.version == 2 {
|
|
value, err := strconv.ParseFloat(strings.TrimSpace(record[15]), 64)
|
|
if err != nil {
|
|
annotation["forward_tm"] = value
|
|
} else {
|
|
annotation["forward_tm"] = -1
|
|
}
|
|
delta++
|
|
}
|
|
|
|
annotation["reverse_primer"] = file.reverse_primer
|
|
annotation["reverse_match"] = strings.TrimSpace(record[15+delta])
|
|
annotation["reverse_mismatch"], _ = strconv.Atoi(strings.TrimSpace(record[16+delta]))
|
|
|
|
if file.version == 2 {
|
|
value, err := strconv.ParseFloat(strings.TrimSpace(record[17+delta]), 64)
|
|
if err != nil {
|
|
annotation["reverse_tm"] = value
|
|
} else {
|
|
annotation["reverse_tm"] = -1
|
|
}
|
|
delta++
|
|
}
|
|
|
|
annotation["amplicon_length"], _ = strconv.Atoi(strings.TrimSpace(record[17+delta]))
|
|
|
|
return bseq, nil
|
|
}
|
|
|
|
func ReadEcoPCR(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
|
tag := make([]byte, 11)
|
|
n, _ := reader.Read(tag)
|
|
|
|
version := 1
|
|
if n == 11 && string(tag) == "#@ecopcr-v2" {
|
|
version = 2
|
|
}
|
|
|
|
line := __readline__(reader)
|
|
for !strings.HasPrefix(line, "# direct strand oligo1") {
|
|
line = __readline__(reader)
|
|
}
|
|
forward_primer := (strings.Split(line, " "))[6]
|
|
|
|
line = __readline__(reader)
|
|
for !strings.HasPrefix(line, "# reverse strand oligo2") {
|
|
line = __readline__(reader)
|
|
}
|
|
reverse_primer := (strings.Split(line, " "))[5]
|
|
|
|
line = __readline__(reader)
|
|
for !strings.HasPrefix(line, "# output in") {
|
|
line = __readline__(reader)
|
|
}
|
|
mode := (strings.Split(line, " "))[3]
|
|
|
|
file := csv.NewReader(reader)
|
|
file.Comma = '|'
|
|
file.Comment = '#'
|
|
file.TrimLeadingSpace = true
|
|
file.ReuseRecord = true
|
|
|
|
log.Printf("EcoPCR file version : %d Mode : %s\n", version, mode)
|
|
|
|
ecopcr := __ecopcr_file__{
|
|
file: reader,
|
|
csv: file,
|
|
names: make(map[string]int),
|
|
version: version,
|
|
mode: mode,
|
|
forward_primer: forward_primer,
|
|
reverse_primer: reverse_primer}
|
|
|
|
opt := MakeOptions(options)
|
|
|
|
newIter := obiiter.MakeIBioSequence()
|
|
newIter.Add(1)
|
|
|
|
go func() {
|
|
newIter.Wait()
|
|
newIter.Close()
|
|
}()
|
|
|
|
go func() {
|
|
var err error = nil
|
|
var seq *obiseq.BioSequence
|
|
|
|
seq, err = __read_ecopcr_bioseq__(&ecopcr)
|
|
seq.SetSource(opt.Source())
|
|
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
|
|
i := 0
|
|
ii := 0
|
|
for err == nil {
|
|
slice = append(slice, seq)
|
|
ii++
|
|
if ii >= opt.BatchSize() {
|
|
newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
|
|
slice = obiseq.MakeBioSequenceSlice()
|
|
i++
|
|
ii = 0
|
|
}
|
|
|
|
seq, err = __read_ecopcr_bioseq__(&ecopcr)
|
|
|
|
if err == nil {
|
|
seq.SetSource(opt.Source())
|
|
} else if err != io.EOF {
|
|
log.Panicf("%+v", err)
|
|
}
|
|
}
|
|
|
|
if len(slice) > 0 {
|
|
newIter.Push(obiiter.MakeBioSequenceBatch(opt.Source(), i, slice))
|
|
}
|
|
|
|
newIter.Done()
|
|
|
|
if err != nil && err != io.EOF {
|
|
log.Panicf("%+v", err)
|
|
}
|
|
|
|
}()
|
|
|
|
if opt.pointer.full_file_batch {
|
|
newIter = newIter.CompleteFileIterator()
|
|
}
|
|
|
|
return newIter, nil
|
|
}
|
|
|
|
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
|
var reader io.Reader
|
|
var err error
|
|
|
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
|
|
|
reader, err = obiutils.Ropen(filename)
|
|
|
|
if err == obiutils.ErrNoContent {
|
|
log.Infof("file %s is empty", filename)
|
|
return ReadEmptyFile(options...)
|
|
}
|
|
|
|
if err != nil {
|
|
log.Printf("open file error: %+v", err)
|
|
return obiiter.NilIBioSequence, err
|
|
}
|
|
|
|
return ReadEcoPCR(reader, options...)
|
|
}
|