mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 16:50:27 +00:00
Refactoring code to remove the buffer size options. And some other changes...
Former-commit-id: 10b57cc1a27446ade3c444217341e9651e89cdce
This commit is contained in:
248
pkg/obiformats/csv_writer.go
Normal file
248
pkg/obiformats/csv_writer.go
Normal file
@@ -0,0 +1,248 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string {
|
||||
keys := opt.CSVKeys()
|
||||
record := make([]string, 0, len(keys)+4)
|
||||
|
||||
if opt.CSVId() {
|
||||
record = append(record, sequence.Id())
|
||||
}
|
||||
|
||||
if opt.CSVCount() {
|
||||
record = append(record, fmt.Sprint(sequence.Count()))
|
||||
}
|
||||
|
||||
if opt.CSVTaxon() {
|
||||
taxid := sequence.Taxid()
|
||||
sn, ok := sequence.GetAttribute("scientific_name")
|
||||
|
||||
if !ok {
|
||||
if taxid == 1 {
|
||||
sn = "root"
|
||||
} else {
|
||||
sn = opt.CSVNAValue()
|
||||
}
|
||||
}
|
||||
|
||||
record = append(record, fmt.Sprint(taxid), fmt.Sprint(sn))
|
||||
}
|
||||
|
||||
if opt.CSVDefinition() {
|
||||
record = append(record, sequence.Definition())
|
||||
}
|
||||
|
||||
for _, key := range opt.CSVKeys() {
|
||||
value, ok := sequence.GetAttribute(key)
|
||||
if !ok {
|
||||
value = opt.CSVNAValue()
|
||||
}
|
||||
|
||||
svalue, _ := goutils.InterfaceToString(value)
|
||||
record = append(record, svalue)
|
||||
}
|
||||
|
||||
if opt.CSVSequence() {
|
||||
record = append(record, string(sequence.Sequence()))
|
||||
}
|
||||
|
||||
if opt.CSVQuality() {
|
||||
if sequence.HasQualities() {
|
||||
l := sequence.Len()
|
||||
q := sequence.Qualities()
|
||||
ascii := make([]byte, l)
|
||||
quality_shift := opt.QualityShift()
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
record = append(record, string(ascii))
|
||||
} else {
|
||||
record = append(record, opt.CSVNAValue())
|
||||
}
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func CSVHeader(opt Options) []string {
|
||||
keys := opt.CSVKeys()
|
||||
record := make([]string, 0, len(keys)+4)
|
||||
|
||||
if opt.CSVId() {
|
||||
record = append(record, "id")
|
||||
}
|
||||
|
||||
if opt.CSVCount() {
|
||||
record = append(record, "count")
|
||||
}
|
||||
|
||||
if opt.CSVTaxon() {
|
||||
record = append(record, "taxid", "scientific_name")
|
||||
}
|
||||
|
||||
if opt.CSVDefinition() {
|
||||
record = append(record, "definition")
|
||||
}
|
||||
|
||||
record = append(record, opt.CSVKeys()...)
|
||||
|
||||
if opt.CSVSequence() {
|
||||
record = append(record, "sequence")
|
||||
}
|
||||
|
||||
if opt.CSVQuality() {
|
||||
record = append(record, "quality")
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte {
|
||||
buff := new(bytes.Buffer)
|
||||
csv := csv.NewWriter(buff)
|
||||
|
||||
if batch.Order() == 0 {
|
||||
csv.Write(CSVHeader(opt))
|
||||
}
|
||||
for _, s := range batch.Slice() {
|
||||
csv.Write(CSVRecord(s, opt))
|
||||
}
|
||||
|
||||
csv.Flush()
|
||||
|
||||
return buff.Bytes()
|
||||
}
|
||||
|
||||
func WriteCSV(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
opt := MakeOptions(options)
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
obiiter.RegisterAPipe()
|
||||
chunkchan := make(chan FileChunck)
|
||||
|
||||
newIter.Add(nwriters)
|
||||
var waitWriter sync.WaitGroup
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
for len(chunkchan) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
waitWriter.Wait()
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
for iterator.Next() {
|
||||
|
||||
batch := iterator.Get()
|
||||
|
||||
chunkchan <- FileChunck{
|
||||
FormatCVSBatch(batch, opt),
|
||||
batch.Order(),
|
||||
}
|
||||
newIter.Push(batch)
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Debugln("Start of the CSV file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]FileChunck, 100)
|
||||
|
||||
waitWriter.Add(1)
|
||||
go func() {
|
||||
for chunk := range chunkchan {
|
||||
if chunk.order == next_to_send {
|
||||
file.Write(chunk.text)
|
||||
next_to_send++
|
||||
chunk, ok := received[next_to_send]
|
||||
for ok {
|
||||
file.Write(chunk.text)
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
chunk, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[chunk.order] = chunk
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
log.Debugln("End of the CSV file writing")
|
||||
obiiter.UnregisterPipe()
|
||||
waitWriter.Done()
|
||||
|
||||
}()
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
func WriteCSVToStdout(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionDontCloseFile())
|
||||
return WriteCSV(iterator, os.Stdout, options...)
|
||||
}
|
||||
|
||||
func WriteCSVToFile(iterator obiiter.IBioSequence,
|
||||
filename string,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
flags := os.O_WRONLY | os.O_CREATE
|
||||
|
||||
if opt.AppendFile() {
|
||||
flags |= os.O_APPEND
|
||||
}
|
||||
file, err := os.OpenFile(filename, flags, 0660)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
options = append(options, OptionCloseFile())
|
||||
|
||||
iterator, err = WriteCSV(iterator, file, options...)
|
||||
|
||||
if opt.HaveToSavePaired() {
|
||||
var revfile *os.File
|
||||
|
||||
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
iterator, err = WriteCSV(iterator.PairedWith(), revfile, options...)
|
||||
}
|
||||
|
||||
return iterator, err
|
||||
}
|
||||
@@ -166,7 +166,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
|
||||
@@ -244,9 +244,9 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
|
||||
// <CR>?<LF>//<CR>?<LF>
|
||||
func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
entry_channel := make(chan _FileChunk, opt.BufferSize())
|
||||
entry_channel := make(chan _FileChunk)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nworkers := opt.ParallelWorkers()
|
||||
newIter.Add(nworkers)
|
||||
|
||||
@@ -19,6 +19,5 @@ func IParseFastSeqHeaderBatch(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()),
|
||||
opt.ParallelWorkers(),
|
||||
opt.BufferSize())
|
||||
opt.ParallelWorkers())
|
||||
}
|
||||
|
||||
@@ -105,7 +105,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
size = -1
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
@@ -127,7 +127,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
|
||||
func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
||||
@@ -71,8 +71,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := obiiter.MakeIBioSequence(buffsize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
|
||||
@@ -60,8 +60,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := obiiter.MakeIBioSequence(buffsize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
|
||||
@@ -113,9 +113,9 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
||||
|
||||
func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
entry_channel := make(chan _FileChunk, opt.BufferSize())
|
||||
entry_channel := make(chan _FileChunk)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nworkers := opt.ParallelWorkers()
|
||||
newIter.Add(nworkers)
|
||||
|
||||
@@ -15,10 +15,15 @@ type __options__ struct {
|
||||
closefile bool
|
||||
appendfile bool
|
||||
compressed bool
|
||||
csv_ids bool
|
||||
cvs_sequence bool
|
||||
csv_id bool
|
||||
csv_sequence bool
|
||||
csv_quality bool
|
||||
csv_definition bool
|
||||
csv_count bool
|
||||
csv_taxon bool
|
||||
csv_keys []string
|
||||
csv_separator string
|
||||
csv_navalue string
|
||||
paired_filename string
|
||||
}
|
||||
|
||||
@@ -40,11 +45,16 @@ func MakeOptions(setters []WithOption) Options {
|
||||
closefile: false,
|
||||
appendfile: false,
|
||||
compressed: false,
|
||||
csv_ids: true,
|
||||
csv_id: true,
|
||||
csv_definition: false,
|
||||
cvs_sequence: true,
|
||||
csv_count: false,
|
||||
csv_taxon: false,
|
||||
csv_sequence: true,
|
||||
csv_quality: false,
|
||||
csv_separator: ",",
|
||||
paired_filename: "",
|
||||
csv_navalue: "NA",
|
||||
csv_keys: make([]string, 0),
|
||||
paired_filename: "",
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
@@ -60,10 +70,6 @@ func (opt Options) QualityShift() int {
|
||||
return opt.pointer.quality_shift
|
||||
}
|
||||
|
||||
func (opt Options) BufferSize() int {
|
||||
return opt.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (opt Options) BatchSize() int {
|
||||
return opt.pointer.batch_size
|
||||
}
|
||||
@@ -96,8 +102,40 @@ func (opt Options) CompressedFile() bool {
|
||||
return opt.pointer.compressed
|
||||
}
|
||||
|
||||
func (opt Options) CSVIds() bool {
|
||||
return opt.pointer.csv_ids
|
||||
func (opt Options) CSVId() bool {
|
||||
return opt.pointer.csv_id
|
||||
}
|
||||
|
||||
func (opt Options) CSVDefinition() bool {
|
||||
return opt.pointer.csv_definition
|
||||
}
|
||||
|
||||
func (opt Options) CSVCount() bool {
|
||||
return opt.pointer.csv_count
|
||||
}
|
||||
|
||||
func (opt Options) CSVTaxon() bool {
|
||||
return opt.pointer.csv_taxon
|
||||
}
|
||||
|
||||
func (opt Options) CSVSequence() bool {
|
||||
return opt.pointer.csv_sequence
|
||||
}
|
||||
|
||||
func (opt Options) CSVQuality() bool {
|
||||
return opt.pointer.csv_quality
|
||||
}
|
||||
|
||||
func (opt Options) CSVKeys() []string {
|
||||
return opt.pointer.csv_keys
|
||||
}
|
||||
|
||||
func (opt Options) CSVSeparator() string {
|
||||
return opt.pointer.csv_separator
|
||||
}
|
||||
|
||||
func (opt Options) CSVNAValue() string {
|
||||
return opt.pointer.csv_navalue
|
||||
}
|
||||
|
||||
func (opt Options) HaveToSavePaired() bool {
|
||||
@@ -108,14 +146,6 @@ func (opt Options) PairedFileName() string {
|
||||
return opt.pointer.paired_filename
|
||||
}
|
||||
|
||||
func OptionsBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.buffer_size = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionCloseFile() WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.closefile = true
|
||||
@@ -247,3 +277,82 @@ func WritePairedReadsTo(filename string) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVId(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_id = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVSequence(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_sequence = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVQuality(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_quality = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVDefinition(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_definition = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVCount(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_count = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVTaxon(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_taxon = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVKey(key string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_keys = append(opt.pointer.csv_keys, key)
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVKeys(keys []string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_keys = append(opt.pointer.csv_keys, keys...)
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVSeparator(separator string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_separator = separator
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVNAValue(navalue string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_navalue = navalue
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user