Refactoring code to remove the buffer size options, and some other changes...

Former-commit-id: 10b57cc1a27446ade3c444217341e9651e89cdce
2023-03-07 11:12:13 +07:00
parent 9811e440b8
commit d88de15cdc
52 changed files with 1172 additions and 421 deletions

View File

@@ -0,0 +1,248 @@
package obiformats

import (
    "bytes"
    "encoding/csv"
    "fmt"
    "io"
    "os"
    "sync"
    "time"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"

    log "github.com/sirupsen/logrus"
)
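
// CSVRecord assembles the CSV fields for a single sequence according to the
// output options: id, count, taxid and scientific name, definition, the
// selected attribute keys, and the sequence and quality columns when requested.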
func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string {
    keys := opt.CSVKeys()
    record := make([]string, 0, len(keys)+4)
    if opt.CSVId() {
        record = append(record, sequence.Id())
    }
    if opt.CSVCount() {
        record = append(record, fmt.Sprint(sequence.Count()))
    }
    if opt.CSVTaxon() {
        taxid := sequence.Taxid()
        sn, ok := sequence.GetAttribute("scientific_name")
        if !ok {
            if taxid == 1 {
                sn = "root"
            } else {
                sn = opt.CSVNAValue()
            }
        }
        record = append(record, fmt.Sprint(taxid), fmt.Sprint(sn))
    }
    if opt.CSVDefinition() {
        record = append(record, sequence.Definition())
    }
    for _, key := range opt.CSVKeys() {
        value, ok := sequence.GetAttribute(key)
        if !ok {
            value = opt.CSVNAValue()
        }
        svalue, _ := goutils.InterfaceToString(value)
        record = append(record, svalue)
    }
    if opt.CSVSequence() {
        record = append(record, string(sequence.Sequence()))
    }
    if opt.CSVQuality() {
        if sequence.HasQualities() {
            l := sequence.Len()
            q := sequence.Qualities()
            ascii := make([]byte, l)
            quality_shift := opt.QualityShift()
            for j := 0; j < l; j++ {
                ascii[j] = uint8(q[j]) + uint8(quality_shift)
            }
            record = append(record, string(ascii))
        } else {
            record = append(record, opt.CSVNAValue())
        }
    }
    return record
}
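
// CSVHeader builds the header row matching the columns produced by CSVRecord.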
func CSVHeader(opt Options) []string {
    keys := opt.CSVKeys()
    record := make([]string, 0, len(keys)+4)
    if opt.CSVId() {
        record = append(record, "id")
    }
    if opt.CSVCount() {
        record = append(record, "count")
    }
    if opt.CSVTaxon() {
        record = append(record, "taxid", "scientific_name")
    }
    if opt.CSVDefinition() {
        record = append(record, "definition")
    }
    record = append(record, opt.CSVKeys()...)
    if opt.CSVSequence() {
        record = append(record, "sequence")
    }
    if opt.CSVQuality() {
        record = append(record, "quality")
    }
    return record
}
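
// FormatCVSBatch serialises a batch of sequences to CSV; the header row is
// only written for the batch of order 0.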
func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte {
    buff := new(bytes.Buffer)
    csv := csv.NewWriter(buff)
    if batch.Order() == 0 {
        csv.Write(CSVHeader(opt))
    }
    for _, s := range batch.Slice() {
        csv.Write(CSVRecord(s, opt))
    }
    csv.Flush()
    return buff.Bytes()
}
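
// WriteCSV formats the sequences of the iterator as CSV, using several
// formatting goroutines and a single writer goroutine that reorders the
// formatted chunks so batches are written to the file in their original order.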
func WriteCSV(iterator obiiter.IBioSequence,
    file io.WriteCloser,
    options ...WithOption) (obiiter.IBioSequence, error) {
    opt := MakeOptions(options)
    file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
    newIter := obiiter.MakeIBioSequence()
    nwriters := opt.ParallelWorkers()
    obiiter.RegisterAPipe()
    chunkchan := make(chan FileChunck)
    newIter.Add(nwriters)
    var waitWriter sync.WaitGroup
    go func() {
        newIter.WaitAndClose()
        for len(chunkchan) > 0 {
            time.Sleep(time.Millisecond)
        }
        close(chunkchan)
        waitWriter.Wait()
    }()
    ff := func(iterator obiiter.IBioSequence) {
        for iterator.Next() {
            batch := iterator.Get()
            chunkchan <- FileChunck{
                FormatCVSBatch(batch, opt),
                batch.Order(),
            }
            newIter.Push(batch)
        }
        newIter.Done()
    }
    log.Debugln("Start of the CSV file writing")
    go ff(iterator)
    for i := 0; i < nwriters-1; i++ {
        go ff(iterator.Split())
    }
    next_to_send := 0
    received := make(map[int]FileChunck, 100)
    waitWriter.Add(1)
    go func() {
        for chunk := range chunkchan {
            if chunk.order == next_to_send {
                file.Write(chunk.text)
                next_to_send++
                chunk, ok := received[next_to_send]
                for ok {
                    file.Write(chunk.text)
                    delete(received, next_to_send)
                    next_to_send++
                    chunk, ok = received[next_to_send]
                }
            } else {
                received[chunk.order] = chunk
            }
        }
        file.Close()
        log.Debugln("End of the CSV file writing")
        obiiter.UnregisterPipe()
        waitWriter.Done()
    }()
    return newIter, nil
}
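
// WriteCSVToStdout writes the CSV output to standard output, which is left open.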
func WriteCSVToStdout(iterator obiiter.IBioSequence,
    options ...WithOption) (obiiter.IBioSequence, error) {
    options = append(options, OptionDontCloseFile())
    return WriteCSV(iterator, os.Stdout, options...)
}
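
// WriteCSVToFile writes the CSV output to the named file; when paired reads
// have to be saved, the paired sequences are written to the paired file name.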
func WriteCSVToFile(iterator obiiter.IBioSequence,
    filename string,
    options ...WithOption) (obiiter.IBioSequence, error) {
    opt := MakeOptions(options)
    flags := os.O_WRONLY | os.O_CREATE
    if opt.AppendFile() {
        flags |= os.O_APPEND
    }
    file, err := os.OpenFile(filename, flags, 0660)
    if err != nil {
        log.Fatalf("open file error: %v", err)
        return obiiter.NilIBioSequence, err
    }
    options = append(options, OptionCloseFile())
    iterator, err = WriteCSV(iterator, file, options...)
    if opt.HaveToSavePaired() {
        var revfile *os.File
        revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
        if err != nil {
            log.Fatalf("open file error: %v", err)
            return obiiter.NilIBioSequence, err
        }
        iterator, err = WriteCSV(iterator.PairedWith(), revfile, options...)
    }
    return iterator, err
}

View File

@@ -166,7 +166,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
     opt := MakeOptions(options)
-    newIter := obiiter.MakeIBioSequence(opt.BufferSize())
+    newIter := obiiter.MakeIBioSequence()
     newIter.Add(1)
     go func() {

View File

@@ -244,9 +244,9 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
 // <CR>?<LF>//<CR>?<LF>
 func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
     opt := MakeOptions(options)
-    entry_channel := make(chan _FileChunk, opt.BufferSize())
+    entry_channel := make(chan _FileChunk)
-    newIter := obiiter.MakeIBioSequence(opt.BufferSize())
+    newIter := obiiter.MakeIBioSequence()
     nworkers := opt.ParallelWorkers()
     newIter.Add(nworkers)

View File

@@ -19,6 +19,5 @@ func IParseFastSeqHeaderBatch(iterator obiiter.IBioSequence,
     options ...WithOption) obiiter.IBioSequence {
     opt := MakeOptions(options)
     return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()),
-        opt.ParallelWorkers(),
-        opt.BufferSize())
+        opt.ParallelWorkers())
 }

View File

@@ -105,7 +105,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
         size = -1
     }
-    newIter := obiiter.MakeIBioSequence(opt.BufferSize())
+    newIter := obiiter.MakeIBioSequence()
     newIter.Add(1)
     go func() {
@@ -127,7 +127,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
 func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
     opt := MakeOptions(options)
-    newIter := obiiter.MakeIBioSequence(opt.BufferSize())
+    newIter := obiiter.MakeIBioSequence()
     newIter.Add(1)

View File

@@ -71,8 +71,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
     file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
-    buffsize := iterator.BufferSize()
-    newIter := obiiter.MakeIBioSequence(buffsize)
+    newIter := obiiter.MakeIBioSequence()
     nwriters := opt.ParallelWorkers()

View File

@@ -60,8 +60,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
     file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
-    buffsize := iterator.BufferSize()
-    newIter := obiiter.MakeIBioSequence(buffsize)
+    newIter := obiiter.MakeIBioSequence()
     nwriters := opt.ParallelWorkers()

View File

@@ -113,9 +113,9 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
 func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
     opt := MakeOptions(options)
-    entry_channel := make(chan _FileChunk, opt.BufferSize())
+    entry_channel := make(chan _FileChunk)
-    newIter := obiiter.MakeIBioSequence(opt.BufferSize())
+    newIter := obiiter.MakeIBioSequence()
     nworkers := opt.ParallelWorkers()
     newIter.Add(nworkers)

View File

@@ -15,10 +15,15 @@ type __options__ struct {
     closefile bool
     appendfile bool
     compressed bool
-    csv_ids bool
-    cvs_sequence bool
+    csv_id bool
+    csv_sequence bool
+    csv_quality bool
+    csv_definition bool
+    csv_count bool
+    csv_taxon bool
+    csv_keys []string
     csv_separator string
     csv_navalue string
     paired_filename string
 }
@@ -40,11 +45,16 @@ func MakeOptions(setters []WithOption) Options {
         closefile: false,
         appendfile: false,
         compressed: false,
-        csv_ids: true,
+        csv_id: true,
+        csv_definition: false,
-        cvs_sequence: true,
+        csv_count: false,
+        csv_taxon: false,
+        csv_sequence: true,
+        csv_quality: false,
         csv_separator: ",",
-        paired_filename: "",
+        csv_navalue: "NA",
+        csv_keys: make([]string, 0),
+        paired_filename: "",
     }
     opt := Options{&o}
@@ -60,10 +70,6 @@ func (opt Options) QualityShift() int {
     return opt.pointer.quality_shift
 }
-func (opt Options) BufferSize() int {
-    return opt.pointer.buffer_size
-}
 func (opt Options) BatchSize() int {
     return opt.pointer.batch_size
 }
@@ -96,8 +102,40 @@ func (opt Options) CompressedFile() bool {
     return opt.pointer.compressed
 }
-func (opt Options) CSVIds() bool {
-    return opt.pointer.csv_ids
+func (opt Options) CSVId() bool {
+    return opt.pointer.csv_id
 }
+func (opt Options) CSVDefinition() bool {
+    return opt.pointer.csv_definition
+}
+func (opt Options) CSVCount() bool {
+    return opt.pointer.csv_count
+}
+func (opt Options) CSVTaxon() bool {
+    return opt.pointer.csv_taxon
+}
+func (opt Options) CSVSequence() bool {
+    return opt.pointer.csv_sequence
+}
+func (opt Options) CSVQuality() bool {
+    return opt.pointer.csv_quality
+}
+func (opt Options) CSVKeys() []string {
+    return opt.pointer.csv_keys
+}
+func (opt Options) CSVSeparator() string {
+    return opt.pointer.csv_separator
+}
+func (opt Options) CSVNAValue() string {
+    return opt.pointer.csv_navalue
+}
 func (opt Options) HaveToSavePaired() bool {
@@ -108,14 +146,6 @@ func (opt Options) PairedFileName() string {
     return opt.pointer.paired_filename
 }
-func OptionsBufferSize(size int) WithOption {
-    f := WithOption(func(opt Options) {
-        opt.pointer.buffer_size = size
-    })
-    return f
-}
 func OptionCloseFile() WithOption {
     f := WithOption(func(opt Options) {
         opt.pointer.closefile = true
@@ -247,3 +277,82 @@ func WritePairedReadsTo(filename string) WithOption {
     return f
 }
+func CSVId(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_id = include
+    })
+    return f
+}
+func CSVSequence(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_sequence = include
+    })
+    return f
+}
+func CSVQuality(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_quality = include
+    })
+    return f
+}
+func CSVDefinition(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_definition = include
+    })
+    return f
+}
+func CSVCount(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_count = include
+    })
+    return f
+}
+func CSVTaxon(include bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_taxon = include
+    })
+    return f
+}
+func CSVKey(key string) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_keys = append(opt.pointer.csv_keys, key)
+    })
+    return f
+}
+func CSVKeys(keys []string) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_keys = append(opt.pointer.csv_keys, keys...)
+    })
+    return f
+}
+func CSVSeparator(separator string) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_separator = separator
+    })
+    return f
+}
+func CSVNAValue(navalue string) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.csv_navalue = navalue
+    })
+    return f
+}