mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Refactoring code to remove the buffer size options, and some other changes...
Former-commit-id: 10b57cc1a27446ade3c444217341e9651e89cdce
This commit is contained in:
@ -13,6 +13,7 @@ import (
|
||||
"github.com/barkimedes/go-deepcopy"
|
||||
)
|
||||
|
||||
|
||||
// InterfaceToInt converts an interface{} to an integer value if possible.
|
||||
// If not a "NotAnInteger" error is returned via the err
|
||||
// return value and val is set to 0.
|
||||
@ -302,15 +303,6 @@ func ReadLines(path string) (lines []string, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
// Contains reports whether x is equal to at least one element of arr.
//
// The slice is scanned from its first element and the scan stops at the
// first match. A nil or empty slice contains nothing.
func Contains[T comparable](arr []T, x T) bool {
	for i := 0; i < len(arr); i++ {
		if arr[i] == x {
			return true
		}
	}

	return false
}
|
||||
|
||||
func AtomicCounter(initial ...int) func() int {
|
||||
counterMutex := sync.Mutex{}
|
||||
counter := 0
|
||||
|
24
pkg/goutils/slices.go
Normal file
24
pkg/goutils/slices.go
Normal file
@ -0,0 +1,24 @@
|
||||
package goutils
|
||||
|
||||
|
||||
// Contains reports whether the value x occurs in the slice arr.
//
// Elements are compared with ==, front to back, and the function returns
// as soon as one of them matches. It returns false for a nil or empty slice.
func Contains[T comparable](arr []T, x T) bool {
	found := false

	for i := range arr {
		if arr[i] == x {
			found = true
			break
		}
	}

	return found
}
|
||||
|
||||
// LookFor returns the index of the first element of arr equal to x,
// or -1 when no element matches (including when arr is nil or empty).
func LookFor[T comparable](arr []T, x T) int {
	position := -1

	for i := 0; i < len(arr); i++ {
		if arr[i] == x {
			position = i
			break
		}
	}

	return position
}
|
||||
|
||||
// RemoveIndex returns s without the element located at position index.
//
// The removal is done in place: the elements following index are shifted
// one slot to the left inside the backing array of s, so the slice passed
// by the caller must be considered modified after the call. Only the
// returned slice should be used afterwards.
//
// No element comparison is needed, hence T accepts any type (slices of
// slices, maps or functions included).
//
// The function panics, like any out-of-range slice expression, when index
// is negative or not smaller than len(s).
func RemoveIndex[T any](s []T, index int) []T {
	return append(s[:index], s[index+1:]...)
}
|
@ -13,7 +13,6 @@ type _Options struct {
|
||||
circular bool
|
||||
forwardError int
|
||||
reverseError int
|
||||
bufferSize int
|
||||
batchSize int
|
||||
parallelWorkers int
|
||||
forward ApatPattern
|
||||
@ -66,12 +65,6 @@ func (options Options) Circular() bool {
|
||||
return options.pointer.circular
|
||||
}
|
||||
|
||||
// BufferSize returns the size of the channel
|
||||
// buffer specified by the options
|
||||
func (options Options) BufferSize() int {
|
||||
return options.pointer.bufferSize
|
||||
}
|
||||
|
||||
// BatchSize returns the size of the
|
||||
// sequence batch used by the PCR algorithm
|
||||
func (options Options) BatchSize() int {
|
||||
@ -95,7 +88,6 @@ func MakeOptions(setters []WithOption) Options {
|
||||
circular: false,
|
||||
parallelWorkers: 4,
|
||||
batchSize: 100,
|
||||
bufferSize: 100,
|
||||
forward: NilApatPattern,
|
||||
cfwd: NilApatPattern,
|
||||
reverse: NilApatPattern,
|
||||
@ -188,16 +180,6 @@ func OptionCircular(circular bool) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionBufferSize sets the requested channel
|
||||
// buffer size.
|
||||
func OptionBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.bufferSize = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionParallelWorkers sets how many search
|
||||
// jobs will be run in parallel.
|
||||
func OptionParallelWorkers(nworkers int) WithOption {
|
||||
|
@ -36,20 +36,14 @@ func find(root, ext string) []string {
|
||||
}
|
||||
|
||||
func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
sizes ...int) (obiiter.IBioSequence, error) {
|
||||
classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) {
|
||||
dir, err := tempDir()
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
bufferSize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
bufferSize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(bufferSize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
@ -10,16 +10,9 @@ import (
|
||||
)
|
||||
|
||||
func ISequenceChunk(iterator obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
sizes ...int) (obiiter.IBioSequence, error) {
|
||||
classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) {
|
||||
|
||||
bufferSize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
bufferSize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(bufferSize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
@ -6,7 +6,6 @@ type __options__ struct {
|
||||
navalue string
|
||||
cacheOnDisk bool
|
||||
batchCount int
|
||||
bufferSize int
|
||||
batchSize int
|
||||
parallelWorkers int
|
||||
noSingleton bool
|
||||
@ -25,7 +24,6 @@ func MakeOptions(setters []WithOption) Options {
|
||||
navalue: "NA",
|
||||
cacheOnDisk: false,
|
||||
batchCount: 100,
|
||||
bufferSize: 2,
|
||||
batchSize: 5000,
|
||||
parallelWorkers: 4,
|
||||
noSingleton: false,
|
||||
@ -65,10 +63,6 @@ func (opt Options) BatchCount() int {
|
||||
return opt.pointer.batchCount
|
||||
}
|
||||
|
||||
func (opt Options) BufferSize() int {
|
||||
return opt.pointer.bufferSize
|
||||
}
|
||||
|
||||
func (opt Options) BatchSize() int {
|
||||
return opt.pointer.batchSize
|
||||
}
|
||||
@ -148,14 +142,6 @@ func OptionsBatchSize(size int) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionsBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.bufferSize = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionsNoSingleton() WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.noSingleton = true
|
||||
|
@ -58,20 +58,13 @@ func (by _By) Sort(seqs []sSS) {
|
||||
|
||||
func ISequenceSubChunk(iterator obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
sizes ...int) (obiiter.IBioSequence, error) {
|
||||
nworkers int) (obiiter.IBioSequence, error) {
|
||||
|
||||
bufferSize := iterator.BufferSize()
|
||||
nworkers := 4
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
if nworkers <=0 {
|
||||
nworkers = 4
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
bufferSize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(bufferSize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
|
@ -19,7 +19,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
opts := MakeOptions(options)
|
||||
nworkers := opts.ParallelWorkers()
|
||||
|
||||
iUnique := obiiter.MakeIBioSequence(opts.BufferSize())
|
||||
iUnique := obiiter.MakeIBioSequence()
|
||||
|
||||
iterator = iterator.Speed("Splitting data set")
|
||||
|
||||
@ -28,8 +28,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
if opts.SortOnDisk() {
|
||||
nworkers = 1
|
||||
iterator, err = ISequenceChunkOnDisk(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()),
|
||||
0)
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
@ -37,8 +36,7 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
|
||||
} else {
|
||||
iterator, err = ISequenceChunk(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()),
|
||||
opts.BufferSize())
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
@ -78,12 +76,11 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
icat--
|
||||
input, err = ISequenceSubChunk(input,
|
||||
classifier,
|
||||
1,
|
||||
opts.BufferSize())
|
||||
1)
|
||||
|
||||
var next obiiter.IBioSequence
|
||||
if icat >= 0 {
|
||||
next = obiiter.MakeIBioSequence(opts.BufferSize())
|
||||
next = obiiter.MakeIBioSequence()
|
||||
|
||||
iUnique.Add(1)
|
||||
|
||||
@ -130,7 +127,6 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
|
||||
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
|
||||
opts.StatsOn(),
|
||||
opts.BufferSize(),
|
||||
)
|
||||
|
||||
return iMerged, nil
|
||||
|
248
pkg/obiformats/csv_writer.go
Normal file
248
pkg/obiformats/csv_writer.go
Normal file
@ -0,0 +1,248 @@
|
||||
package obiformats
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func CSVRecord(sequence *obiseq.BioSequence, opt Options) []string {
|
||||
keys := opt.CSVKeys()
|
||||
record := make([]string, 0, len(keys)+4)
|
||||
|
||||
if opt.CSVId() {
|
||||
record = append(record, sequence.Id())
|
||||
}
|
||||
|
||||
if opt.CSVCount() {
|
||||
record = append(record, fmt.Sprint(sequence.Count()))
|
||||
}
|
||||
|
||||
if opt.CSVTaxon() {
|
||||
taxid := sequence.Taxid()
|
||||
sn, ok := sequence.GetAttribute("scientific_name")
|
||||
|
||||
if !ok {
|
||||
if taxid == 1 {
|
||||
sn = "root"
|
||||
} else {
|
||||
sn = opt.CSVNAValue()
|
||||
}
|
||||
}
|
||||
|
||||
record = append(record, fmt.Sprint(taxid), fmt.Sprint(sn))
|
||||
}
|
||||
|
||||
if opt.CSVDefinition() {
|
||||
record = append(record, sequence.Definition())
|
||||
}
|
||||
|
||||
for _, key := range opt.CSVKeys() {
|
||||
value, ok := sequence.GetAttribute(key)
|
||||
if !ok {
|
||||
value = opt.CSVNAValue()
|
||||
}
|
||||
|
||||
svalue, _ := goutils.InterfaceToString(value)
|
||||
record = append(record, svalue)
|
||||
}
|
||||
|
||||
if opt.CSVSequence() {
|
||||
record = append(record, string(sequence.Sequence()))
|
||||
}
|
||||
|
||||
if opt.CSVQuality() {
|
||||
if sequence.HasQualities() {
|
||||
l := sequence.Len()
|
||||
q := sequence.Qualities()
|
||||
ascii := make([]byte, l)
|
||||
quality_shift := opt.QualityShift()
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
record = append(record, string(ascii))
|
||||
} else {
|
||||
record = append(record, opt.CSVNAValue())
|
||||
}
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func CSVHeader(opt Options) []string {
|
||||
keys := opt.CSVKeys()
|
||||
record := make([]string, 0, len(keys)+4)
|
||||
|
||||
if opt.CSVId() {
|
||||
record = append(record, "id")
|
||||
}
|
||||
|
||||
if opt.CSVCount() {
|
||||
record = append(record, "count")
|
||||
}
|
||||
|
||||
if opt.CSVTaxon() {
|
||||
record = append(record, "taxid", "scientific_name")
|
||||
}
|
||||
|
||||
if opt.CSVDefinition() {
|
||||
record = append(record, "definition")
|
||||
}
|
||||
|
||||
record = append(record, opt.CSVKeys()...)
|
||||
|
||||
if opt.CSVSequence() {
|
||||
record = append(record, "sequence")
|
||||
}
|
||||
|
||||
if opt.CSVQuality() {
|
||||
record = append(record, "quality")
|
||||
}
|
||||
|
||||
return record
|
||||
}
|
||||
|
||||
func FormatCVSBatch(batch obiiter.BioSequenceBatch, opt Options) []byte {
|
||||
buff := new(bytes.Buffer)
|
||||
csv := csv.NewWriter(buff)
|
||||
|
||||
if batch.Order() == 0 {
|
||||
csv.Write(CSVHeader(opt))
|
||||
}
|
||||
for _, s := range batch.Slice() {
|
||||
csv.Write(CSVRecord(s, opt))
|
||||
}
|
||||
|
||||
csv.Flush()
|
||||
|
||||
return buff.Bytes()
|
||||
}
|
||||
|
||||
func WriteCSV(iterator obiiter.IBioSequence,
|
||||
file io.WriteCloser,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
opt := MakeOptions(options)
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
obiiter.RegisterAPipe()
|
||||
chunkchan := make(chan FileChunck)
|
||||
|
||||
newIter.Add(nwriters)
|
||||
var waitWriter sync.WaitGroup
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
for len(chunkchan) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
close(chunkchan)
|
||||
waitWriter.Wait()
|
||||
}()
|
||||
|
||||
ff := func(iterator obiiter.IBioSequence) {
|
||||
for iterator.Next() {
|
||||
|
||||
batch := iterator.Get()
|
||||
|
||||
chunkchan <- FileChunck{
|
||||
FormatCVSBatch(batch, opt),
|
||||
batch.Order(),
|
||||
}
|
||||
newIter.Push(batch)
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Debugln("Start of the CSV file writing")
|
||||
go ff(iterator)
|
||||
for i := 0; i < nwriters-1; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]FileChunck, 100)
|
||||
|
||||
waitWriter.Add(1)
|
||||
go func() {
|
||||
for chunk := range chunkchan {
|
||||
if chunk.order == next_to_send {
|
||||
file.Write(chunk.text)
|
||||
next_to_send++
|
||||
chunk, ok := received[next_to_send]
|
||||
for ok {
|
||||
file.Write(chunk.text)
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
chunk, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[chunk.order] = chunk
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
file.Close()
|
||||
|
||||
log.Debugln("End of the CSV file writing")
|
||||
obiiter.UnregisterPipe()
|
||||
waitWriter.Done()
|
||||
|
||||
}()
|
||||
|
||||
return newIter, nil
|
||||
}
|
||||
|
||||
func WriteCSVToStdout(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
options = append(options, OptionDontCloseFile())
|
||||
return WriteCSV(iterator, os.Stdout, options...)
|
||||
}
|
||||
|
||||
func WriteCSVToFile(iterator obiiter.IBioSequence,
|
||||
filename string,
|
||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
flags := os.O_WRONLY | os.O_CREATE
|
||||
|
||||
if opt.AppendFile() {
|
||||
flags |= os.O_APPEND
|
||||
}
|
||||
file, err := os.OpenFile(filename, flags, 0660)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
options = append(options, OptionCloseFile())
|
||||
|
||||
iterator, err = WriteCSV(iterator, file, options...)
|
||||
|
||||
if opt.HaveToSavePaired() {
|
||||
var revfile *os.File
|
||||
|
||||
revfile, err = os.OpenFile(opt.PairedFileName(), flags, 0660)
|
||||
if err != nil {
|
||||
log.Fatalf("open file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
iterator, err = WriteCSV(iterator.PairedWith(), revfile, options...)
|
||||
}
|
||||
|
||||
return iterator, err
|
||||
}
|
@ -166,7 +166,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
|
@ -244,9 +244,9 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
|
||||
// <CR>?<LF>//<CR>?<LF>
|
||||
func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
entry_channel := make(chan _FileChunk, opt.BufferSize())
|
||||
entry_channel := make(chan _FileChunk)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nworkers := opt.ParallelWorkers()
|
||||
newIter.Add(nworkers)
|
||||
|
@ -19,6 +19,5 @@ func IParseFastSeqHeaderBatch(iterator obiiter.IBioSequence,
|
||||
options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
return iterator.MakeIWorker(obiseq.AnnotatorToSeqWorker(opt.ParseFastSeqHeader()),
|
||||
opt.ParallelWorkers(),
|
||||
opt.BufferSize())
|
||||
opt.ParallelWorkers())
|
||||
}
|
||||
|
@ -105,7 +105,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
size = -1
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
@ -127,7 +127,7 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
||||
|
||||
func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
@ -71,8 +71,7 @@ func WriteFasta(iterator obiiter.IBioSequence,
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := obiiter.MakeIBioSequence(buffsize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
|
@ -60,8 +60,7 @@ func WriteFastq(iterator obiiter.IBioSequence,
|
||||
|
||||
file, _ = goutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile())
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := obiiter.MakeIBioSequence(buffsize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nwriters := opt.ParallelWorkers()
|
||||
|
||||
|
@ -113,9 +113,9 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
||||
|
||||
func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
||||
opt := MakeOptions(options)
|
||||
entry_channel := make(chan _FileChunk, opt.BufferSize())
|
||||
entry_channel := make(chan _FileChunk)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(opt.BufferSize())
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
nworkers := opt.ParallelWorkers()
|
||||
newIter.Add(nworkers)
|
||||
|
@ -15,10 +15,15 @@ type __options__ struct {
|
||||
closefile bool
|
||||
appendfile bool
|
||||
compressed bool
|
||||
csv_ids bool
|
||||
cvs_sequence bool
|
||||
csv_id bool
|
||||
csv_sequence bool
|
||||
csv_quality bool
|
||||
csv_definition bool
|
||||
csv_count bool
|
||||
csv_taxon bool
|
||||
csv_keys []string
|
||||
csv_separator string
|
||||
csv_navalue string
|
||||
paired_filename string
|
||||
}
|
||||
|
||||
@ -40,11 +45,16 @@ func MakeOptions(setters []WithOption) Options {
|
||||
closefile: false,
|
||||
appendfile: false,
|
||||
compressed: false,
|
||||
csv_ids: true,
|
||||
csv_id: true,
|
||||
csv_definition: false,
|
||||
cvs_sequence: true,
|
||||
csv_count: false,
|
||||
csv_taxon: false,
|
||||
csv_sequence: true,
|
||||
csv_quality: false,
|
||||
csv_separator: ",",
|
||||
paired_filename: "",
|
||||
csv_navalue: "NA",
|
||||
csv_keys: make([]string, 0),
|
||||
paired_filename: "",
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
@ -60,10 +70,6 @@ func (opt Options) QualityShift() int {
|
||||
return opt.pointer.quality_shift
|
||||
}
|
||||
|
||||
func (opt Options) BufferSize() int {
|
||||
return opt.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (opt Options) BatchSize() int {
|
||||
return opt.pointer.batch_size
|
||||
}
|
||||
@ -96,8 +102,40 @@ func (opt Options) CompressedFile() bool {
|
||||
return opt.pointer.compressed
|
||||
}
|
||||
|
||||
func (opt Options) CSVIds() bool {
|
||||
return opt.pointer.csv_ids
|
||||
func (opt Options) CSVId() bool {
|
||||
return opt.pointer.csv_id
|
||||
}
|
||||
|
||||
func (opt Options) CSVDefinition() bool {
|
||||
return opt.pointer.csv_definition
|
||||
}
|
||||
|
||||
func (opt Options) CSVCount() bool {
|
||||
return opt.pointer.csv_count
|
||||
}
|
||||
|
||||
func (opt Options) CSVTaxon() bool {
|
||||
return opt.pointer.csv_taxon
|
||||
}
|
||||
|
||||
func (opt Options) CSVSequence() bool {
|
||||
return opt.pointer.csv_sequence
|
||||
}
|
||||
|
||||
func (opt Options) CSVQuality() bool {
|
||||
return opt.pointer.csv_quality
|
||||
}
|
||||
|
||||
func (opt Options) CSVKeys() []string {
|
||||
return opt.pointer.csv_keys
|
||||
}
|
||||
|
||||
func (opt Options) CSVSeparator() string {
|
||||
return opt.pointer.csv_separator
|
||||
}
|
||||
|
||||
func (opt Options) CSVNAValue() string {
|
||||
return opt.pointer.csv_navalue
|
||||
}
|
||||
|
||||
func (opt Options) HaveToSavePaired() bool {
|
||||
@ -108,14 +146,6 @@ func (opt Options) PairedFileName() string {
|
||||
return opt.pointer.paired_filename
|
||||
}
|
||||
|
||||
func OptionsBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.buffer_size = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionCloseFile() WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.closefile = true
|
||||
@ -247,3 +277,82 @@ func WritePairedReadsTo(filename string) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVId(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_id = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVSequence(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_sequence = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVQuality(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_quality = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVDefinition(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_definition = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVCount(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_count = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVTaxon(include bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_taxon = include
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVKey(key string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_keys = append(opt.pointer.csv_keys, key)
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVKeys(keys []string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_keys = append(opt.pointer.csv_keys, keys...)
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVSeparator(separator string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_separator = separator
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func CSVNAValue(navalue string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.csv_navalue = navalue
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
@ -60,17 +60,11 @@ type IBioSequence struct {
|
||||
var NilIBioSequence = IBioSequence{pointer: nil}
|
||||
|
||||
func MakeIBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := int32(0)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = int32(sizes[0])
|
||||
}
|
||||
|
||||
i := _IBioSequence{
|
||||
channel: make(chan BioSequenceBatch, buffsize),
|
||||
channel: make(chan BioSequenceBatch),
|
||||
current: NilBioSequenceBatch,
|
||||
pushBack: abool.New(),
|
||||
buffer_size: buffsize,
|
||||
batch_size: -1,
|
||||
sequence_format: "",
|
||||
finished: abool.New(),
|
||||
@ -160,14 +154,6 @@ func (iterator IBioSequence) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) BufferSize() int {
|
||||
if iterator.pointer == nil {
|
||||
log.Panic("call of IBioSequenceBatch.BufferSize method on NilIBioSequenceBatch")
|
||||
}
|
||||
|
||||
return int(atomic.LoadInt32(&iterator.pointer.buffer_size))
|
||||
}
|
||||
|
||||
func (iterator IBioSequence) BatchSize() int {
|
||||
if iterator.pointer == nil {
|
||||
log.Panic("call of IBioSequenceBatch.BatchSize method on NilIBioSequenceBatch")
|
||||
@ -279,13 +265,8 @@ func (iterator IBioSequence) Finished() bool {
|
||||
|
||||
// Sorting the batches of sequences.
|
||||
func (iterator IBioSequence) SortBatches(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
@ -338,8 +319,7 @@ func (iterator IBioSequence) Concat(iterators ...IBioSequence) IBioSequence {
|
||||
allPaired = allPaired && i.IsPaired()
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
@ -396,8 +376,7 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
|
||||
}
|
||||
|
||||
nextCounter := goutils.AtomicCounter()
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(niterator)
|
||||
|
||||
@ -431,13 +410,8 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
|
||||
// indicated in parameter. Rebatching implies to sort the
|
||||
// source IBioSequenceBatch.
|
||||
func (iterator IBioSequence) Rebatch(size int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
@ -532,14 +506,9 @@ func (iterator IBioSequence) Count(recycle bool) (int, int, int) {
|
||||
// iterator following the predicate value.
|
||||
func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
|
||||
size int, sizes ...int) (IBioSequence, IBioSequence) {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequence(buffsize)
|
||||
falseIter := MakeIBioSequence(buffsize)
|
||||
trueIter := MakeIBioSequence()
|
||||
falseIter := MakeIBioSequence()
|
||||
|
||||
trueIter.Add(1)
|
||||
falseIter.Add(1)
|
||||
@ -604,18 +573,13 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate,
|
||||
// A function that takes a predicate and a batch of sequences and returns a filtered batch of sequences.
|
||||
func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
|
||||
size int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
nworkers := 4
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequence(buffsize)
|
||||
trueIter := MakeIBioSequence()
|
||||
|
||||
trueIter.Add(nworkers)
|
||||
|
||||
@ -661,18 +625,13 @@ func (iterator IBioSequence) FilterOn(predicate obiseq.SequencePredicate,
|
||||
|
||||
func (iterator IBioSequence) FilterAnd(predicate obiseq.SequencePredicate,
|
||||
size int, sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
nworkers := 4
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequence(buffsize)
|
||||
trueIter := MakeIBioSequence()
|
||||
|
||||
trueIter.Add(nworkers)
|
||||
|
||||
@ -740,13 +699,7 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
|
||||
func IBatchOver(data obiseq.BioSequenceSlice,
|
||||
size int, sizes ...int) IBioSequence {
|
||||
|
||||
buffsize := 0
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
@ -36,7 +36,6 @@ func (dist *IDistribute) Classifier() *obiseq.BioSequenceClassifier {
|
||||
|
||||
func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, sizes ...int) IDistribute {
|
||||
batchsize := 5000
|
||||
buffsize := 2
|
||||
|
||||
outputs := make(map[int]IBioSequence, 100)
|
||||
slices := make(map[int]*obiseq.BioSequenceSlice, 100)
|
||||
@ -47,9 +46,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
|
||||
jobDone := sync.WaitGroup{}
|
||||
lock := sync.Mutex{}
|
||||
@ -80,7 +77,7 @@ func (iterator IBioSequence) Distribute(class *obiseq.BioSequenceClassifier, siz
|
||||
orders[key] = 0
|
||||
|
||||
lock.Lock()
|
||||
outputs[key] = MakeIBioSequence(buffsize)
|
||||
outputs[key] = MakeIBioSequence()
|
||||
lock.Unlock()
|
||||
|
||||
news <- key
|
||||
|
@ -4,16 +4,12 @@ import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
|
||||
func (iterator IBioSequence) IMergeSequenceBatch(na string, statsOn []string, sizes ...int) IBioSequence {
|
||||
batchsize := 100
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
|
@ -6,7 +6,6 @@ import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
|
||||
// That method allows for applying a SeqWorker function on every sequences.
|
||||
//
|
||||
// Sequences are provided by the iterator and modified sequences are pushed
|
||||
@ -17,17 +16,12 @@ import (
|
||||
// - The second the size of the chanel buffer. By default set to the same value than the input buffer.
|
||||
func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int) IBioSequence {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
@ -64,17 +58,12 @@ func (iterator IBioSequence) MakeIWorker(worker obiseq.SeqWorker, sizes ...int)
|
||||
func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePredicate,
|
||||
worker obiseq.SeqWorker, sizes ...int) IBioSequence {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
@ -112,17 +101,12 @@ func (iterator IBioSequence) MakeIConditionalWorker(predicate obiseq.SequencePre
|
||||
|
||||
func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, sizes ...int) IBioSequence {
|
||||
nworkers := 4
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
newIter := MakeIBioSequence()
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
@ -140,7 +124,7 @@ func (iterator IBioSequence) MakeISliceWorker(worker obiseq.SeqSliceWorker, size
|
||||
newIter.Done()
|
||||
}
|
||||
|
||||
log.Printf("Start of the batch slice workers on %d workers (buffer : %d)\n", nworkers, buffsize)
|
||||
log.Printf("Start of the batch slice workers on %d workers\n", nworkers)
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split())
|
||||
}
|
||||
@ -168,4 +152,3 @@ func SliceWorkerPipe(worker obiseq.SeqSliceWorker, sizes ...int) Pipeable {
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
|
@ -11,7 +11,6 @@ type _Options struct {
|
||||
withProgressBar bool
|
||||
parallelWorkers int
|
||||
batchSize int
|
||||
bufferSize int
|
||||
}
|
||||
|
||||
// Options stores a set of option usable by the
|
||||
@ -56,16 +55,6 @@ func OptionAllowedMismatches(count int) WithOption {
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionBufferSize sets the requested channel
|
||||
// buffer size.
|
||||
func OptionBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.bufferSize = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionParallelWorkers sets how many search
|
||||
// jobs will be run in parallel.
|
||||
func OptionParallelWorkers(nworkers int) WithOption {
|
||||
@ -102,12 +91,6 @@ func (options Options) WithProgressBar() bool {
|
||||
return options.pointer.withProgressBar
|
||||
}
|
||||
|
||||
// BufferSize returns the size of the channel
|
||||
// buffer specified by the options
|
||||
func (options Options) BufferSize() int {
|
||||
return options.pointer.bufferSize
|
||||
}
|
||||
|
||||
// BatchSize returns the size of the
|
||||
// sequence batch used by the PCR algorithm
|
||||
func (options Options) BatchSize() int {
|
||||
@ -130,7 +113,6 @@ func MakeOptions(setters []WithOption) Options {
|
||||
withProgressBar: false,
|
||||
parallelWorkers: 4,
|
||||
batchSize: 1000,
|
||||
bufferSize: 100,
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
|
@ -11,12 +11,11 @@ import (
|
||||
)
|
||||
|
||||
var _Debug = false
|
||||
var _ParallelWorkers = runtime.NumCPU() * 2 - 1
|
||||
var _ParallelWorkers = runtime.NumCPU()*2 - 1
|
||||
var _MaxAllowedCPU = runtime.NumCPU()
|
||||
var _BufferSize = 1
|
||||
var _BatchSize = 5000
|
||||
|
||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string, error)
|
||||
type ArgumentParser func([]string) (*getoptions.GetOpt, []string)
|
||||
|
||||
func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser {
|
||||
|
||||
@ -38,16 +37,20 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
||||
o(options)
|
||||
}
|
||||
|
||||
return func(args []string) (*getoptions.GetOpt, []string, error) {
|
||||
return func(args []string) (*getoptions.GetOpt, []string) {
|
||||
|
||||
remaining, err := options.Parse(args[1:])
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Error on the commande line : %v",err)
|
||||
}
|
||||
|
||||
// Setup the maximum number of CPU usable by the program
|
||||
runtime.GOMAXPROCS(_MaxAllowedCPU)
|
||||
if options.Called("max-cpu") {
|
||||
log.Printf("CPU number limited to %d", _MaxAllowedCPU)
|
||||
if ! options.Called("workers") {
|
||||
_ParallelWorkers=_MaxAllowedCPU * 2 - 1
|
||||
if !options.Called("workers") {
|
||||
_ParallelWorkers = _MaxAllowedCPU*2 - 1
|
||||
log.Printf("Number of workers set %d", _ParallelWorkers)
|
||||
}
|
||||
}
|
||||
@ -67,7 +70,7 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
||||
log.Debugln("Switch to debug level logging")
|
||||
}
|
||||
|
||||
return options, remaining, err
|
||||
return options, remaining
|
||||
}
|
||||
}
|
||||
|
||||
@ -88,11 +91,6 @@ func CLIMaxCPU() int {
|
||||
return _MaxAllowedCPU
|
||||
}
|
||||
|
||||
// CLIBufferSize returns the expeted channel buffer size for obitools
|
||||
func CLIBufferSize() int {
|
||||
return _BufferSize
|
||||
}
|
||||
|
||||
// CLIBatchSize returns the expected size of the sequence batches
|
||||
func CLIBatchSize() int {
|
||||
return _BatchSize
|
||||
|
@ -8,6 +8,15 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func (s *BioSequence) HasAttribute(key string) bool {
|
||||
ok := s.annotations != nil
|
||||
|
||||
if ok {
|
||||
_, ok = s.annotations[key]
|
||||
}
|
||||
|
||||
return ok
|
||||
}
|
||||
// A method that returns the value of the key in the annotation map.
|
||||
func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
|
||||
var val interface{}
|
||||
|
@ -278,3 +278,28 @@ func (s *BioSequence) Clear() {
|
||||
s.sequence = s.sequence[0:0]
|
||||
}
|
||||
|
||||
func (s *BioSequence) Composition() map[byte]int {
|
||||
|
||||
a := 0
|
||||
c := 0
|
||||
g := 0
|
||||
t := 0
|
||||
other := 0
|
||||
for _, char := range s.sequence {
|
||||
switch char {
|
||||
case 'a':
|
||||
a++
|
||||
case 'c':
|
||||
c++
|
||||
case 'g':
|
||||
g++
|
||||
case 't':
|
||||
t++
|
||||
default:
|
||||
other++
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return map[byte]int{'a': a, 'c': c, 'g': g, 't': t, 'o': other}
|
||||
}
|
||||
|
@ -316,3 +316,4 @@ func RotateClassifier(size int) *BioSequenceClassifier {
|
||||
c := BioSequenceClassifier{code, value, reset, clone,"RotateClassifier"}
|
||||
return &c
|
||||
}
|
||||
|
||||
|
@ -4,22 +4,21 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obieval"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func Expression(expression string) func(*BioSequence) (interface{},error) {
|
||||
func Expression(expression string) func(*BioSequence) (interface{}, error) {
|
||||
|
||||
exp, err := obieval.OBILang.NewEvaluable(expression)
|
||||
exp, err := OBILang.NewEvaluable(expression)
|
||||
if err != nil {
|
||||
log.Fatalf("Error in the expression : %s", expression)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) (interface{},error) {
|
||||
f := func(sequence *BioSequence) (interface{}, error) {
|
||||
return exp(context.Background(),
|
||||
map[string]interface{}{
|
||||
"annotations": sequence.Annotations(),
|
||||
"sequence": sequence,
|
||||
"annotations": sequence.Annotations(),
|
||||
"sequence": sequence,
|
||||
},
|
||||
)
|
||||
}
|
||||
@ -30,14 +29,14 @@ func Expression(expression string) func(*BioSequence) (interface{},error) {
|
||||
func EditIdWorker(expression string) SeqWorker {
|
||||
e := Expression(expression)
|
||||
f := func(sequence *BioSequence) *BioSequence {
|
||||
v,err := e(sequence)
|
||||
v, err := e(sequence)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Expression '%s' cannot be evaluated on sequence %s",
|
||||
expression,
|
||||
sequence.Id())
|
||||
}
|
||||
sequence.SetId(fmt.Sprintf("%v",v))
|
||||
sequence.SetId(fmt.Sprintf("%v", v))
|
||||
return sequence
|
||||
}
|
||||
|
||||
@ -47,16 +46,16 @@ func EditIdWorker(expression string) SeqWorker {
|
||||
func EditAttributeWorker(key string, expression string) SeqWorker {
|
||||
e := Expression(expression)
|
||||
f := func(sequence *BioSequence) *BioSequence {
|
||||
v,err := e(sequence)
|
||||
v, err := e(sequence)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Expression '%s' cannot be evaluated on sequence %s",
|
||||
expression,
|
||||
sequence.Id())
|
||||
}
|
||||
sequence.SetAttribute(key,v)
|
||||
sequence.SetAttribute(key, v)
|
||||
return sequence
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
package obieval
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
@ -174,8 +174,19 @@ var OBILang = gval.NewLanguage(
|
||||
log.Fatalf("%v cannot be converted to a boolan value", args[0])
|
||||
}
|
||||
return val, nil
|
||||
}),
|
||||
gval.Function("ifelse", func(args ...interface{}) (interface{}, error) {
|
||||
if args[0].(bool) {
|
||||
return args[1], nil
|
||||
} else {
|
||||
return args[2], nil
|
||||
}
|
||||
}),
|
||||
gval.Function("gcskew", func(args ...interface{}) (interface{}, error) {
|
||||
composition := (args[0].(*BioSequence)).Composition()
|
||||
return float64(composition['g']-composition['c']) / float64(composition['g']+composition['c']), nil
|
||||
}),
|
||||
gval.Function("composition", func(args ...interface{}) (interface{}, error) {
|
||||
return (args[0].(*BioSequence)).Composition(), nil
|
||||
}))
|
||||
|
||||
func Expression(expression string) (gval.Evaluable, error) {
|
||||
return OBILang.NewEvaluable(expression)
|
||||
}
|
@ -5,7 +5,6 @@ import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obieval"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
@ -256,7 +255,7 @@ func IsIdIn(ids ...string) SequencePredicate {
|
||||
|
||||
func ExpressionPredicat(expression string) SequencePredicate {
|
||||
|
||||
exp, err := obieval.OBILang.NewEvaluable(expression)
|
||||
exp, err := OBILang.NewEvaluable(expression)
|
||||
if err != nil {
|
||||
log.Fatalf("Error in the expression : %s", expression)
|
||||
}
|
||||
|
63
pkg/obitools/obicleandb/obicleandb.go
Normal file
63
pkg/obitools/obicleandb/obicleandb.go
Normal file
@ -0,0 +1,63 @@
|
||||
package obicleandb
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obichunk"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
|
||||
)
|
||||
|
||||
func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
var rankPredicate obiseq.SequencePredicate
|
||||
|
||||
options := make([]obichunk.WithOption, 0, 30)
|
||||
|
||||
// Make sequence dereplication with a constraint on the taxid.
|
||||
// To be merged, both sequences must have the same taxid.
|
||||
|
||||
options = append(options,
|
||||
obichunk.OptionBatchCount(100),
|
||||
obichunk.OptionSortOnMemory(),
|
||||
obichunk.OptionSubCategory("taxid"),
|
||||
obichunk.OptionsParallelWorkers(
|
||||
obioptions.CLIParallelWorkers()),
|
||||
obichunk.OptionsBatchSize(
|
||||
obioptions.CLIBatchSize()),
|
||||
obichunk.OptionNAValue("NA"),
|
||||
)
|
||||
|
||||
unique, err := obichunk.IUniqueSequence(itertator, options...)
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
taxonomy := obigrep.CLILoadSelectedTaxonomy()
|
||||
|
||||
if len(obigrep.CLIRequiredRanks()) > 0 {
|
||||
rankPredicate = obigrep.CLIHasRankDefinedPredicate()
|
||||
} else {
|
||||
rankPredicate = taxonomy.HasRequiredRank("species").And(taxonomy.HasRequiredRank("genus")).And(taxonomy.HasRequiredRank("family"))
|
||||
}
|
||||
|
||||
goodTaxa := taxonomy.IsAValidTaxon(CLIUpdateTaxids()).And(rankPredicate)
|
||||
|
||||
usable := unique.FilterOn(goodTaxa,
|
||||
obioptions.CLIBatchSize(),
|
||||
obioptions.CLIParallelWorkers())
|
||||
|
||||
annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
).MakeIWorker(taxonomy.MakeSetGenusWorker(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
)
|
||||
|
||||
// annotated.MakeIConditionalWorker(obiseq.IsMoreAbundantOrEqualTo(3),1000)
|
||||
|
||||
return annotated
|
||||
}
|
@ -60,6 +60,21 @@ func InputOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
}
|
||||
|
||||
func OutputModeOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
|
||||
options.Description("Disable the progress bar printing"))
|
||||
|
||||
options.BoolVar(&__compressed__, "compress", false,
|
||||
options.Alias("Z"),
|
||||
options.Description("Output is compressed"))
|
||||
|
||||
options.StringVar(&__output_file_name__, "out", __output_file_name__,
|
||||
options.Alias("o"),
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("Filename used for saving the output"),
|
||||
)
|
||||
}
|
||||
|
||||
func OutputOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
|
||||
options.Description("Read data following the ecoPCR output format."))
|
||||
@ -73,19 +88,7 @@ func OutputOptionSet(options *getoptions.GetOpt) {
|
||||
options.Alias("O"),
|
||||
options.Description("output FASTA/FASTQ title line annotations follow OBI format."))
|
||||
|
||||
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
|
||||
options.Description("Disable the progress bar printing"))
|
||||
|
||||
options.BoolVar(&__compressed__, "compress", false,
|
||||
options.Alias("Z"),
|
||||
options.Description("Output is compressed"))
|
||||
|
||||
options.StringVar(&__output_file_name__, "out", __output_file_name__,
|
||||
options.Alias("o"),
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("Filename used for saving the output"),
|
||||
)
|
||||
|
||||
OutputModeOptionSet(options)
|
||||
}
|
||||
|
||||
func PairedFilesOptionSet(options *getoptions.GetOpt) {
|
||||
@ -197,4 +200,4 @@ func CLIHasPairedFile() bool {
|
||||
}
|
||||
func CLIPairedFileName() string {
|
||||
return __paired_file_name__
|
||||
}
|
||||
}
|
||||
|
@ -48,6 +48,10 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
||||
strings.HasSuffix(path, "fasta.gz") ||
|
||||
strings.HasSuffix(path, "fastq") ||
|
||||
strings.HasSuffix(path, "fastq.gz") ||
|
||||
strings.HasSuffix(path, "seq") ||
|
||||
strings.HasSuffix(path, "seq.gz") ||
|
||||
strings.HasSuffix(path, "gb") ||
|
||||
strings.HasSuffix(path, "gb.gz") ||
|
||||
strings.HasSuffix(path, "dat") ||
|
||||
strings.HasSuffix(path, "dat.gz") ||
|
||||
strings.HasSuffix(path, "ecopcr") ||
|
||||
@ -82,13 +86,12 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderParser(obiformats.ParseGuessedFastSeqHeader))
|
||||
}
|
||||
|
||||
nworkers := obioptions.CLIParallelWorkers() // / 4
|
||||
nworkers := obioptions.CLIParallelWorkers()
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize()))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
|
||||
|
@ -60,7 +60,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBufferSize(obioptions.CLIBufferSize()))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(CLIOutputQualityShift()))
|
||||
|
61
pkg/obitools/obicsv/obicsv.go
Normal file
61
pkg/obitools/obicsv/obicsv.go
Normal file
@ -0,0 +1,61 @@
|
||||
package obicsv
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
)
|
||||
|
||||
func CLIWriteCSV(iterator obiiter.IBioSequence,
|
||||
terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) {
|
||||
|
||||
if obiconvert.CLIProgressBar() {
|
||||
iterator = iterator.Speed()
|
||||
}
|
||||
|
||||
var newIter obiiter.IBioSequence
|
||||
|
||||
opts := make([]obiformats.WithOption, 0, 10)
|
||||
|
||||
nworkers := obioptions.CLIParallelWorkers() / 4
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()))
|
||||
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
|
||||
|
||||
opts = append(opts, obiformats.CSVId(CLIPrintId()),
|
||||
obiformats.CSVCount(CLIPrintCount()),
|
||||
obiformats.CSVTaxon(CLIPrintTaxon()),
|
||||
obiformats.CSVDefinition(CLIPrintDefinition()),
|
||||
obiformats.CSVKeys(CLIToBeKeptAttributes()),
|
||||
)
|
||||
|
||||
var err error
|
||||
|
||||
if len(filenames) == 0 {
|
||||
newIter, err = obiformats.WriteCSVToStdout(iterator, opts...)
|
||||
} else {
|
||||
newIter, err = obiformats.WriteCSVToFile(iterator, filenames[0], opts...)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Write file error: %v", err)
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
if terminalAction {
|
||||
newIter.Recycle()
|
||||
return obiiter.NilIBioSequence, nil
|
||||
}
|
||||
|
||||
return newIter, nil
|
||||
|
||||
}
|
126
pkg/obitools/obicsv/options.go
Normal file
126
pkg/obitools/obicsv/options.go
Normal file
@ -0,0 +1,126 @@
|
||||
package obicsv
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Package-level state of the CSV command line options. CSVOptionSet
// binds each variable to the flag named in its comment; the value given
// here is the default used when the flag is absent.
var (
	_outputIds        = true              // --ids / -i
	_outputCount      = false             // --count
	_outputTaxon      = false             // --taxon
	_outputSequence   = true              // --sequence / -s
	_outputQuality    = true              // --quality / -q
	_outputDefinition = false             // --definition / -d
	_obipairing       = false             // --obipairing
	_autoColumns      = false             // --auto
	_keepOnly         = make([]string, 0) // --keep / -k, one key per occurrence
	_naValue          = "NA"              // --na-value
)

// _softAttributes maps a tool name to the attribute keys that are added
// to the kept attributes when the flag of that tool is set.
var _softAttributes = map[string][]string{
	"obipairing": {
		"mode",
		"seq_a_single",
		"seq_b_single",
		"ali_dir",
		"score",
		"score_norm",
		"seq_ab_match",
		"pairing_mismatches",
	},
}
|
||||
|
||||
func CSVOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_outputIds, "ids", _outputIds,
|
||||
options.Alias("i"),
|
||||
options.Description("Prints sequence ids in the ouput."))
|
||||
|
||||
options.BoolVar(&_outputSequence, "sequence", _outputSequence,
|
||||
options.Alias("s"),
|
||||
options.Description("Prints sequence itself in the output."))
|
||||
|
||||
options.BoolVar(&_outputQuality, "quality", _outputQuality,
|
||||
options.Alias("q"),
|
||||
options.Description("Prints sequence quality in the output."))
|
||||
|
||||
options.BoolVar(&_outputDefinition, "definition", _outputDefinition,
|
||||
options.Alias("d"),
|
||||
options.Description("Prints sequence definition in the output."))
|
||||
|
||||
options.BoolVar(&_autoColumns, "auto", _autoColumns,
|
||||
options.Description("Based on the first sequences, propose a list of attibutes to print"))
|
||||
|
||||
options.BoolVar(&_outputCount, "count", _outputCount,
|
||||
options.Description("Prints the count attribute in the output"))
|
||||
|
||||
options.BoolVar(&_outputTaxon, "taxon", _outputTaxon,
|
||||
options.Description("Prints the NCBI taxid and its related scientific name"))
|
||||
|
||||
options.BoolVar(&_obipairing, "obipairing", _obipairing,
|
||||
options.Description("Prints the attributes added by obipairing"))
|
||||
|
||||
options.StringSliceVar(&_keepOnly, "keep", 1, 1,
|
||||
options.Alias("k"),
|
||||
options.ArgName("KEY"),
|
||||
options.Description("Keeps only attribute with key <KEY>. Several -k options can be combined."))
|
||||
|
||||
options.StringVar(&_naValue, "na-value", _naValue,
|
||||
options.ArgName("NAVALUE"),
|
||||
options.Description("A string representing non available values in the CSV file."))
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OutputModeOptionSet(options)
|
||||
CSVOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIPrintId() bool {
|
||||
return _outputIds
|
||||
}
|
||||
|
||||
func CLIPrintSequence() bool {
|
||||
return _outputSequence
|
||||
}
|
||||
|
||||
func CLIPrintCount() bool {
|
||||
return _outputCount
|
||||
}
|
||||
func CLIPrintTaxon() bool {
|
||||
return _outputTaxon
|
||||
}
|
||||
func CLIPrintQuality() bool {
|
||||
return _outputQuality
|
||||
}
|
||||
|
||||
func CLIPrintDefinition() bool {
|
||||
return _outputDefinition
|
||||
}
|
||||
|
||||
func CLIAutoColumns() bool {
|
||||
return _autoColumns
|
||||
}
|
||||
|
||||
func CLIHasToBeKeptAttributes() bool {
|
||||
return len(_keepOnly) > 0
|
||||
}
|
||||
|
||||
func CLIToBeKeptAttributes() []string {
|
||||
if _obipairing {
|
||||
_keepOnly = append(_keepOnly, _softAttributes["obipairing"]...)
|
||||
}
|
||||
|
||||
if i := goutils.LookFor(_keepOnly, "count"); i >= 0 {
|
||||
_keepOnly = goutils.RemoveIndex(_keepOnly, i)
|
||||
_outputCount = true
|
||||
}
|
||||
|
||||
if i := goutils.LookFor(_keepOnly, "taxid"); i >= 0 {
|
||||
_keepOnly = goutils.RemoveIndex(_keepOnly, i)
|
||||
_outputTaxon = true
|
||||
}
|
||||
|
||||
if i := goutils.LookFor(_keepOnly, "scientific_name"); i >= 0 {
|
||||
_keepOnly = goutils.RemoveIndex(_keepOnly, i)
|
||||
_outputTaxon = true
|
||||
}
|
||||
|
||||
return _keepOnly
|
||||
}
|
||||
|
||||
func CLINAValue() string {
|
||||
return _naValue
|
||||
}
|
@ -31,7 +31,6 @@ func DistributeSequence(sequences obiiter.IBioSequence) {
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers),
|
||||
obiformats.OptionsBufferSize(obioptions.CLIBufferSize()),
|
||||
obiformats.OptionsBatchSize(obioptions.CLIBatchSize()),
|
||||
obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()),
|
||||
obiformats.OptionsAppendFile(CLIAppendSequences()),
|
||||
|
@ -39,7 +39,6 @@ func CLIFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
newIter = iterator.FilterOn(predicate,
|
||||
obioptions.CLIBatchSize(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
obioptions.CLIBufferSize(),
|
||||
)
|
||||
}
|
||||
} else {
|
||||
|
@ -20,7 +20,6 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error
|
||||
obingslibrary.OptionDiscardErrors(!CLIConservedErrors()),
|
||||
obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()),
|
||||
obingslibrary.OptionBatchSize(obioptions.CLIBatchSize()),
|
||||
obingslibrary.OptionBufferSize(obioptions.CLIBufferSize()),
|
||||
)
|
||||
|
||||
ngsfilter, err := CLINGSFIlter()
|
||||
|
@ -211,17 +211,13 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
}
|
||||
|
||||
nworkers := obioptions.CLIMaxCPU() * 3 / 2
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
newIter := obiiter.MakeIBioSequence(buffsize)
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
newIter.Add(nworkers)
|
||||
|
||||
|
@ -51,8 +51,6 @@ func Unique(sequences obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
options = append(options,
|
||||
obichunk.OptionsParallelWorkers(
|
||||
obioptions.CLIParallelWorkers()),
|
||||
obichunk.OptionsBufferSize(
|
||||
obioptions.CLIBufferSize()),
|
||||
obichunk.OptionsBatchSize(
|
||||
obioptions.CLIBatchSize()),
|
||||
obichunk.OptionNAValue(CLINAValue()),
|
||||
|
Reference in New Issue
Block a user