First attempt at obiconsensus... The graph traversal algorithm is too simple

Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
This commit is contained in:
2023-03-27 19:51:10 +07:00
parent d5e84ec676
commit a33e471b39
17 changed files with 868 additions and 23 deletions

View File

@ -16,7 +16,7 @@ func ReadSequencesBatchFromFiles(filenames []string,
reader = ReadSequencesFromFile
}
batchiter := obiiter.MakeIBioSequence(0)
batchiter := obiiter.MakeIBioSequence()
nextCounter := obiutils.AtomicCounter()
batchiter.Add(concurrent_readers)
@ -48,6 +48,8 @@ func ReadSequencesBatchFromFiles(filenames []string,
log.Printf("Start reading of file : %s", filename)
for iter.Next() {
batch := iter.Get()
batchiter.Push(batch.Reorder(nextCounter()))

View File

@ -7,6 +7,7 @@ import (
"fmt"
"io"
"os"
"path"
"strconv"
"strings"
@ -14,6 +15,7 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
type __ecopcr_file__ struct {
@ -177,6 +179,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
go func() {
seq, err := __read_ecopcr_bioseq__(&ecopcr)
seq.SetSource(opt.Source())
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
i := 0
ii := 0
@ -191,6 +194,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
}
seq, err = __read_ecopcr_bioseq__(&ecopcr)
seq.SetSource(opt.Source())
}
if len(slice) > 0 {
@ -205,14 +209,20 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
}()
if opt.pointer.full_file_batch {
newIter = newIter.FullFileIterator()
}
return newIter
}
func ReadEcoPCRBatchFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
var reader io.Reader
var greader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = os.Open(filename)
if err != nil {
log.Printf("open file error: %+v", err)

View File

@ -5,6 +5,7 @@ import (
"bytes"
"io"
"os"
"path"
"strconv"
"strings"
@ -14,6 +15,7 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
var _FileChunkSize = 1 << 26
@ -95,7 +97,7 @@ func _EndOfLastEntry(buff []byte) int {
return -1
}
func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
for chunks := range input {
scanner := bufio.NewScanner(chunks.raw)
@ -141,7 +143,8 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
sequence := obiseq.NewBioSequence(id,
bytes.ToLower(seqBytes.Bytes()),
defBytes.String())
sequence.SetSource(source)
sequence.SetFeatures(featBytes.Bytes())
annot := sequence.Annotations()
@ -257,11 +260,15 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ {
go _ParseEmblFile(entry_channel, newIter)
go _ParseEmblFile(opt.Source(),entry_channel, newIter)
}
go _ReadFlatFileChunk(reader, entry_channel)
if opt.pointer.full_file_batch {
newIter = newIter.FullFileIterator()
}
return newIter
}
@ -270,6 +277,8 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
var greader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = os.Open(filename)
if err != nil {
log.Printf("open file error: %+v", err)

View File

@ -10,15 +10,18 @@ import (
"bytes"
"fmt"
"os"
"path"
"unsafe"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
func _FastseqReader(seqfile C.fast_kseq_p,
func _FastseqReader(source string,
seqfile C.fast_kseq_p,
iterator obiiter.IBioSequence,
batch_size int) {
var comment string
@ -40,7 +43,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
}
rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
rep.SetSource(source)
if s.qual.l > C.ulong(0) {
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
l := int(s.qual.l)
@ -81,6 +84,9 @@ func _FastseqReader(seqfile C.fast_kseq_p,
}
func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
opt := MakeOptions(options)
name := C.CString(filename)
@ -108,14 +114,20 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
newIter := obiiter.MakeIBioSequence()
newIter.Add(1)
go func() {
newIter.WaitAndClose()
log.Debugln("End of the fastq file reading")
}()
go func(iter obiiter.IBioSequence) {
iter.WaitAndClose()
log.Debugln("End of the fastx file reading")
}(newIter)
log.Debugln("Start of the fastq file reading")
log.Debugln("Start of the fastx file reading")
go _FastseqReader(opt.Source(), pointer, newIter, opt.BatchSize())
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
if opt.FullFileBatch() {
newIter = newIter.FullFileIterator()
}
go _FastseqReader(pointer, newIter, opt.BatchSize())
parser := opt.ParseFastSeqHeader()
if parser != nil {
@ -126,18 +138,27 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
}
func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
options = append(options, OptionsSource("stdin"))
opt := MakeOptions(options)
newIter := obiiter.MakeIBioSequence()
newIter.Add(1)
go func() {
newIter.WaitAndClose()
}()
go func(iter obiiter.IBioSequence) {
iter.WaitAndClose()
}(newIter)
go _FastseqReader(C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
go _FastseqReader(opt.Source(),
C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
newIter, opt.BatchSize())
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
if opt.FullFileBatch() {
newIter = newIter.FullFileIterator()
}
parser := opt.ParseFastSeqHeader()
if parser != nil {

View File

@ -5,6 +5,7 @@ import (
"bytes"
"io"
"os"
"path"
"strconv"
"strings"
@ -14,6 +15,7 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
type gbstate int
@ -26,7 +28,8 @@ const (
inSequence gbstate = 4
)
func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
func _ParseGenbankFile(source string,
input <-chan _FileChunk, out obiiter.IBioSequence) {
state := inHeader
@ -68,6 +71,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
sequence := obiseq.NewBioSequence(id,
bytes.ToLower(seqBytes.Bytes()),
defBytes.String())
sequence.SetSource(source)
state = inHeader
sequence.SetFeatures(featBytes.Bytes())
@ -129,11 +133,15 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
// for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ {
go _ParseGenbankFile(entry_channel, newIter)
go _ParseGenbankFile(opt.Source(),entry_channel, newIter)
}
go _ReadFlatFileChunk(reader, entry_channel)
if opt.pointer.full_file_batch {
newIter = newIter.FullFileIterator()
}
return newIter
}
@ -142,6 +150,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
var greader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
reader, err = os.Open(filename)
if err != nil {
log.Printf("open file error: %+v", err)

View File

@ -10,6 +10,7 @@ type __options__ struct {
with_progress_bar bool
buffer_size int
batch_size int
full_file_batch bool
quality_shift int
parallel_workers int
closefile bool
@ -25,6 +26,7 @@ type __options__ struct {
csv_separator string
csv_navalue string
paired_filename string
source string
}
type Options struct {
@ -42,6 +44,7 @@ func MakeOptions(setters []WithOption) Options {
quality_shift: 33,
parallel_workers: 4,
batch_size: 5000,
full_file_batch: false,
closefile: false,
appendfile: false,
compressed: false,
@ -52,9 +55,10 @@ func MakeOptions(setters []WithOption) Options {
csv_sequence: true,
csv_quality: false,
csv_separator: ",",
csv_navalue: "NA",
csv_navalue: "NA",
csv_keys: make([]string, 0),
paired_filename: "",
source: "",
}
opt := Options{&o}
@ -74,6 +78,10 @@ func (opt Options) BatchSize() int {
return opt.pointer.batch_size
}
// FullFileBatch returns the current value of the full_file_batch option.
// MakeOptions initialises the option to false; OptionsFullFileBatch changes it.
func (opt Options) FullFileBatch() bool {
return opt.pointer.full_file_batch
}
// ParallelWorkers returns the current value of the parallel_workers option.
// MakeOptions initialises the option to 4.
func (opt Options) ParallelWorkers() int {
return opt.pointer.parallel_workers
}
@ -146,6 +154,14 @@ func (opt Options) PairedFileName() string {
return opt.pointer.paired_filename
}
// HasSource reports whether the source option holds a non-empty string.
func (opt Options) HasSource() bool {
	return len(opt.pointer.source) > 0
}
// Source returns the current value of the source option.
// MakeOptions initialises it to the empty string; OptionsSource changes it.
func (opt Options) Source() string {
return opt.pointer.source
}
func OptionCloseFile() WithOption {
f := WithOption(func(opt Options) {
opt.pointer.closefile = true
@ -253,6 +269,22 @@ func OptionsBatchSize(size int) WithOption {
return f
}
// OptionsFullFileBatch builds a WithOption that stores full in the
// full_file_batch field of the options it is applied to.
func OptionsFullFileBatch(full bool) WithOption {
	return func(opt Options) {
		opt.pointer.full_file_batch = full
	}
}
// OptionsSource builds a WithOption that stores source in the source field
// of the options it is applied to.
func OptionsSource(source string) WithOption {
	return func(opt Options) {
		opt.pointer.source = source
	}
}
func OptionsWithProgressBar() WithOption {
f := WithOption(func(opt Options) {
opt.pointer.with_progress_bar = true

View File

@ -5,11 +5,13 @@ import (
"compress/gzip"
"io"
"os"
"path"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
func GuessSeqFileType(firstline string) string {
@ -49,6 +51,8 @@ func ReadSequencesFromFile(filename string,
var greader io.Reader
var err error
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
file, err = os.Open(filename)
if err != nil {