mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
First attempt for obiconsensus... The graph traversing algorithm is too simple
Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
This commit is contained in:
33
cmd/obitools/obiconsensus/main.go
Normal file
33
cmd/obitools/obiconsensus/main.go
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconsensus"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
optionParser := obioptions.GenerateOptionParser(obiconvert.OptionSet)
|
||||||
|
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
obiconvert.SetFullFileBatch()
|
||||||
|
|
||||||
|
fs, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Cannot open file (%v)", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
consensus := obiconsensus.Consensus(fs, 0.99)
|
||||||
|
obiconvert.CLIWriteBioSequences(consensus, true)
|
||||||
|
|
||||||
|
obiiter.WaitForLastPipe()
|
||||||
|
|
||||||
|
}
|
@ -16,7 +16,7 @@ func ReadSequencesBatchFromFiles(filenames []string,
|
|||||||
reader = ReadSequencesFromFile
|
reader = ReadSequencesFromFile
|
||||||
}
|
}
|
||||||
|
|
||||||
batchiter := obiiter.MakeIBioSequence(0)
|
batchiter := obiiter.MakeIBioSequence()
|
||||||
nextCounter := obiutils.AtomicCounter()
|
nextCounter := obiutils.AtomicCounter()
|
||||||
|
|
||||||
batchiter.Add(concurrent_readers)
|
batchiter.Add(concurrent_readers)
|
||||||
@ -48,6 +48,8 @@ func ReadSequencesBatchFromFiles(filenames []string,
|
|||||||
|
|
||||||
log.Printf("Start reading of file : %s", filename)
|
log.Printf("Start reading of file : %s", filename)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for iter.Next() {
|
for iter.Next() {
|
||||||
batch := iter.Get()
|
batch := iter.Get()
|
||||||
batchiter.Push(batch.Reorder(nextCounter()))
|
batchiter.Push(batch.Reorder(nextCounter()))
|
||||||
|
@ -7,6 +7,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ import (
|
|||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
type __ecopcr_file__ struct {
|
type __ecopcr_file__ struct {
|
||||||
@ -177,6 +179,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
go func() {
|
go func() {
|
||||||
|
|
||||||
seq, err := __read_ecopcr_bioseq__(&ecopcr)
|
seq, err := __read_ecopcr_bioseq__(&ecopcr)
|
||||||
|
seq.SetSource(opt.Source())
|
||||||
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
|
slice := make(obiseq.BioSequenceSlice, 0, opt.BatchSize())
|
||||||
i := 0
|
i := 0
|
||||||
ii := 0
|
ii := 0
|
||||||
@ -191,6 +194,7 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
seq, err = __read_ecopcr_bioseq__(&ecopcr)
|
seq, err = __read_ecopcr_bioseq__(&ecopcr)
|
||||||
|
seq.SetSource(opt.Source())
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(slice) > 0 {
|
if len(slice) > 0 {
|
||||||
@ -205,14 +209,20 @@ func ReadEcoPCR(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if opt.pointer.full_file_batch {
|
||||||
|
newIter = newIter.FullFileIterator()
|
||||||
|
}
|
||||||
|
|
||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
func ReadEcoPCRBatchFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadEcoPCRFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
var reader io.Reader
|
var reader io.Reader
|
||||||
var greader io.Reader
|
var greader io.Reader
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
reader, err = os.Open(filename)
|
reader, err = os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("open file error: %+v", err)
|
log.Printf("open file error: %+v", err)
|
||||||
|
@ -5,6 +5,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ import (
|
|||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _FileChunkSize = 1 << 26
|
var _FileChunkSize = 1 << 26
|
||||||
@ -95,7 +97,7 @@ func _EndOfLastEntry(buff []byte) int {
|
|||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
|
|
||||||
func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
|
||||||
|
|
||||||
for chunks := range input {
|
for chunks := range input {
|
||||||
scanner := bufio.NewScanner(chunks.raw)
|
scanner := bufio.NewScanner(chunks.raw)
|
||||||
@ -141,6 +143,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
|||||||
sequence := obiseq.NewBioSequence(id,
|
sequence := obiseq.NewBioSequence(id,
|
||||||
bytes.ToLower(seqBytes.Bytes()),
|
bytes.ToLower(seqBytes.Bytes()),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
|
|
||||||
sequence.SetFeatures(featBytes.Bytes())
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
|
|
||||||
@ -257,11 +260,15 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
go _ParseEmblFile(entry_channel, newIter)
|
go _ParseEmblFile(opt.Source(),entry_channel, newIter)
|
||||||
}
|
}
|
||||||
|
|
||||||
go _ReadFlatFileChunk(reader, entry_channel)
|
go _ReadFlatFileChunk(reader, entry_channel)
|
||||||
|
|
||||||
|
if opt.pointer.full_file_batch {
|
||||||
|
newIter = newIter.FullFileIterator()
|
||||||
|
}
|
||||||
|
|
||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,6 +277,8 @@ func ReadEMBLFromFile(filename string, options ...WithOption) (obiiter.IBioSeque
|
|||||||
var greader io.Reader
|
var greader io.Reader
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
reader, err = os.Open(filename)
|
reader, err = os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("open file error: %+v", err)
|
log.Printf("open file error: %+v", err)
|
||||||
|
@ -10,15 +10,18 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"unsafe"
|
"unsafe"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
func _FastseqReader(seqfile C.fast_kseq_p,
|
func _FastseqReader(source string,
|
||||||
|
seqfile C.fast_kseq_p,
|
||||||
iterator obiiter.IBioSequence,
|
iterator obiiter.IBioSequence,
|
||||||
batch_size int) {
|
batch_size int) {
|
||||||
var comment string
|
var comment string
|
||||||
@ -40,7 +43,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
|
|||||||
}
|
}
|
||||||
|
|
||||||
rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
|
rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
|
||||||
|
rep.SetSource(source)
|
||||||
if s.qual.l > C.ulong(0) {
|
if s.qual.l > C.ulong(0) {
|
||||||
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
|
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
|
||||||
l := int(s.qual.l)
|
l := int(s.qual.l)
|
||||||
@ -81,6 +84,9 @@ func _FastseqReader(seqfile C.fast_kseq_p,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
|
|
||||||
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
|
|
||||||
name := C.CString(filename)
|
name := C.CString(filename)
|
||||||
@ -108,14 +114,20 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
|||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
|
|
||||||
go func() {
|
go func(iter obiiter.IBioSequence) {
|
||||||
newIter.WaitAndClose()
|
iter.WaitAndClose()
|
||||||
log.Debugln("End of the fastq file reading")
|
log.Debugln("End of the fastx file reading")
|
||||||
}()
|
}(newIter)
|
||||||
|
|
||||||
log.Debugln("Start of the fastq file reading")
|
log.Debugln("Start of the fastx file reading")
|
||||||
|
|
||||||
|
go _FastseqReader(opt.Source(), pointer, newIter, opt.BatchSize())
|
||||||
|
|
||||||
|
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
|
||||||
|
if opt.FullFileBatch() {
|
||||||
|
newIter = newIter.FullFileIterator()
|
||||||
|
}
|
||||||
|
|
||||||
go _FastseqReader(pointer, newIter, opt.BatchSize())
|
|
||||||
parser := opt.ParseFastSeqHeader()
|
parser := opt.ParseFastSeqHeader()
|
||||||
|
|
||||||
if parser != nil {
|
if parser != nil {
|
||||||
@ -126,18 +138,27 @@ func ReadFastSeqFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
|
func ReadFastSeqFromStdin(options ...WithOption) obiiter.IBioSequence {
|
||||||
|
|
||||||
|
options = append(options, OptionsSource("stdin"))
|
||||||
|
|
||||||
opt := MakeOptions(options)
|
opt := MakeOptions(options)
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
|
|
||||||
go func() {
|
go func(iter obiiter.IBioSequence) {
|
||||||
newIter.WaitAndClose()
|
iter.WaitAndClose()
|
||||||
}()
|
}(newIter)
|
||||||
|
|
||||||
go _FastseqReader(C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
|
go _FastseqReader(opt.Source(),
|
||||||
|
C.open_fast_sek_stdin(C.int32_t(opt.QualityShift())),
|
||||||
newIter, opt.BatchSize())
|
newIter, opt.BatchSize())
|
||||||
|
|
||||||
|
log.Debugln("Full file batch mode : ", opt.FullFileBatch())
|
||||||
|
if opt.FullFileBatch() {
|
||||||
|
newIter = newIter.FullFileIterator()
|
||||||
|
}
|
||||||
|
|
||||||
parser := opt.ParseFastSeqHeader()
|
parser := opt.ParseFastSeqHeader()
|
||||||
|
|
||||||
if parser != nil {
|
if parser != nil {
|
||||||
|
@ -5,6 +5,7 @@ import (
|
|||||||
"bytes"
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ import (
|
|||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
type gbstate int
|
type gbstate int
|
||||||
@ -26,7 +28,8 @@ const (
|
|||||||
inSequence gbstate = 4
|
inSequence gbstate = 4
|
||||||
)
|
)
|
||||||
|
|
||||||
func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
func _ParseGenbankFile(source string,
|
||||||
|
input <-chan _FileChunk, out obiiter.IBioSequence) {
|
||||||
|
|
||||||
state := inHeader
|
state := inHeader
|
||||||
|
|
||||||
@ -68,6 +71,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequence) {
|
|||||||
sequence := obiseq.NewBioSequence(id,
|
sequence := obiseq.NewBioSequence(id,
|
||||||
bytes.ToLower(seqBytes.Bytes()),
|
bytes.ToLower(seqBytes.Bytes()),
|
||||||
defBytes.String())
|
defBytes.String())
|
||||||
|
sequence.SetSource(source)
|
||||||
state = inHeader
|
state = inHeader
|
||||||
|
|
||||||
sequence.SetFeatures(featBytes.Bytes())
|
sequence.SetFeatures(featBytes.Bytes())
|
||||||
@ -129,11 +133,15 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
|
|||||||
|
|
||||||
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
// for j := 0; j < opt.ParallelWorkers(); j++ {
|
||||||
for j := 0; j < nworkers; j++ {
|
for j := 0; j < nworkers; j++ {
|
||||||
go _ParseGenbankFile(entry_channel, newIter)
|
go _ParseGenbankFile(opt.Source(),entry_channel, newIter)
|
||||||
}
|
}
|
||||||
|
|
||||||
go _ReadFlatFileChunk(reader, entry_channel)
|
go _ReadFlatFileChunk(reader, entry_channel)
|
||||||
|
|
||||||
|
if opt.pointer.full_file_batch {
|
||||||
|
newIter = newIter.FullFileIterator()
|
||||||
|
}
|
||||||
|
|
||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,6 +150,9 @@ func ReadGenbankFromFile(filename string, options ...WithOption) (obiiter.IBioSe
|
|||||||
var greader io.Reader
|
var greader io.Reader
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
|
|
||||||
reader, err = os.Open(filename)
|
reader, err = os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("open file error: %+v", err)
|
log.Printf("open file error: %+v", err)
|
||||||
|
@ -10,6 +10,7 @@ type __options__ struct {
|
|||||||
with_progress_bar bool
|
with_progress_bar bool
|
||||||
buffer_size int
|
buffer_size int
|
||||||
batch_size int
|
batch_size int
|
||||||
|
full_file_batch bool
|
||||||
quality_shift int
|
quality_shift int
|
||||||
parallel_workers int
|
parallel_workers int
|
||||||
closefile bool
|
closefile bool
|
||||||
@ -25,6 +26,7 @@ type __options__ struct {
|
|||||||
csv_separator string
|
csv_separator string
|
||||||
csv_navalue string
|
csv_navalue string
|
||||||
paired_filename string
|
paired_filename string
|
||||||
|
source string
|
||||||
}
|
}
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
@ -42,6 +44,7 @@ func MakeOptions(setters []WithOption) Options {
|
|||||||
quality_shift: 33,
|
quality_shift: 33,
|
||||||
parallel_workers: 4,
|
parallel_workers: 4,
|
||||||
batch_size: 5000,
|
batch_size: 5000,
|
||||||
|
full_file_batch: false,
|
||||||
closefile: false,
|
closefile: false,
|
||||||
appendfile: false,
|
appendfile: false,
|
||||||
compressed: false,
|
compressed: false,
|
||||||
@ -52,9 +55,10 @@ func MakeOptions(setters []WithOption) Options {
|
|||||||
csv_sequence: true,
|
csv_sequence: true,
|
||||||
csv_quality: false,
|
csv_quality: false,
|
||||||
csv_separator: ",",
|
csv_separator: ",",
|
||||||
csv_navalue: "NA",
|
csv_navalue: "NA",
|
||||||
csv_keys: make([]string, 0),
|
csv_keys: make([]string, 0),
|
||||||
paired_filename: "",
|
paired_filename: "",
|
||||||
|
source: "",
|
||||||
}
|
}
|
||||||
|
|
||||||
opt := Options{&o}
|
opt := Options{&o}
|
||||||
@ -74,6 +78,10 @@ func (opt Options) BatchSize() int {
|
|||||||
return opt.pointer.batch_size
|
return opt.pointer.batch_size
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (opt Options) FullFileBatch() bool {
|
||||||
|
return opt.pointer.full_file_batch
|
||||||
|
}
|
||||||
|
|
||||||
func (opt Options) ParallelWorkers() int {
|
func (opt Options) ParallelWorkers() int {
|
||||||
return opt.pointer.parallel_workers
|
return opt.pointer.parallel_workers
|
||||||
}
|
}
|
||||||
@ -146,6 +154,14 @@ func (opt Options) PairedFileName() string {
|
|||||||
return opt.pointer.paired_filename
|
return opt.pointer.paired_filename
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (opt Options) HasSource() bool {
|
||||||
|
return opt.pointer.source != ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func (opt Options) Source() string {
|
||||||
|
return opt.pointer.source
|
||||||
|
}
|
||||||
|
|
||||||
func OptionCloseFile() WithOption {
|
func OptionCloseFile() WithOption {
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
opt.pointer.closefile = true
|
opt.pointer.closefile = true
|
||||||
@ -253,6 +269,22 @@ func OptionsBatchSize(size int) WithOption {
|
|||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func OptionsFullFileBatch(full bool) WithOption {
|
||||||
|
f := WithOption(func(opt Options) {
|
||||||
|
opt.pointer.full_file_batch = full
|
||||||
|
})
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
|
func OptionsSource(source string) WithOption {
|
||||||
|
f := WithOption(func(opt Options) {
|
||||||
|
opt.pointer.source = source
|
||||||
|
})
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
func OptionsWithProgressBar() WithOption {
|
func OptionsWithProgressBar() WithOption {
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
opt.pointer.with_progress_bar = true
|
opt.pointer.with_progress_bar = true
|
||||||
|
@ -5,11 +5,13 @@ import (
|
|||||||
"compress/gzip"
|
"compress/gzip"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"path"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GuessSeqFileType(firstline string) string {
|
func GuessSeqFileType(firstline string) string {
|
||||||
@ -49,6 +51,8 @@ func ReadSequencesFromFile(filename string,
|
|||||||
var greader io.Reader
|
var greader io.Reader
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
options = append(options, OptionsSource(obiutils.RemoveAllExt((path.Base(filename)))))
|
||||||
|
|
||||||
file, err = os.Open(filename)
|
file, err = os.Open(filename)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -59,7 +59,7 @@ type IBioSequence struct {
|
|||||||
// IBioSequenceBatch type.
|
// IBioSequenceBatch type.
|
||||||
var NilIBioSequence = IBioSequence{pointer: nil}
|
var NilIBioSequence = IBioSequence{pointer: nil}
|
||||||
|
|
||||||
func MakeIBioSequence(sizes ...int) IBioSequence {
|
func MakeIBioSequence() IBioSequence {
|
||||||
|
|
||||||
i := _IBioSequence{
|
i := _IBioSequence{
|
||||||
channel: make(chan BioSequenceBatch),
|
channel: make(chan BioSequenceBatch),
|
||||||
@ -409,7 +409,7 @@ func (iterator IBioSequence) Pool(iterators ...IBioSequence) IBioSequence {
|
|||||||
// IBioSequenceBatch with every batches having the same size
|
// IBioSequenceBatch with every batches having the same size
|
||||||
// indicated in parameter. Rebatching implies to sort the
|
// indicated in parameter. Rebatching implies to sort the
|
||||||
// source IBioSequenceBatch.
|
// source IBioSequenceBatch.
|
||||||
func (iterator IBioSequence) Rebatch(size int, sizes ...int) IBioSequence {
|
func (iterator IBioSequence) Rebatch(size int) IBioSequence {
|
||||||
|
|
||||||
newIter := MakeIBioSequence()
|
newIter := MakeIBioSequence()
|
||||||
|
|
||||||
@ -686,6 +686,7 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
|
|||||||
chunck := obiseq.MakeBioSequenceSlice()
|
chunck := obiseq.MakeBioSequenceSlice()
|
||||||
for iterator.Next() {
|
for iterator.Next() {
|
||||||
b := iterator.Get()
|
b := iterator.Get()
|
||||||
|
log.Debugf("append %d sequences",b.Len())
|
||||||
chunck = append(chunck, b.Slice()...)
|
chunck = append(chunck, b.Slice()...)
|
||||||
b.Recycle()
|
b.Recycle()
|
||||||
}
|
}
|
||||||
@ -693,6 +694,33 @@ func (iterator IBioSequence) Load() obiseq.BioSequenceSlice {
|
|||||||
return chunck
|
return chunck
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (iterator IBioSequence) FullFileIterator() IBioSequence {
|
||||||
|
|
||||||
|
newIter := MakeIBioSequence()
|
||||||
|
log.Debug("Stream is read in full file mode")
|
||||||
|
|
||||||
|
newIter.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
newIter.WaitAndClose()
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
slice := iterator.Load()
|
||||||
|
log.Printf("A batch of %d sequence is read",len(slice))
|
||||||
|
if len(slice) > 0 {
|
||||||
|
newIter.Push(MakeBioSequenceBatch(0, slice))
|
||||||
|
}
|
||||||
|
newIter.Done()
|
||||||
|
}()
|
||||||
|
|
||||||
|
if iterator.IsPaired() {
|
||||||
|
newIter.MarkAsPaired()
|
||||||
|
}
|
||||||
|
|
||||||
|
return newIter
|
||||||
|
}
|
||||||
|
|
||||||
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of
|
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of
|
||||||
// BioSequence objects
|
// BioSequence objects
|
||||||
|
|
||||||
|
376
pkg/obikmer/debruijn.go
Normal file
376
pkg/obikmer/debruijn.go
Normal file
@ -0,0 +1,376 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
type KmerIdx32 uint32
|
||||||
|
type KmerIdx64 uint64
|
||||||
|
type KmerIdx128 struct {
|
||||||
|
Lo uint64
|
||||||
|
Hi uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
var iupac = map[byte][]uint64{
|
||||||
|
'a': {0},
|
||||||
|
'c': {1},
|
||||||
|
'g': {2},
|
||||||
|
't': {3},
|
||||||
|
'u': {3},
|
||||||
|
'r': {0, 2},
|
||||||
|
'y': {1, 3},
|
||||||
|
's': {1, 2},
|
||||||
|
'w': {0, 3},
|
||||||
|
'k': {2, 3},
|
||||||
|
'm': {0, 1},
|
||||||
|
'b': {1, 2, 3},
|
||||||
|
'd': {0, 2, 3},
|
||||||
|
'h': {0, 1, 3},
|
||||||
|
'v': {0, 1, 2},
|
||||||
|
'n': {0, 1, 2, 3},
|
||||||
|
}
|
||||||
|
|
||||||
|
var decode = map[uint64]byte{
|
||||||
|
0: 'a',
|
||||||
|
1: 'c',
|
||||||
|
2: 'g',
|
||||||
|
3: 't',
|
||||||
|
}
|
||||||
|
|
||||||
|
type KmerIdx_t interface {
|
||||||
|
KmerIdx32 | KmerIdx64 | KmerIdx128
|
||||||
|
}
|
||||||
|
|
||||||
|
type DeBruijnGraph struct {
|
||||||
|
kmersize int
|
||||||
|
kmermask uint64
|
||||||
|
prevc uint64
|
||||||
|
prevg uint64
|
||||||
|
prevt uint64
|
||||||
|
graph map[uint64]uint
|
||||||
|
}
|
||||||
|
|
||||||
|
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
|
||||||
|
g := DeBruijnGraph{
|
||||||
|
kmersize: kmersize,
|
||||||
|
kmermask: ^(^uint64(0) << (uint64(kmersize+1) * 2)),
|
||||||
|
prevc: uint64(1) << (uint64(kmersize) * 2),
|
||||||
|
prevg: uint64(2) << (uint64(kmersize) * 2),
|
||||||
|
prevt: uint64(3) << (uint64(kmersize) * 2),
|
||||||
|
graph: make(map[uint64]uint),
|
||||||
|
}
|
||||||
|
|
||||||
|
return &g
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) KmerSize() int {
|
||||||
|
return g.kmersize
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) Len() int {
|
||||||
|
return len(g.graph)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) MaxLink() int {
|
||||||
|
max := uint(0)
|
||||||
|
for _, count := range g.graph {
|
||||||
|
if count > max {
|
||||||
|
max = count
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return int(max)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) LinkSpectrum() []int {
|
||||||
|
max := g.MaxLink()
|
||||||
|
spectrum := make([]int, max+1)
|
||||||
|
for _, count := range g.graph {
|
||||||
|
spectrum[int(count)]++
|
||||||
|
}
|
||||||
|
|
||||||
|
return spectrum
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) FilterMin(min int) {
|
||||||
|
umin := uint(min)
|
||||||
|
for idx, count := range g.graph {
|
||||||
|
if count < umin {
|
||||||
|
delete(g.graph, idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
||||||
|
rep := make([]uint64, 0, 4)
|
||||||
|
index = index >> 2
|
||||||
|
|
||||||
|
if _, ok := g.graph[index]; ok {
|
||||||
|
rep = append(rep, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
key := index | g.prevc
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
key = index | g.prevg
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
key = index | g.prevt
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rep
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
||||||
|
rep := make([]uint64, 0, 4)
|
||||||
|
index = index << 2 & g.kmermask
|
||||||
|
|
||||||
|
if _, ok := g.graph[index]; ok {
|
||||||
|
rep = append(rep, index)
|
||||||
|
}
|
||||||
|
|
||||||
|
key := index | 1
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
key = index | 2
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
key = index | 3
|
||||||
|
if _, ok := g.graph[key]; ok {
|
||||||
|
rep = append(rep, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
return rep
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) MaxNext(index uint64) (uint64, bool) {
|
||||||
|
ns := g.Nexts(index)
|
||||||
|
|
||||||
|
if len(ns) == 0 {
|
||||||
|
return uint64(0), false
|
||||||
|
}
|
||||||
|
|
||||||
|
max := uint(0)
|
||||||
|
rep := uint64(0)
|
||||||
|
for _, idx := range ns {
|
||||||
|
w, _ := g.graph[idx]
|
||||||
|
if w > max {
|
||||||
|
rep = idx
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rep, true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) MaxPath() []uint64 {
|
||||||
|
path := make([]uint64, 0, 1000)
|
||||||
|
ok := false
|
||||||
|
idx := uint64(0)
|
||||||
|
|
||||||
|
idx, ok = g.MaxHead()
|
||||||
|
|
||||||
|
for ok {
|
||||||
|
path = append(path, idx)
|
||||||
|
idx, ok = g.MaxNext(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) LongestPath() []uint64 {
|
||||||
|
var path []uint64
|
||||||
|
wmax := uint(0)
|
||||||
|
ok := true
|
||||||
|
|
||||||
|
starts:= g.Heads()
|
||||||
|
|
||||||
|
for _,idx := range starts {
|
||||||
|
lp := make([]uint64, 0, 1000)
|
||||||
|
w := uint(0)
|
||||||
|
for ok {
|
||||||
|
nw:= g.graph[idx]
|
||||||
|
w+=nw
|
||||||
|
lp = append(lp, idx)
|
||||||
|
idx, ok = g.MaxNext(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
if w > wmax {
|
||||||
|
path=lp
|
||||||
|
wmax=w
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence,error) {
|
||||||
|
path := g.LongestPath()
|
||||||
|
s := g.DecodePath(path)
|
||||||
|
|
||||||
|
if len(s) > 0 {
|
||||||
|
seq := obiseq.MakeBioSequence(
|
||||||
|
id,
|
||||||
|
[]byte(s),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
|
return &seq,nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil,fmt.Errorf("cannot identify optimum path")
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) Heads() []uint64 {
|
||||||
|
rep := make([]uint64, 0, 10)
|
||||||
|
|
||||||
|
for k := range g.graph {
|
||||||
|
if len(g.Previouses(k)) == 0 {
|
||||||
|
rep = append(rep, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rep
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) MaxHead() (uint64, bool) {
|
||||||
|
rep := uint64(0)
|
||||||
|
max := uint(0)
|
||||||
|
found := false
|
||||||
|
for k, w := range g.graph {
|
||||||
|
if len(g.Previouses(k)) == 0 && w > max {
|
||||||
|
rep = k
|
||||||
|
found = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rep, found
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) DecodeNode(index uint64) string {
|
||||||
|
rep := make([]byte, g.kmersize)
|
||||||
|
index >>= 2
|
||||||
|
for i := g.kmersize - 1; i >= 0; i-- {
|
||||||
|
rep[i], _ = decode[index&3]
|
||||||
|
index >>= 2
|
||||||
|
}
|
||||||
|
|
||||||
|
return string(rep)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) DecodePath(path []uint64) string {
|
||||||
|
rep := make([]byte, 0, len(path)+g.kmersize)
|
||||||
|
buf := bytes.NewBuffer(rep)
|
||||||
|
|
||||||
|
if len(path) > 0 {
|
||||||
|
buf.WriteString(g.DecodeNode(path[0]))
|
||||||
|
|
||||||
|
for _, idx := range path[1:] {
|
||||||
|
buf.WriteByte(decode[idx&3])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return buf.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence,error) {
|
||||||
|
path := g.MaxPath()
|
||||||
|
s := g.DecodePath(path)
|
||||||
|
|
||||||
|
if len(s) > 0 {
|
||||||
|
seq := obiseq.MakeBioSequence(
|
||||||
|
id,
|
||||||
|
[]byte(s),
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
|
return &seq,nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil,fmt.Errorf("cannot identify optimum path")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (g *DeBruijnGraph) Weight(index uint64) int {
|
||||||
|
val, ok := g.graph[index]
|
||||||
|
if !ok {
|
||||||
|
val = 0
|
||||||
|
}
|
||||||
|
return int(val)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (graph *DeBruijnGraph) append(sequence []byte, current uint64) {
|
||||||
|
|
||||||
|
for i := 0; i < len(sequence); i++ {
|
||||||
|
current <<= 2
|
||||||
|
current &= graph.kmermask
|
||||||
|
b := iupac[sequence[i]]
|
||||||
|
if len(b) == 1 {
|
||||||
|
current |= b[0]
|
||||||
|
|
||||||
|
weight, ok := graph.graph[current]
|
||||||
|
if !ok {
|
||||||
|
weight = 0
|
||||||
|
}
|
||||||
|
graph.graph[current] = weight + 1
|
||||||
|
} else {
|
||||||
|
for j := 0; j < len(b); j++ {
|
||||||
|
current &= ^uint64(3)
|
||||||
|
current |= b[j]
|
||||||
|
|
||||||
|
weight, ok := graph.graph[current]
|
||||||
|
if !ok {
|
||||||
|
weight = 0
|
||||||
|
}
|
||||||
|
graph.graph[current] = weight + 1
|
||||||
|
graph.append(sequence[(i+1):], current)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
||||||
|
key := uint64(0)
|
||||||
|
s := sequence.Sequence()
|
||||||
|
init := make([]uint64, 0, 16)
|
||||||
|
var f func(start int, key uint64)
|
||||||
|
f = func(start int, key uint64) {
|
||||||
|
for i := start; i < graph.kmersize; i++ {
|
||||||
|
key <<= 2
|
||||||
|
b := iupac[s[i]]
|
||||||
|
if len(b) == 1 {
|
||||||
|
key |= b[0]
|
||||||
|
} else {
|
||||||
|
for j := 0; j < len(b); j++ {
|
||||||
|
key &= ^uint64(3)
|
||||||
|
key |= b[j]
|
||||||
|
f(i+1, key)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
init = append(init, key&graph.kmermask)
|
||||||
|
}
|
||||||
|
|
||||||
|
f(0, key)
|
||||||
|
|
||||||
|
for _, idx := range init {
|
||||||
|
graph.append(s[graph.kmersize:], idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -52,6 +52,7 @@ type Annotation map[string]interface{}
|
|||||||
type BioSequence struct {
|
type BioSequence struct {
|
||||||
id string // The identidier of the sequence (private accessible through the method Id)
|
id string // The identidier of the sequence (private accessible through the method Id)
|
||||||
definition string // The documentation of the sequence (private accessible through the method Definition)
|
definition string // The documentation of the sequence (private accessible through the method Definition)
|
||||||
|
source string // The filename without directory name and extension from where the sequence was read.
|
||||||
sequence []byte // The sequence itself, it is accessible by the methode Sequence
|
sequence []byte // The sequence itself, it is accessible by the methode Sequence
|
||||||
qualities []byte // The quality scores of the sequence.
|
qualities []byte // The quality scores of the sequence.
|
||||||
feature []byte
|
feature []byte
|
||||||
@ -67,6 +68,7 @@ func MakeEmptyBioSequence() BioSequence {
|
|||||||
return BioSequence{
|
return BioSequence{
|
||||||
id: "",
|
id: "",
|
||||||
definition: "",
|
definition: "",
|
||||||
|
source: "",
|
||||||
sequence: nil,
|
sequence: nil,
|
||||||
qualities: nil,
|
qualities: nil,
|
||||||
feature: nil,
|
feature: nil,
|
||||||
@ -199,6 +201,16 @@ func (s *BioSequence) Annotations() Annotation {
|
|||||||
return s.annotations
|
return s.annotations
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checking if the BioSequence has a source.
|
||||||
|
func (s *BioSequence) HasSource() bool {
|
||||||
|
return len(s.source) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) Source() string {
|
||||||
|
return s.source
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Returning the MD5 hash of the sequence.
|
// Returning the MD5 hash of the sequence.
|
||||||
func (s *BioSequence) MD5() [16]byte {
|
func (s *BioSequence) MD5() [16]byte {
|
||||||
return md5.Sum(s.sequence)
|
return md5.Sum(s.sequence)
|
||||||
@ -214,6 +226,11 @@ func (s *BioSequence) SetDefinition(definition string) {
|
|||||||
s.definition = definition
|
s.definition = definition
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Setting the source of the sequence.
|
||||||
|
func (s *BioSequence) SetSource(source string) {
|
||||||
|
s.source = source
|
||||||
|
}
|
||||||
|
|
||||||
// Setting the features of the BioSequence.
|
// Setting the features of the BioSequence.
|
||||||
func (s *BioSequence) SetFeatures(feature []byte) {
|
func (s *BioSequence) SetFeatures(feature []byte) {
|
||||||
if cap(s.feature) >= 300 {
|
if cap(s.feature) >= 300 {
|
||||||
|
137
pkg/obisuffix/suffix_array.go
Normal file
137
pkg/obisuffix/suffix_array.go
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
package obisuffix
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Suffix struct {
|
||||||
|
Idx uint32
|
||||||
|
Pos uint32
|
||||||
|
}
|
||||||
|
|
||||||
|
type SuffixArray struct {
|
||||||
|
Sequences *obiseq.BioSequenceSlice
|
||||||
|
Suffixes []Suffix
|
||||||
|
Common []int
|
||||||
|
}
|
||||||
|
|
||||||
|
func SuffixLess(suffixarray SuffixArray) func(i, j int) bool {
|
||||||
|
less := func(i, j int) bool {
|
||||||
|
si := suffixarray.Suffixes[i]
|
||||||
|
bi := (*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:]
|
||||||
|
sj := suffixarray.Suffixes[j]
|
||||||
|
bj := (*suffixarray.Sequences)[int(sj.Idx)].Sequence()[sj.Pos:]
|
||||||
|
|
||||||
|
l := obiutils.MinInt(len(bi), len(bj))
|
||||||
|
p := 0
|
||||||
|
for p < l && bi[p] == bj[p] {
|
||||||
|
p++
|
||||||
|
}
|
||||||
|
|
||||||
|
// if p < l {
|
||||||
|
// log.Debugln(si, sj, p, l, rune(bi[p]), rune(bj[p]))
|
||||||
|
// } else {
|
||||||
|
// log.Debugln(si, sj, p, l, rune('-'), rune('-'))
|
||||||
|
// }
|
||||||
|
|
||||||
|
if p == l {
|
||||||
|
switch {
|
||||||
|
case len(bi) != len(bj):
|
||||||
|
return len(bi) < len(bj)
|
||||||
|
case si.Idx != sj.Idx:
|
||||||
|
return si.Idx < sj.Idx
|
||||||
|
default:
|
||||||
|
return si.Pos < sj.Pos
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bi[p] < bj[p]
|
||||||
|
}
|
||||||
|
|
||||||
|
return less
|
||||||
|
}
|
||||||
|
|
||||||
|
func BuildSuffixArray(data *obiseq.BioSequenceSlice) SuffixArray {
|
||||||
|
totalLength := 0
|
||||||
|
for _, s := range *data {
|
||||||
|
totalLength += s.Len()
|
||||||
|
}
|
||||||
|
|
||||||
|
sa := SuffixArray{
|
||||||
|
Sequences: data,
|
||||||
|
Suffixes: make([]Suffix, 0, totalLength),
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, s := range *data {
|
||||||
|
sl := uint32(s.Len())
|
||||||
|
for p := uint32(0); p < sl; p++ {
|
||||||
|
sa.Suffixes = append(sa.Suffixes, Suffix{uint32(i), p})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.SliceStable(sa.Suffixes, SuffixLess(sa))
|
||||||
|
return sa
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suffixarray *SuffixArray) CommonSuffix() []int {
|
||||||
|
if len(suffixarray.Common) == len(suffixarray.Suffixes) {
|
||||||
|
return suffixarray.Common
|
||||||
|
}
|
||||||
|
|
||||||
|
lrep := len(suffixarray.Suffixes)
|
||||||
|
rep := make([]int, lrep)
|
||||||
|
|
||||||
|
sp := suffixarray.Suffixes[0]
|
||||||
|
bp := (*suffixarray.Sequences)[int(sp.Idx)].Sequence()[sp.Pos:]
|
||||||
|
for i := 1; i < lrep; i++ {
|
||||||
|
si := suffixarray.Suffixes[i]
|
||||||
|
bi := (*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:]
|
||||||
|
|
||||||
|
l := obiutils.MinInt(len(bi), len(bp))
|
||||||
|
p := 0
|
||||||
|
for p < l && bi[p] == bp[p] {
|
||||||
|
p++
|
||||||
|
}
|
||||||
|
rep[i-1] = p
|
||||||
|
sp = si
|
||||||
|
bp = bi
|
||||||
|
}
|
||||||
|
|
||||||
|
rep[lrep-1] = 0
|
||||||
|
|
||||||
|
suffixarray.Common = rep
|
||||||
|
|
||||||
|
return suffixarray.Common
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suffixarray *SuffixArray) String() string {
|
||||||
|
sb := bytes.Buffer{}
|
||||||
|
|
||||||
|
common := suffixarray.CommonSuffix()
|
||||||
|
|
||||||
|
sb.WriteString("Common\tSeqIdx\tPosition\tSuffix\n")
|
||||||
|
|
||||||
|
for i := range suffixarray.Suffixes {
|
||||||
|
idx := suffixarray.Suffixes[i].Idx
|
||||||
|
pos := suffixarray.Suffixes[i].Pos
|
||||||
|
sb.WriteString(fmt.Sprintf("%6d\t%6d\t%8d\t%s\n",
|
||||||
|
common[i],
|
||||||
|
idx,
|
||||||
|
pos,
|
||||||
|
string((*suffixarray.Sequences)[idx].Sequence()[pos:]),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// func LongestInternalRepeat(suffixarray SuffixArray) []int {
|
||||||
|
// common := suffixarray.CommonSuffix()
|
||||||
|
// rep := make([]int, len(*suffixarray.Sequences))
|
||||||
|
|
||||||
|
// }
|
123
pkg/obitools/obiconsensus/obiconsensus.go
Normal file
123
pkg/obitools/obiconsensus/obiconsensus.go
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
package obiconsensus
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obisuffix"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) {
|
||||||
|
|
||||||
|
log.Printf("Number of reads : %d\n", len(seqs))
|
||||||
|
|
||||||
|
longest := make([]int, len(seqs))
|
||||||
|
|
||||||
|
for i := range seqs {
|
||||||
|
s := seqs[i : i+1]
|
||||||
|
sa := obisuffix.BuildSuffixArray(&s)
|
||||||
|
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
||||||
|
}
|
||||||
|
|
||||||
|
o := obiutils.Order(sort.IntSlice(longest))
|
||||||
|
i := int(float64(len(seqs)) * quorum)
|
||||||
|
|
||||||
|
kmersize := longest[o[i]] + 1
|
||||||
|
log.Printf("estimated kmer size : %d", kmersize)
|
||||||
|
|
||||||
|
graph := obikmer.MakeDeBruijnGraph(kmersize)
|
||||||
|
|
||||||
|
for _, s := range seqs {
|
||||||
|
graph.Push(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("Graph size : %d\n", graph.Len())
|
||||||
|
total_kmer := graph.Len()
|
||||||
|
spectrum := graph.LinkSpectrum()
|
||||||
|
cum := make(map[int]int)
|
||||||
|
|
||||||
|
spectrum[1]=0
|
||||||
|
for i := 2; i < len(spectrum); i++ {
|
||||||
|
spectrum[i] += spectrum[i-1]
|
||||||
|
cum[spectrum[i]]++
|
||||||
|
}
|
||||||
|
|
||||||
|
max := 0
|
||||||
|
kmax := 0
|
||||||
|
for k, obs := range cum {
|
||||||
|
if obs > max {
|
||||||
|
max = obs
|
||||||
|
kmax = k
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
threshold := 0
|
||||||
|
for i, total := range spectrum {
|
||||||
|
if total == kmax {
|
||||||
|
threshold = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
graph.FilterMin(threshold)
|
||||||
|
log.Printf("Graph size : %d\n", graph.Len())
|
||||||
|
|
||||||
|
seq,err := graph.LongestConsensus(seqs[0].Source())
|
||||||
|
|
||||||
|
seq.SetCount(len(seqs))
|
||||||
|
seq.SetAttribute("seq_length",seq.Len())
|
||||||
|
seq.SetAttribute("kmer_size",kmersize)
|
||||||
|
seq.SetAttribute("kmer_min_occur",threshold)
|
||||||
|
seq.SetAttribute("kmer_max_occur",graph.MaxLink())
|
||||||
|
seq.SetAttribute("filtered_graph_size",graph.Len())
|
||||||
|
seq.SetAttribute("full_graph_size",total_kmer)
|
||||||
|
|
||||||
|
return seq,err
|
||||||
|
}
|
||||||
|
|
||||||
|
func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence {
|
||||||
|
newIter := obiiter.MakeIBioSequence()
|
||||||
|
size:=10
|
||||||
|
|
||||||
|
newIter.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
newIter.WaitAndClose()
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
order := 0
|
||||||
|
iterator = iterator.SortBatches()
|
||||||
|
buffer := obiseq.MakeBioSequenceSlice()
|
||||||
|
|
||||||
|
for iterator.Next() {
|
||||||
|
seqs := iterator.Get()
|
||||||
|
consensus,err := BuildConsensus(seqs.Slice(),quorum)
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
buffer = append(buffer, consensus)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(buffer) == size {
|
||||||
|
newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
|
||||||
|
order++
|
||||||
|
buffer = obiseq.MakeBioSequenceSlice()
|
||||||
|
}
|
||||||
|
seqs.Recycle()
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(buffer) > 0 {
|
||||||
|
newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
|
||||||
|
}
|
||||||
|
|
||||||
|
newIter.Done()
|
||||||
|
|
||||||
|
}()
|
||||||
|
|
||||||
|
return newIter
|
||||||
|
}
|
13
pkg/obitools/obiconsensus/options.go
Normal file
13
pkg/obitools/obiconsensus/options.go
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
package obiconsensus
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.InputOptionSet(options)
|
||||||
|
obiconvert.OutputOptionSet(options)
|
||||||
|
}
|
||||||
|
|
@ -30,6 +30,8 @@ var __compressed__ = false
|
|||||||
var __output_file_name__ = "-"
|
var __output_file_name__ = "-"
|
||||||
var __paired_file_name__ = ""
|
var __paired_file_name__ = ""
|
||||||
|
|
||||||
|
var __full_file_batch__ = false
|
||||||
|
|
||||||
func InputOptionSet(options *getoptions.GetOpt) {
|
func InputOptionSet(options *getoptions.GetOpt) {
|
||||||
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
|
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
|
||||||
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
|
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
|
||||||
@ -201,3 +203,10 @@ func CLIHasPairedFile() bool {
|
|||||||
func CLIPairedFileName() string {
|
func CLIPairedFileName() string {
|
||||||
return __paired_file_name__
|
return __paired_file_name__
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func SetFullFileBatch() {
|
||||||
|
__full_file_batch__ = true
|
||||||
|
}
|
||||||
|
func FullFileBatch() bool {
|
||||||
|
return __full_file_batch__
|
||||||
|
}
|
@ -99,9 +99,13 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||||
|
|
||||||
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
|
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
|
||||||
|
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
|
||||||
|
|
||||||
|
|
||||||
if len(filenames) == 0 {
|
if len(filenames) == 0 {
|
||||||
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
|
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
|
||||||
|
opts = append(opts, obiformats.OptionsSource("stdin"))
|
||||||
|
|
||||||
switch CLIInputFormat() {
|
switch CLIInputFormat() {
|
||||||
case "ecopcr":
|
case "ecopcr":
|
||||||
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...)
|
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...)
|
||||||
@ -121,7 +125,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
|||||||
|
|
||||||
switch CLIInputFormat() {
|
switch CLIInputFormat() {
|
||||||
case "ecopcr":
|
case "ecopcr":
|
||||||
reader = obiformats.ReadEcoPCRBatchFromFile
|
reader = obiformats.ReadEcoPCRFromFile
|
||||||
case "embl":
|
case "embl":
|
||||||
reader = obiformats.ReadEMBLFromFile
|
reader = obiformats.ReadEMBLFromFile
|
||||||
case "genbank":
|
case "genbank":
|
||||||
|
16
pkg/obiutils/path.go
Normal file
16
pkg/obiutils/path.go
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
package obiutils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func RemoveAllExt(p string) string {
|
||||||
|
|
||||||
|
for ext := path.Ext(p); len(ext) > 0; ext = path.Ext(p) {
|
||||||
|
p = strings.TrimSuffix(p, ext)
|
||||||
|
}
|
||||||
|
|
||||||
|
return p
|
||||||
|
|
||||||
|
}
|
Reference in New Issue
Block a user