mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
A first version of obigrep. Normally fully functionnal, but not fully tested
This commit is contained in:
42
cmd/obitools/obigrep/main.go
Normal file
42
cmd/obitools/obigrep/main.go
Normal file
@ -0,0 +1,42 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"runtime/pprof"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
defer obiseq.LogBioSeqStatus()
|
||||
|
||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||
f, err := os.Create("cpu.pprof")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
pprof.StartCPUProfile(f)
|
||||
defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
// ftrace, err := os.Create("cpu.trace")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// trace.Start(ftrace)
|
||||
// defer trace.Stop()
|
||||
|
||||
optionParser := obioptions.GenerateOptionParser(obigrep.OptionSet)
|
||||
|
||||
_, args, _ := optionParser(os.Args)
|
||||
|
||||
sequences, _ := obiconvert.ReadBioSequencesBatch(args...)
|
||||
selected := obigrep.IFilterSequence(sequences)
|
||||
obiconvert.WriteBioSequencesBatch(selected, true)
|
||||
}
|
@ -1,8 +1,11 @@
|
||||
package goutils
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/gob"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
// NotAnInteger defines a new type of Error : "NotAnInteger"
|
||||
@ -73,3 +76,33 @@ func CopyMap(dest, src map[string]interface{}) {
|
||||
gob.NewEncoder(buf).Encode(src)
|
||||
gob.NewDecoder(buf).Decode(&dest)
|
||||
}
|
||||
|
||||
// Read a whole file into the memory and store it as array of lines
|
||||
func ReadLines(path string) (lines []string, err error) {
|
||||
var (
|
||||
file *os.File
|
||||
part []byte
|
||||
prefix bool
|
||||
)
|
||||
if file, err = os.Open(path); err != nil {
|
||||
return
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
reader := bufio.NewReader(file)
|
||||
buffer := bytes.NewBuffer(make([]byte, 0))
|
||||
for {
|
||||
if part, prefix, err = reader.ReadLine(); err != nil {
|
||||
break
|
||||
}
|
||||
buffer.Write(part)
|
||||
if !prefix {
|
||||
lines = append(lines, buffer.String())
|
||||
buffer.Reset()
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
err = nil
|
||||
}
|
||||
return
|
||||
}
|
||||
|
@ -559,3 +559,56 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
|
||||
|
||||
return trueIter, falseIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
|
||||
size int, sizes ...int) IBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
nworkers := 4
|
||||
|
||||
if len(sizes) > 0 {
|
||||
nworkers = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
trueIter.Add(nworkers)
|
||||
|
||||
go func() {
|
||||
trueIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
ff := func(iterator IBioSequenceBatch) {
|
||||
iterator = iterator.SortBatches()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
slice := seqs.slice
|
||||
j := 0
|
||||
for _, s := range slice {
|
||||
if predicate(s) {
|
||||
slice[j] = s
|
||||
j++
|
||||
} else {
|
||||
s.Recycle()
|
||||
}
|
||||
}
|
||||
|
||||
seqs.slice = slice[:j]
|
||||
trueIter.Push(seqs)
|
||||
}
|
||||
|
||||
trueIter.Done()
|
||||
}
|
||||
|
||||
for i := 1; i < nworkers; i++ {
|
||||
go ff(iterator.Split())
|
||||
}
|
||||
|
||||
go ff(iterator)
|
||||
|
||||
return trueIter.Rebatch(size)
|
||||
}
|
||||
|
@ -1,533 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/tevino/abool/v2"
|
||||
)
|
||||
|
||||
type BioSequenceBatch struct {
|
||||
slice BioSequenceSlice
|
||||
order int
|
||||
}
|
||||
|
||||
var NilBioSequenceBatch = BioSequenceBatch{nil, -1}
|
||||
|
||||
func MakeBioSequenceBatch(order int, sequences BioSequenceSlice) BioSequenceBatch {
|
||||
return BioSequenceBatch{
|
||||
slice: sequences,
|
||||
order: order,
|
||||
}
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Order() int {
|
||||
return batch.order
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch {
|
||||
batch.order = newOrder
|
||||
return batch
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Slice() BioSequenceSlice {
|
||||
return batch.slice
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Length() int {
|
||||
return len(batch.slice)
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) NotEmpty() bool {
|
||||
return batch.slice.NotEmpty()
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Pop0() *BioSequence {
|
||||
return batch.slice.Pop0()
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) IsNil() bool {
|
||||
return batch.slice == nil
|
||||
}
|
||||
|
||||
func (batch BioSequenceBatch) Recycle() {
|
||||
batch.slice.Recycle()
|
||||
batch.slice = nil
|
||||
}
|
||||
|
||||
// Structure implementing an iterator over bioseq.BioSequenceBatch
|
||||
// based on a channel.
|
||||
type _IBioSequenceBatch struct {
|
||||
channel chan BioSequenceBatch
|
||||
current BioSequenceBatch
|
||||
pushBack *abool.AtomicBool
|
||||
all_done *sync.WaitGroup
|
||||
lock *sync.RWMutex
|
||||
buffer_size int32
|
||||
batch_size int32
|
||||
sequence_format string
|
||||
finished *abool.AtomicBool
|
||||
}
|
||||
|
||||
type IBioSequenceBatch struct {
|
||||
pointer *_IBioSequenceBatch
|
||||
}
|
||||
|
||||
var NilIBioSequenceBatch = IBioSequenceBatch{pointer: nil}
|
||||
|
||||
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
|
||||
buffsize := int32(1)
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = int32(sizes[0])
|
||||
}
|
||||
|
||||
i := _IBioSequenceBatch{
|
||||
channel: make(chan BioSequenceBatch, buffsize),
|
||||
current: NilBioSequenceBatch,
|
||||
pushBack: abool.New(),
|
||||
buffer_size: buffsize,
|
||||
batch_size: -1,
|
||||
sequence_format: "",
|
||||
finished: abool.New(),
|
||||
}
|
||||
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
lock := sync.RWMutex{}
|
||||
i.lock = &lock
|
||||
ii := IBioSequenceBatch{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Unlock() {
|
||||
iterator.pointer.lock.Unlock()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Lock() {
|
||||
iterator.pointer.lock.Lock()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) RLock() {
|
||||
iterator.pointer.lock.RLock()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) RUnlock() {
|
||||
iterator.pointer.lock.RUnlock()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) BufferSize() int {
|
||||
return int(atomic.LoadInt32(&iterator.pointer.buffer_size))
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) BatchSize() int {
|
||||
return int(atomic.LoadInt32(&iterator.pointer.batch_size))
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) SetBatchSize(size int) error {
|
||||
if size >= 0 {
|
||||
atomic.StoreInt32(&iterator.pointer.batch_size, int32(size))
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("size (%d) cannot be negative", size)
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
|
||||
iterator.pointer.lock.RLock()
|
||||
defer iterator.pointer.lock.RUnlock()
|
||||
i := _IBioSequenceBatch{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilBioSequenceBatch,
|
||||
pushBack: abool.New(),
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
batch_size: iterator.pointer.batch_size,
|
||||
sequence_format: iterator.pointer.sequence_format,
|
||||
finished: iterator.pointer.finished}
|
||||
lock := sync.RWMutex{}
|
||||
i.lock = &lock
|
||||
|
||||
newIter := IBioSequenceBatch{&i}
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Next() bool {
|
||||
if iterator.pointer.pushBack.IsSet() {
|
||||
iterator.pointer.pushBack.UnSet()
|
||||
return true
|
||||
}
|
||||
|
||||
if iterator.pointer.finished.IsSet() {
|
||||
return false
|
||||
}
|
||||
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilBioSequenceBatch
|
||||
iterator.pointer.finished.Set()
|
||||
return false
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) PushBack() {
|
||||
if !iterator.pointer.current.IsNil() {
|
||||
iterator.pointer.pushBack.Set()
|
||||
}
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequenceBatch
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Push(batch BioSequenceBatch) {
|
||||
if batch.IsNil() {
|
||||
log.Panicln("An Nil batch is pushed on the channel")
|
||||
}
|
||||
if batch.Length() == 0 {
|
||||
log.Panicln("An empty batch is pushed on the channel")
|
||||
}
|
||||
|
||||
iterator.pointer.channel <- batch
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Close() {
|
||||
close(iterator.pointer.channel)
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) WaitAndClose() {
|
||||
iterator.Wait()
|
||||
|
||||
for len(iterator.Channel()) > 0 {
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
iterator.Close()
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IBioSequenceBatch) Finished() bool {
|
||||
return iterator.pointer.finished.IsSet()
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequence(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
|
||||
for batch.NotEmpty() {
|
||||
newIter.pointer.channel <- batch.Pop0()
|
||||
}
|
||||
batch.Recycle()
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]BioSequenceBatch)
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if batch.order == next_to_send {
|
||||
newIter.pointer.channel <- batch
|
||||
next_to_send++
|
||||
batch, ok := received[next_to_send]
|
||||
for ok {
|
||||
newIter.pointer.channel <- batch
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
batch, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[batch.order] = batch
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
|
||||
|
||||
if len(iterators) == 0 {
|
||||
return iterator
|
||||
}
|
||||
|
||||
buffsize := iterator.BufferSize()
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.Channel())
|
||||
}()
|
||||
|
||||
go func() {
|
||||
previous_max := 0
|
||||
max_order := 0
|
||||
|
||||
for iterator.Next() {
|
||||
s := iterator.Get()
|
||||
if s.order > max_order {
|
||||
max_order = s.order
|
||||
}
|
||||
newIter.Push(s.Reorder(s.order + previous_max))
|
||||
}
|
||||
|
||||
previous_max = max_order + 1
|
||||
for _, iter := range iterators {
|
||||
for iter.Next() {
|
||||
s := iter.Get()
|
||||
if (s.order + previous_max) > max_order {
|
||||
max_order = s.order + previous_max
|
||||
}
|
||||
|
||||
newIter.Push(s.Reorder(s.order + previous_max))
|
||||
}
|
||||
previous_max = max_order + 1
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
// Redistributes sequences from a IBioSequenceBatch into a new
|
||||
// IBioSequenceBatch with every batches having the same size
|
||||
// indicated in parameter. Rebatching implies to sort the
|
||||
// source IBioSequenceBatch.
|
||||
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
order := 0
|
||||
iterator = iterator.SortBatches()
|
||||
buffer := MakeBioSequenceSlice()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
for _, s := range seqs.slice {
|
||||
buffer = append(buffer, s)
|
||||
if len(buffer) == size {
|
||||
newIter.Push(MakeBioSequenceBatch(order, buffer))
|
||||
order++
|
||||
buffer = MakeBioSequenceSlice()
|
||||
}
|
||||
}
|
||||
seqs.Recycle()
|
||||
}
|
||||
|
||||
if len(buffer) > 0 {
|
||||
newIter.Push(MakeBioSequenceBatch(order, buffer))
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) Recycle() {
|
||||
|
||||
log.Println("Start recycling of Bioseq objects")
|
||||
recycled := 0
|
||||
for iterator.Next() {
|
||||
// iterator.Get()
|
||||
batch := iterator.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
seq.Recycle()
|
||||
recycled++
|
||||
}
|
||||
batch.Recycle()
|
||||
}
|
||||
log.Printf("End of the recycling of %d Bioseq objects", recycled)
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
batchsize := 5000
|
||||
|
||||
if len(sizes) > 0 {
|
||||
batchsize = sizes[0]
|
||||
}
|
||||
|
||||
if len(sizes) > 1 {
|
||||
buffsize = sizes[1]
|
||||
}
|
||||
|
||||
iterator = iterator.Rebatch(batchsize)
|
||||
reverse = reverse.Rebatch(batchsize)
|
||||
|
||||
newIter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
log.Println("End of association of paired reads")
|
||||
}()
|
||||
|
||||
log.Println("Start association of paired reads")
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
if !reverse.Next() {
|
||||
log.Panicln("Etrange reverse pas prêt")
|
||||
}
|
||||
newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
|
||||
reverse.Get())
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
|
||||
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
trueIter := MakeIBioSequenceBatch(buffsize)
|
||||
falseIter := MakeIBioSequenceBatch(buffsize)
|
||||
|
||||
trueIter.Add(1)
|
||||
falseIter.Add(1)
|
||||
|
||||
go func() {
|
||||
trueIter.WaitAndClose()
|
||||
falseIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
trueOrder := 0
|
||||
falseOrder := 0
|
||||
iterator = iterator.SortBatches()
|
||||
|
||||
trueSlice := MakeBioSequenceSlice()
|
||||
falseSlice := MakeBioSequenceSlice()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
for _, s := range seqs.slice {
|
||||
if predicate(s) {
|
||||
trueSlice = append(trueSlice, s)
|
||||
} else {
|
||||
falseSlice = append(falseSlice, s)
|
||||
}
|
||||
|
||||
if len(trueSlice) == size {
|
||||
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
|
||||
trueOrder++
|
||||
trueSlice = MakeBioSequenceSlice()
|
||||
}
|
||||
|
||||
if len(falseSlice) == size {
|
||||
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
|
||||
falseOrder++
|
||||
falseSlice = MakeBioSequenceSlice()
|
||||
}
|
||||
}
|
||||
seqs.Recycle()
|
||||
}
|
||||
|
||||
if len(trueSlice) > 0 {
|
||||
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
|
||||
}
|
||||
|
||||
if len(falseSlice) > 0 {
|
||||
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
|
||||
}
|
||||
|
||||
trueIter.Done()
|
||||
falseIter.Done()
|
||||
}()
|
||||
|
||||
return trueIter, falseIter
|
||||
}
|
@ -1,213 +0,0 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"log"
|
||||
"sync"
|
||||
)
|
||||
|
||||
type PairedBioSequenceBatch struct {
|
||||
forward BioSequenceSlice
|
||||
reverse BioSequenceSlice
|
||||
order int
|
||||
}
|
||||
|
||||
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
|
||||
|
||||
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
|
||||
if forward.order != reverse.order {
|
||||
log.Fatalf("Forward order : %d and reverse order : %d are not matching",
|
||||
forward.order, reverse.order)
|
||||
}
|
||||
|
||||
for i := range reverse.slice {
|
||||
reverse.slice[i].ReverseComplement(true)
|
||||
}
|
||||
|
||||
return PairedBioSequenceBatch{
|
||||
forward: forward.slice,
|
||||
reverse: reverse.slice,
|
||||
order: forward.order,
|
||||
}
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Order() int {
|
||||
return batch.order
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Length() int {
|
||||
return len(batch.forward)
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Forward() BioSequenceSlice {
|
||||
return batch.forward
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) Reverse() BioSequenceSlice {
|
||||
return batch.reverse
|
||||
}
|
||||
|
||||
func (batch PairedBioSequenceBatch) IsNil() bool {
|
||||
return batch.forward == nil
|
||||
}
|
||||
|
||||
// Structure implementing an iterator over bioseq.BioSequenceBatch
|
||||
// based on a channel.
|
||||
type __ipairedbiosequencebatch__ struct {
|
||||
channel chan PairedBioSequenceBatch
|
||||
current PairedBioSequenceBatch
|
||||
pushBack bool
|
||||
all_done *sync.WaitGroup
|
||||
buffer_size int
|
||||
finished bool
|
||||
p_finished *bool
|
||||
}
|
||||
|
||||
type IPairedBioSequenceBatch struct {
|
||||
pointer *__ipairedbiosequencebatch__
|
||||
}
|
||||
|
||||
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
|
||||
|
||||
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := 1
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: make(chan PairedBioSequenceBatch, buffsize),
|
||||
current: NilPairedBioSequenceBatch,
|
||||
pushBack: false,
|
||||
buffer_size: buffsize,
|
||||
finished: false,
|
||||
p_finished: nil,
|
||||
}
|
||||
|
||||
i.p_finished = &i.finished
|
||||
waiting := sync.WaitGroup{}
|
||||
i.all_done = &waiting
|
||||
ii := IPairedBioSequenceBatch{&i}
|
||||
return ii
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Add(n int) {
|
||||
iterator.pointer.all_done.Add(n)
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Done() {
|
||||
iterator.pointer.all_done.Done()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Wait() {
|
||||
iterator.pointer.all_done.Wait()
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
|
||||
return iterator.pointer.channel
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) IsNil() bool {
|
||||
return iterator.pointer == nil
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) BufferSize() int {
|
||||
return iterator.pointer.buffer_size
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
|
||||
i := __ipairedbiosequencebatch__{
|
||||
channel: iterator.pointer.channel,
|
||||
current: NilPairedBioSequenceBatch,
|
||||
pushBack: false,
|
||||
all_done: iterator.pointer.all_done,
|
||||
buffer_size: iterator.pointer.buffer_size,
|
||||
finished: false,
|
||||
p_finished: iterator.pointer.p_finished}
|
||||
newIter := IPairedBioSequenceBatch{&i}
|
||||
return newIter
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) Next() bool {
|
||||
if *(iterator.pointer.p_finished) {
|
||||
return false
|
||||
}
|
||||
|
||||
if iterator.pointer.pushBack {
|
||||
iterator.pointer.pushBack = false
|
||||
return true
|
||||
}
|
||||
|
||||
next, ok := (<-iterator.pointer.channel)
|
||||
|
||||
if ok {
|
||||
iterator.pointer.current = next
|
||||
return true
|
||||
}
|
||||
|
||||
iterator.pointer.current = NilPairedBioSequenceBatch
|
||||
*iterator.pointer.p_finished = true
|
||||
return false
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) PushBack() {
|
||||
if !iterator.pointer.current.IsNil() {
|
||||
iterator.pointer.pushBack = true
|
||||
}
|
||||
}
|
||||
|
||||
// The 'Get' method returns the instance of BioSequenceBatch
|
||||
// currently pointed by the iterator. You have to use the
|
||||
// 'Next' method to move to the next entry before calling
|
||||
// 'Get' to retreive the following instance.
|
||||
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
|
||||
return iterator.pointer.current
|
||||
}
|
||||
|
||||
// Finished returns 'true' value if no more data is available
|
||||
// from the iterator.
|
||||
func (iterator IPairedBioSequenceBatch) Finished() bool {
|
||||
return *iterator.pointer.p_finished
|
||||
}
|
||||
|
||||
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
|
||||
buffsize := iterator.BufferSize()
|
||||
|
||||
if len(sizes) > 0 {
|
||||
buffsize = sizes[0]
|
||||
}
|
||||
|
||||
newIter := MakeIPairedBioSequenceBatch(buffsize)
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.Wait()
|
||||
close(newIter.pointer.channel)
|
||||
}()
|
||||
|
||||
next_to_send := 0
|
||||
received := make(map[int]PairedBioSequenceBatch)
|
||||
go func() {
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if batch.order == next_to_send {
|
||||
newIter.pointer.channel <- batch
|
||||
next_to_send++
|
||||
batch, ok := received[next_to_send]
|
||||
for ok {
|
||||
newIter.pointer.channel <- batch
|
||||
delete(received, next_to_send)
|
||||
next_to_send++
|
||||
batch, ok = received[next_to_send]
|
||||
}
|
||||
} else {
|
||||
received[batch.order] = batch
|
||||
}
|
||||
}
|
||||
newIter.Done()
|
||||
}()
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
@ -2,6 +2,9 @@ package obiseq
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"regexp"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"github.com/PaesslerAG/gval"
|
||||
@ -10,37 +13,55 @@ import (
|
||||
type SequencePredicate func(*BioSequence) bool
|
||||
|
||||
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return predicate1(sequence) && predicate2(sequence)
|
||||
switch {
|
||||
case predicate1 == nil:
|
||||
return predicate2
|
||||
case predicate2 == nil:
|
||||
return predicate1
|
||||
default:
|
||||
return func(sequence *BioSequence) bool {
|
||||
return predicate1(sequence) && predicate2(sequence)
|
||||
}
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return predicate1(sequence) || predicate2(sequence)
|
||||
switch {
|
||||
case predicate1 == nil:
|
||||
return predicate2
|
||||
case predicate2 == nil:
|
||||
return predicate1
|
||||
default:
|
||||
return func(sequence *BioSequence) bool {
|
||||
return predicate1(sequence) || predicate2(sequence)
|
||||
}
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
p1 := predicate1(sequence)
|
||||
p2 := predicate2(sequence)
|
||||
return (p1 && !p2) || (p2 && !p1)
|
||||
switch {
|
||||
case predicate1 == nil:
|
||||
return predicate2
|
||||
case predicate2 == nil:
|
||||
return predicate1
|
||||
default:
|
||||
return func(sequence *BioSequence) bool {
|
||||
p1 := predicate1(sequence)
|
||||
p2 := predicate2(sequence)
|
||||
return (p1 && !p2) || (p2 && !p1)
|
||||
}
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func (predicate1 SequencePredicate) Not() SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return !predicate1(sequence)
|
||||
switch {
|
||||
case predicate1 == nil:
|
||||
return nil
|
||||
default:
|
||||
return func(sequence *BioSequence) bool {
|
||||
return !predicate1(sequence)
|
||||
}
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func HasAttribute(name string) SequencePredicate {
|
||||
@ -57,9 +78,43 @@ func HasAttribute(name string) SequencePredicate {
|
||||
return f
|
||||
}
|
||||
|
||||
func MoreAbundantThan(count int) SequencePredicate {
|
||||
func IsAttributeMatch(name string, pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile(pattern)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("error in atribute %s regular pattern syntax : %v", name, err)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return sequence.Count() > count
|
||||
if sequence.HasAnnotation() {
|
||||
val, ok := (sequence.Annotations())[name]
|
||||
if ok {
|
||||
switch val := val.(type) {
|
||||
case string:
|
||||
return pat.MatchString(val)
|
||||
default:
|
||||
return pat.MatchString(fmt.Sprint(val))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsMoreAbundantOrEqualTo(count int) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return sequence.Count() >= count
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsLessAbundantOrEqualTo(count int) SequencePredicate {
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return sequence.Count() <= count
|
||||
}
|
||||
|
||||
return f
|
||||
@ -81,7 +136,64 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
|
||||
return f
|
||||
}
|
||||
|
||||
func ExrpessionPredicat(expression string) SequencePredicate {
|
||||
func IsSequenceMatch(pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile("(?i)" + pattern)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("error in sequence regular pattern syntax : %v", err)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return pat.Match(sequence.Sequence())
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsDefinitionMatch(pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile(pattern)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("error in definition regular pattern syntax : %v", err)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return pat.MatchString(sequence.Definition())
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsIdMatch(pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile(pattern)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("error in identifier regular pattern syntax : %v", err)
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
return pat.MatchString(sequence.Id())
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsIdIn(ids ...string) SequencePredicate {
|
||||
idset := make(map[string]bool)
|
||||
|
||||
for _, v := range ids {
|
||||
idset[v] = true
|
||||
}
|
||||
|
||||
f := func(sequence *BioSequence) bool {
|
||||
_, ok := idset[sequence.Id()]
|
||||
return ok
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func ExpressionPredicat(expression string) SequencePredicate {
|
||||
|
||||
exp, err := gval.Full().NewEvaluable(expression)
|
||||
|
||||
|
@ -1,2 +0,0 @@
|
||||
package obiseq
|
||||
|
@ -2,6 +2,8 @@ package obitax
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
type TaxNode struct {
|
||||
@ -64,3 +66,23 @@ func (node *TaxNode) IsNameMatching(pattern *regexp.Regexp) bool {
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (node *TaxNode) HasRankDefined(rank string) bool {
|
||||
|
||||
for node.rank != rank && node.parent != node.taxid {
|
||||
node = node.pparent
|
||||
}
|
||||
|
||||
return node.rank == rank
|
||||
|
||||
}
|
||||
|
||||
func HasRankDefined(taxonomy Taxonomy, rank string) obiseq.SequencePredicate {
|
||||
|
||||
f := func(sequence *obiseq.BioSequence) bool {
|
||||
taxon, err := taxonomy.Taxon(sequence.Taxid())
|
||||
return err == nil && taxon.HasRankDefined(rank)
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
@ -38,9 +38,10 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
|
||||
}
|
||||
options.BoolVar(&__rank_list__, "rank-list", false,
|
||||
options.Alias("l"),
|
||||
options.Description("List every taxonomic rank available iin the taxonomy."))
|
||||
options.IntSliceVar(&__taxonomical_restriction__, "subclade-of", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.Description("List every taxonomic rank available in the taxonomy."))
|
||||
|
||||
options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1,
|
||||
options.Alias("r"),
|
||||
options.Description("Restrict output to some subclades."))
|
||||
}
|
||||
|
||||
|
47
pkg/obitools/obigrep/grep.go
Normal file
47
pkg/obitools/obigrep/grep.go
Normal file
@ -0,0 +1,47 @@
|
||||
package obigrep
|
||||
|
||||
import (
|
||||
"log"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
)
|
||||
|
||||
func IFilterSequence(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||
var newIter obiiter.IBioSequenceBatch
|
||||
|
||||
predicate := CLISequenceSelectionPredicate()
|
||||
|
||||
if predicate != nil {
|
||||
if CLISaveDiscardedSequences() {
|
||||
var discarded obiiter.IBioSequenceBatch
|
||||
|
||||
log.Printf("Discarded sequences saved in file: %s\n", CLIDiscardedFileName())
|
||||
newIter, discarded = iterator.DivideOn(predicate,
|
||||
obioptions.CLIBatchSize())
|
||||
|
||||
go func() {
|
||||
_, err := obiconvert.WriteBioSequencesBatch(discarded,
|
||||
true,
|
||||
CLIDiscardedFileName())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
} else {
|
||||
newIter = iterator.FilterOn(predicate,
|
||||
obioptions.CLIBatchSize(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
obioptions.CLIBufferSize(),
|
||||
)
|
||||
}
|
||||
} else {
|
||||
newIter = iterator
|
||||
}
|
||||
|
||||
return newIter
|
||||
|
||||
}
|
375
pkg/obitools/obigrep/options.go
Normal file
375
pkg/obitools/obigrep/options.go
Normal file
@ -0,0 +1,375 @@
|
||||
package obigrep
|
||||
|
||||
import (
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats/ncbitaxdump"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _BelongTaxa = make([]int, 0)
|
||||
var _NotBelongTaxa = make([]int, 0)
|
||||
var _RequiredRanks = make([]string, 0)
|
||||
|
||||
var _MinimumLength = 1
|
||||
var _MaximumLength = int(2e9)
|
||||
|
||||
var _MinimumCount = 1
|
||||
var _MaximumCount = int(2e9)
|
||||
|
||||
var _SequencePatterns = make([]string, 0)
|
||||
var _DefinitionPatterns = make([]string, 0)
|
||||
var _IdPatterns = make([]string, 0)
|
||||
|
||||
var _Predicats = make([]string, 0)
|
||||
|
||||
var _IdList = ""
|
||||
|
||||
var _Taxdump = ""
|
||||
var _Taxonomy = (*obitax.Taxonomy)(nil)
|
||||
var _RequiredAttributes = make([]string, 0)
|
||||
var _AttributePatterns = make(map[string]string, 0)
|
||||
|
||||
var _InvertMatch = false
|
||||
var _SaveRejected = ""
|
||||
|
||||
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringVar(&_Taxdump, "taxdump", _Taxdump,
|
||||
options.Alias("t"),
|
||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
||||
|
||||
options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
|
||||
options.Alias("r"),
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
|
||||
|
||||
options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1,
|
||||
options.Alias("i"),
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid."))
|
||||
|
||||
options.StringSliceVar(&_RequiredRanks, "require-rank", 1, 1,
|
||||
options.ArgName("RANK_NAME"),
|
||||
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
|
||||
|
||||
}
|
||||
|
||||
func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
TaxonomySelectionOptionSet(options)
|
||||
options.StringVar(&_SaveRejected, "save-discarded", _SaveRejected,
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("Indicates in which file not selected sequences must be saved."))
|
||||
|
||||
options.StringVar(&_IdList, "id-list", _IdList,
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("<FILENAME> points to a text file containing the list of sequence record identifiers to be selected."+
|
||||
" The file format consists in a single identifier per line."))
|
||||
|
||||
options.BoolVar(&_InvertMatch, "inverse-match", _InvertMatch,
|
||||
options.Alias("v"),
|
||||
options.Description("Inverts the sequence record selection."))
|
||||
|
||||
options.IntVar(&_MinimumLength, "min-length", _MinimumLength,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Alias("l"),
|
||||
options.Description("Selects sequence records whose sequence length is equal or longer than lmin."))
|
||||
|
||||
options.IntVar(&_MaximumLength, "max-length", _MaximumLength,
|
||||
options.ArgName("LENGTH"),
|
||||
options.Alias("L"),
|
||||
options.Description("Selects sequence records whose sequence length is equal or shorter than lmax."))
|
||||
|
||||
options.IntVar(&_MinimumCount, "min-count", _MinimumCount,
|
||||
options.ArgName("COUNT"),
|
||||
options.Alias("c"),
|
||||
options.Description("Selects sequence records occuring at least min count time in the data set."))
|
||||
|
||||
options.IntVar(&_MaximumCount, "max-count", _MaximumCount,
|
||||
options.ArgName("COUNT"),
|
||||
options.Alias("C"),
|
||||
options.Description("Selects sequence records occuring no more than max count time in the data set."))
|
||||
|
||||
options.StringSliceVar(&_Predicats, "predicate", 1, 1,
|
||||
options.Alias("p"),
|
||||
options.ArgName("EXPRESSION"),
|
||||
options.Description("boolean expression to be evaluated for each sequence record. "+
|
||||
"The attribute keys defined for each sequence record can be used in the "+
|
||||
"expression as variable names. An extra variable named ‘sequence’ refers "+
|
||||
"to the sequence record itself. Several -p options can be used on the same "+
|
||||
"command line and in this last case, the selected sequence records will match "+
|
||||
"all constraints."))
|
||||
|
||||
options.StringSliceVar(&_SequencePatterns, "sequence", 1, 1,
|
||||
options.Alias("s"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_DefinitionPatterns, "definition", 1, 1,
|
||||
options.Alias("D"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence definition. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_IdPatterns, "identifier", 1, 1,
|
||||
options.Alias("I"),
|
||||
options.ArgName("PATTERN"),
|
||||
options.Description("Regular expression pattern to be tested against the sequence identifier. The pattern is case insensitive."))
|
||||
|
||||
options.StringSliceVar(&_RequiredAttributes, "has-attribute", 1, 1,
|
||||
options.Alias("A"),
|
||||
options.ArgName("KEY"),
|
||||
options.Description("Selects sequence records having an attribute whose key = <KEY>."))
|
||||
|
||||
options.StringMapVar(&_AttributePatterns, "attribute", 1, 1,
|
||||
options.Alias("a"),
|
||||
options.ArgName("KEY=VALUE"),
|
||||
options.Description("Regular expression pattern matched against the attributes of the sequence record. "+
|
||||
"the value of this attribute is of the form : key:regular_pattern. The pattern is case sensitive. "+
|
||||
"Several -a options can be used on the same command line and in this last case, the selected "+
|
||||
"sequence records will match all constraints."))
|
||||
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obipcr command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
SequenceSelectionOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIMinSequenceLength() int {
|
||||
return _MinimumLength
|
||||
}
|
||||
|
||||
func CLIMaxSequenceLength() int {
|
||||
return _MaximumLength
|
||||
}
|
||||
|
||||
func CLISequenceSizePredicate() obiseq.SequencePredicate {
|
||||
if _MinimumLength > 1 {
|
||||
p := obiseq.IsLongerOrEqualTo(_MinimumLength)
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
p = p.And(obiseq.IsShorterOrEqualTo(_MaximumLength))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
return obiseq.IsShorterOrEqualTo(_MaximumLength)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIMinSequenceCount() int {
|
||||
return _MinimumCount
|
||||
}
|
||||
|
||||
func CLIMaxSequenceCount() int {
|
||||
return _MaximumCount
|
||||
}
|
||||
|
||||
func CLISequenceCountPredicate() obiseq.SequencePredicate {
|
||||
if _MinimumCount > 1 {
|
||||
p := obiseq.IsMoreAbundantOrEqualTo(_MinimumCount)
|
||||
|
||||
if _MaximumCount != int(2e9) {
|
||||
p = p.And(obiseq.IsLessAbundantOrEqualTo(_MaximumCount))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
if _MaximumLength != int(2e9) {
|
||||
return obiseq.IsLessAbundantOrEqualTo(_MaximumCount)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISelectedNCBITaxDump() string {
|
||||
return _Taxdump
|
||||
}
|
||||
|
||||
func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
|
||||
if CLISelectedNCBITaxDump() != "" {
|
||||
if _Taxonomy == nil {
|
||||
var err error
|
||||
_Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true)
|
||||
if err != nil {
|
||||
log.Fatalf("cannot load taxonomy %s : %v",
|
||||
CLISelectedNCBITaxDump(), err)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return _Taxonomy
|
||||
}
|
||||
|
||||
log.Fatalln("no NCBI taxdump selected using option -t|--taxdump")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_BelongTaxa) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.IsSubCladeOf(*taxonomy, _BelongTaxa[0])
|
||||
|
||||
for _, taxid := range _BelongTaxa[1:] {
|
||||
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_NotBelongTaxa) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.IsSubCladeOf(*taxonomy, _NotBelongTaxa[0])
|
||||
|
||||
for _, taxid := range _NotBelongTaxa[1:] {
|
||||
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
|
||||
}
|
||||
|
||||
return p.Not()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_RequiredRanks) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := obitax.HasRankDefined(*taxonomy, _RequiredRanks[0])
|
||||
|
||||
for _, rank := range _RequiredRanks[1:] {
|
||||
p.And(obitax.HasRankDefined(*taxonomy, rank))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
|
||||
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate())
|
||||
}
|
||||
|
||||
func CLIPredicatesPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_Predicats) > 0 {
|
||||
p := obiseq.ExpressionPredicat(_Predicats[0])
|
||||
|
||||
for _, expression := range _Predicats[1:] {
|
||||
p.And(obiseq.ExpressionPredicat(expression))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISequencePatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_SequencePatterns) > 0 {
|
||||
p := obiseq.IsSequenceMatch(_SequencePatterns[0])
|
||||
|
||||
for _, pattern := range _SequencePatterns[1:] {
|
||||
p.And(obiseq.IsSequenceMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIDefinitionPatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_DefinitionPatterns) > 0 {
|
||||
p := obiseq.IsDefinitionMatch(_DefinitionPatterns[0])
|
||||
|
||||
for _, pattern := range _DefinitionPatterns[1:] {
|
||||
p.And(obiseq.IsDefinitionMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIdPatternPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if len(_IdPatterns) > 0 {
|
||||
p := obiseq.IsIdMatch(_IdPatterns[0])
|
||||
|
||||
for _, pattern := range _IdPatterns[1:] {
|
||||
p.And(obiseq.IsIdMatch(pattern))
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIdListPredicate() obiseq.SequencePredicate {
|
||||
|
||||
if _IdList != "" {
|
||||
ids, err := goutils.ReadLines(_IdList)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("cannot read the id file %s : %v", _IdList, err)
|
||||
}
|
||||
|
||||
for i, v := range ids {
|
||||
ids[i] = strings.TrimSpace(v)
|
||||
}
|
||||
|
||||
p := obiseq.IsIdIn(ids...)
|
||||
|
||||
return p
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLISequenceSelectionPredicate() obiseq.SequencePredicate {
|
||||
p := CLISequenceSizePredicate()
|
||||
p = p.And(CLISequenceCountPredicate())
|
||||
p = p.And(CLITaxonomyFilterPredicate())
|
||||
p = p.And(CLIPredicatesPredicate())
|
||||
p = p.And(CLISequencePatternPredicate())
|
||||
p = p.And(CLIDefinitionPatternPredicate())
|
||||
p = p.And(CLIIdPatternPredicate())
|
||||
p = p.And(CLIIdListPredicate())
|
||||
|
||||
if _InvertMatch {
|
||||
p = p.Not()
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func CLISaveDiscardedSequences() bool {
|
||||
return _SaveRejected != ""
|
||||
}
|
||||
|
||||
func CLIDiscardedFileName() string {
|
||||
return _SaveRejected
|
||||
}
|
Reference in New Issue
Block a user