A first version of obigrep. Normally fully functionnal, but not fully tested

This commit is contained in:
2022-02-25 07:29:52 +01:00
parent 0a36447e2a
commit 011898bd9d
11 changed files with 709 additions and 772 deletions

View File

@ -0,0 +1,42 @@
package main
import (
"os"
"runtime/pprof"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
)
func main() {
defer obiseq.LogBioSeqStatus()
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
f, err := os.Create("cpu.pprof")
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
// go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace")
// if err != nil {
// log.Fatal(err)
// }
// trace.Start(ftrace)
// defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obigrep.OptionSet)
_, args, _ := optionParser(os.Args)
sequences, _ := obiconvert.ReadBioSequencesBatch(args...)
selected := obigrep.IFilterSequence(sequences)
obiconvert.WriteBioSequencesBatch(selected, true)
}

View File

@ -1,8 +1,11 @@
package goutils
import (
"bufio"
"bytes"
"encoding/gob"
"io"
"os"
)
// NotAnInteger defines a new type of Error : "NotAnInteger"
@ -73,3 +76,33 @@ func CopyMap(dest, src map[string]interface{}) {
gob.NewEncoder(buf).Encode(src)
gob.NewDecoder(buf).Decode(&dest)
}
// Read a whole file into the memory and store it as array of lines
func ReadLines(path string) (lines []string, err error) {
var (
file *os.File
part []byte
prefix bool
)
if file, err = os.Open(path); err != nil {
return
}
defer file.Close()
reader := bufio.NewReader(file)
buffer := bytes.NewBuffer(make([]byte, 0))
for {
if part, prefix, err = reader.ReadLine(); err != nil {
break
}
buffer.Write(part)
if !prefix {
lines = append(lines, buffer.String())
buffer.Reset()
}
}
if err == io.EOF {
err = nil
}
return
}

View File

@ -559,3 +559,56 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
return trueIter, falseIter
}
func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
size int, sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
nworkers := 4
if len(sizes) > 0 {
nworkers = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
trueIter := MakeIBioSequenceBatch(buffsize)
trueIter.Add(nworkers)
go func() {
trueIter.WaitAndClose()
}()
ff := func(iterator IBioSequenceBatch) {
iterator = iterator.SortBatches()
for iterator.Next() {
seqs := iterator.Get()
slice := seqs.slice
j := 0
for _, s := range slice {
if predicate(s) {
slice[j] = s
j++
} else {
s.Recycle()
}
}
seqs.slice = slice[:j]
trueIter.Push(seqs)
}
trueIter.Done()
}
for i := 1; i < nworkers; i++ {
go ff(iterator.Split())
}
go ff(iterator)
return trueIter.Rebatch(size)
}

View File

@ -1,533 +0,0 @@
package obiseq
import (
"fmt"
"log"
"sync"
"sync/atomic"
"time"
"github.com/tevino/abool/v2"
)
type BioSequenceBatch struct {
slice BioSequenceSlice
order int
}
var NilBioSequenceBatch = BioSequenceBatch{nil, -1}
func MakeBioSequenceBatch(order int, sequences BioSequenceSlice) BioSequenceBatch {
return BioSequenceBatch{
slice: sequences,
order: order,
}
}
func (batch BioSequenceBatch) Order() int {
return batch.order
}
func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch {
batch.order = newOrder
return batch
}
func (batch BioSequenceBatch) Slice() BioSequenceSlice {
return batch.slice
}
func (batch BioSequenceBatch) Length() int {
return len(batch.slice)
}
func (batch BioSequenceBatch) NotEmpty() bool {
return batch.slice.NotEmpty()
}
func (batch BioSequenceBatch) Pop0() *BioSequence {
return batch.slice.Pop0()
}
func (batch BioSequenceBatch) IsNil() bool {
return batch.slice == nil
}
func (batch BioSequenceBatch) Recycle() {
batch.slice.Recycle()
batch.slice = nil
}
// Structure implementing an iterator over bioseq.BioSequenceBatch
// based on a channel.
type _IBioSequenceBatch struct {
channel chan BioSequenceBatch
current BioSequenceBatch
pushBack *abool.AtomicBool
all_done *sync.WaitGroup
lock *sync.RWMutex
buffer_size int32
batch_size int32
sequence_format string
finished *abool.AtomicBool
}
type IBioSequenceBatch struct {
pointer *_IBioSequenceBatch
}
var NilIBioSequenceBatch = IBioSequenceBatch{pointer: nil}
func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch {
buffsize := int32(1)
if len(sizes) > 0 {
buffsize = int32(sizes[0])
}
i := _IBioSequenceBatch{
channel: make(chan BioSequenceBatch, buffsize),
current: NilBioSequenceBatch,
pushBack: abool.New(),
buffer_size: buffsize,
batch_size: -1,
sequence_format: "",
finished: abool.New(),
}
waiting := sync.WaitGroup{}
i.all_done = &waiting
lock := sync.RWMutex{}
i.lock = &lock
ii := IBioSequenceBatch{&i}
return ii
}
func (iterator IBioSequenceBatch) Add(n int) {
iterator.pointer.all_done.Add(n)
}
func (iterator IBioSequenceBatch) Done() {
iterator.pointer.all_done.Done()
}
func (iterator IBioSequenceBatch) Unlock() {
iterator.pointer.lock.Unlock()
}
func (iterator IBioSequenceBatch) Lock() {
iterator.pointer.lock.Lock()
}
func (iterator IBioSequenceBatch) RLock() {
iterator.pointer.lock.RLock()
}
func (iterator IBioSequenceBatch) RUnlock() {
iterator.pointer.lock.RUnlock()
}
func (iterator IBioSequenceBatch) Wait() {
iterator.pointer.all_done.Wait()
}
func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch {
return iterator.pointer.channel
}
func (iterator IBioSequenceBatch) IsNil() bool {
return iterator.pointer == nil
}
func (iterator IBioSequenceBatch) BufferSize() int {
return int(atomic.LoadInt32(&iterator.pointer.buffer_size))
}
func (iterator IBioSequenceBatch) BatchSize() int {
return int(atomic.LoadInt32(&iterator.pointer.batch_size))
}
func (iterator IBioSequenceBatch) SetBatchSize(size int) error {
if size >= 0 {
atomic.StoreInt32(&iterator.pointer.batch_size, int32(size))
return nil
}
return fmt.Errorf("size (%d) cannot be negative", size)
}
func (iterator IBioSequenceBatch) Split() IBioSequenceBatch {
iterator.pointer.lock.RLock()
defer iterator.pointer.lock.RUnlock()
i := _IBioSequenceBatch{
channel: iterator.pointer.channel,
current: NilBioSequenceBatch,
pushBack: abool.New(),
all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size,
batch_size: iterator.pointer.batch_size,
sequence_format: iterator.pointer.sequence_format,
finished: iterator.pointer.finished}
lock := sync.RWMutex{}
i.lock = &lock
newIter := IBioSequenceBatch{&i}
return newIter
}
func (iterator IBioSequenceBatch) Next() bool {
if iterator.pointer.pushBack.IsSet() {
iterator.pointer.pushBack.UnSet()
return true
}
if iterator.pointer.finished.IsSet() {
return false
}
next, ok := (<-iterator.pointer.channel)
if ok {
iterator.pointer.current = next
return true
}
iterator.pointer.current = NilBioSequenceBatch
iterator.pointer.finished.Set()
return false
}
func (iterator IBioSequenceBatch) PushBack() {
if !iterator.pointer.current.IsNil() {
iterator.pointer.pushBack.Set()
}
}
// The 'Get' method returns the instance of BioSequenceBatch
// currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling
// 'Get' to retreive the following instance.
func (iterator IBioSequenceBatch) Get() BioSequenceBatch {
return iterator.pointer.current
}
func (iterator IBioSequenceBatch) Push(batch BioSequenceBatch) {
if batch.IsNil() {
log.Panicln("An Nil batch is pushed on the channel")
}
if batch.Length() == 0 {
log.Panicln("An empty batch is pushed on the channel")
}
iterator.pointer.channel <- batch
}
func (iterator IBioSequenceBatch) Close() {
close(iterator.pointer.channel)
}
func (iterator IBioSequenceBatch) WaitAndClose() {
iterator.Wait()
for len(iterator.Channel()) > 0 {
time.Sleep(time.Millisecond)
}
iterator.Close()
}
// Finished returns 'true' value if no more data is available
// from the iterator.
func (iterator IBioSequenceBatch) Finished() bool {
return iterator.pointer.finished.IsSet()
}
func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
newIter := MakeIBioSequence(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
}()
go func() {
for iterator.Next() {
batch := iterator.Get()
for batch.NotEmpty() {
newIter.pointer.channel <- batch.Pop0()
}
batch.Recycle()
}
newIter.Done()
}()
return newIter
}
func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
newIter := MakeIBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
}()
next_to_send := 0
received := make(map[int]BioSequenceBatch)
go func() {
for iterator.Next() {
batch := iterator.Get()
if batch.order == next_to_send {
newIter.pointer.channel <- batch
next_to_send++
batch, ok := received[next_to_send]
for ok {
newIter.pointer.channel <- batch
delete(received, next_to_send)
next_to_send++
batch, ok = received[next_to_send]
}
} else {
received[batch.order] = batch
}
}
newIter.Done()
}()
return newIter
}
func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch {
if len(iterators) == 0 {
return iterator
}
buffsize := iterator.BufferSize()
newIter := MakeIBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.Channel())
}()
go func() {
previous_max := 0
max_order := 0
for iterator.Next() {
s := iterator.Get()
if s.order > max_order {
max_order = s.order
}
newIter.Push(s.Reorder(s.order + previous_max))
}
previous_max = max_order + 1
for _, iter := range iterators {
for iter.Next() {
s := iter.Get()
if (s.order + previous_max) > max_order {
max_order = s.order + previous_max
}
newIter.Push(s.Reorder(s.order + previous_max))
}
previous_max = max_order + 1
}
newIter.Done()
}()
return newIter
}
// Redistributes sequences from a IBioSequenceBatch into a new
// IBioSequenceBatch with every batches having the same size
// indicated in parameter. Rebatching implies to sort the
// source IBioSequenceBatch.
func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
newIter := MakeIBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
}()
go func() {
order := 0
iterator = iterator.SortBatches()
buffer := MakeBioSequenceSlice()
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
buffer = append(buffer, s)
if len(buffer) == size {
newIter.Push(MakeBioSequenceBatch(order, buffer))
order++
buffer = MakeBioSequenceSlice()
}
}
seqs.Recycle()
}
if len(buffer) > 0 {
newIter.Push(MakeBioSequenceBatch(order, buffer))
}
newIter.Done()
}()
return newIter
}
func (iterator IBioSequenceBatch) Recycle() {
log.Println("Start recycling of Bioseq objects")
recycled := 0
for iterator.Next() {
// iterator.Get()
batch := iterator.Get()
for _, seq := range batch.Slice() {
seq.Recycle()
recycled++
}
batch.Recycle()
}
log.Printf("End of the recycling of %d Bioseq objects", recycled)
}
func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch {
buffsize := iterator.BufferSize()
batchsize := 5000
if len(sizes) > 0 {
batchsize = sizes[0]
}
if len(sizes) > 1 {
buffsize = sizes[1]
}
iterator = iterator.Rebatch(batchsize)
reverse = reverse.Rebatch(batchsize)
newIter := MakeIPairedBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
log.Println("End of association of paired reads")
}()
log.Println("Start association of paired reads")
go func() {
for iterator.Next() {
if !reverse.Next() {
log.Panicln("Etrange reverse pas prêt")
}
newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(),
reverse.Get())
}
newIter.Done()
}()
return newIter
}
func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
trueIter := MakeIBioSequenceBatch(buffsize)
falseIter := MakeIBioSequenceBatch(buffsize)
trueIter.Add(1)
falseIter.Add(1)
go func() {
trueIter.WaitAndClose()
falseIter.WaitAndClose()
}()
go func() {
trueOrder := 0
falseOrder := 0
iterator = iterator.SortBatches()
trueSlice := MakeBioSequenceSlice()
falseSlice := MakeBioSequenceSlice()
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
if predicate(s) {
trueSlice = append(trueSlice, s)
} else {
falseSlice = append(falseSlice, s)
}
if len(trueSlice) == size {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
trueOrder++
trueSlice = MakeBioSequenceSlice()
}
if len(falseSlice) == size {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
falseOrder++
falseSlice = MakeBioSequenceSlice()
}
}
seqs.Recycle()
}
if len(trueSlice) > 0 {
trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice))
}
if len(falseSlice) > 0 {
falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice))
}
trueIter.Done()
falseIter.Done()
}()
return trueIter, falseIter
}

View File

@ -1,213 +0,0 @@
package obiseq
import (
"log"
"sync"
)
type PairedBioSequenceBatch struct {
forward BioSequenceSlice
reverse BioSequenceSlice
order int
}
var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1}
func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch {
if forward.order != reverse.order {
log.Fatalf("Forward order : %d and reverse order : %d are not matching",
forward.order, reverse.order)
}
for i := range reverse.slice {
reverse.slice[i].ReverseComplement(true)
}
return PairedBioSequenceBatch{
forward: forward.slice,
reverse: reverse.slice,
order: forward.order,
}
}
func (batch PairedBioSequenceBatch) Order() int {
return batch.order
}
func (batch PairedBioSequenceBatch) Length() int {
return len(batch.forward)
}
func (batch PairedBioSequenceBatch) Forward() BioSequenceSlice {
return batch.forward
}
func (batch PairedBioSequenceBatch) Reverse() BioSequenceSlice {
return batch.reverse
}
func (batch PairedBioSequenceBatch) IsNil() bool {
return batch.forward == nil
}
// Structure implementing an iterator over bioseq.BioSequenceBatch
// based on a channel.
type __ipairedbiosequencebatch__ struct {
channel chan PairedBioSequenceBatch
current PairedBioSequenceBatch
pushBack bool
all_done *sync.WaitGroup
buffer_size int
finished bool
p_finished *bool
}
type IPairedBioSequenceBatch struct {
pointer *__ipairedbiosequencebatch__
}
var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil}
func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch {
buffsize := 1
if len(sizes) > 0 {
buffsize = sizes[0]
}
i := __ipairedbiosequencebatch__{
channel: make(chan PairedBioSequenceBatch, buffsize),
current: NilPairedBioSequenceBatch,
pushBack: false,
buffer_size: buffsize,
finished: false,
p_finished: nil,
}
i.p_finished = &i.finished
waiting := sync.WaitGroup{}
i.all_done = &waiting
ii := IPairedBioSequenceBatch{&i}
return ii
}
func (iterator IPairedBioSequenceBatch) Add(n int) {
iterator.pointer.all_done.Add(n)
}
func (iterator IPairedBioSequenceBatch) Done() {
iterator.pointer.all_done.Done()
}
func (iterator IPairedBioSequenceBatch) Wait() {
iterator.pointer.all_done.Wait()
}
func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch {
return iterator.pointer.channel
}
func (iterator IPairedBioSequenceBatch) IsNil() bool {
return iterator.pointer == nil
}
func (iterator IPairedBioSequenceBatch) BufferSize() int {
return iterator.pointer.buffer_size
}
func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch {
i := __ipairedbiosequencebatch__{
channel: iterator.pointer.channel,
current: NilPairedBioSequenceBatch,
pushBack: false,
all_done: iterator.pointer.all_done,
buffer_size: iterator.pointer.buffer_size,
finished: false,
p_finished: iterator.pointer.p_finished}
newIter := IPairedBioSequenceBatch{&i}
return newIter
}
func (iterator IPairedBioSequenceBatch) Next() bool {
if *(iterator.pointer.p_finished) {
return false
}
if iterator.pointer.pushBack {
iterator.pointer.pushBack = false
return true
}
next, ok := (<-iterator.pointer.channel)
if ok {
iterator.pointer.current = next
return true
}
iterator.pointer.current = NilPairedBioSequenceBatch
*iterator.pointer.p_finished = true
return false
}
func (iterator IPairedBioSequenceBatch) PushBack() {
if !iterator.pointer.current.IsNil() {
iterator.pointer.pushBack = true
}
}
// The 'Get' method returns the instance of BioSequenceBatch
// currently pointed by the iterator. You have to use the
// 'Next' method to move to the next entry before calling
// 'Get' to retreive the following instance.
func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch {
return iterator.pointer.current
}
// Finished returns 'true' value if no more data is available
// from the iterator.
func (iterator IPairedBioSequenceBatch) Finished() bool {
return *iterator.pointer.p_finished
}
func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[0]
}
newIter := MakeIPairedBioSequenceBatch(buffsize)
newIter.Add(1)
go func() {
newIter.Wait()
close(newIter.pointer.channel)
}()
next_to_send := 0
received := make(map[int]PairedBioSequenceBatch)
go func() {
for iterator.Next() {
batch := iterator.Get()
if batch.order == next_to_send {
newIter.pointer.channel <- batch
next_to_send++
batch, ok := received[next_to_send]
for ok {
newIter.pointer.channel <- batch
delete(received, next_to_send)
next_to_send++
batch, ok = received[next_to_send]
}
} else {
received[batch.order] = batch
}
}
newIter.Done()
}()
return newIter
}

View File

@ -2,6 +2,9 @@ package obiseq
import (
"context"
"fmt"
"regexp"
log "github.com/sirupsen/logrus"
"github.com/PaesslerAG/gval"
@ -10,37 +13,55 @@ import (
type SequencePredicate func(*BioSequence) bool
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence *BioSequence) bool {
return predicate1(sequence) && predicate2(sequence)
switch {
case predicate1 == nil:
return predicate2
case predicate2 == nil:
return predicate1
default:
return func(sequence *BioSequence) bool {
return predicate1(sequence) && predicate2(sequence)
}
}
return f
}
func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence *BioSequence) bool {
return predicate1(sequence) || predicate2(sequence)
switch {
case predicate1 == nil:
return predicate2
case predicate2 == nil:
return predicate1
default:
return func(sequence *BioSequence) bool {
return predicate1(sequence) || predicate2(sequence)
}
}
return f
}
func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence *BioSequence) bool {
p1 := predicate1(sequence)
p2 := predicate2(sequence)
return (p1 && !p2) || (p2 && !p1)
switch {
case predicate1 == nil:
return predicate2
case predicate2 == nil:
return predicate1
default:
return func(sequence *BioSequence) bool {
p1 := predicate1(sequence)
p2 := predicate2(sequence)
return (p1 && !p2) || (p2 && !p1)
}
}
return f
}
func (predicate1 SequencePredicate) Not() SequencePredicate {
f := func(sequence *BioSequence) bool {
return !predicate1(sequence)
switch {
case predicate1 == nil:
return nil
default:
return func(sequence *BioSequence) bool {
return !predicate1(sequence)
}
}
return f
}
func HasAttribute(name string) SequencePredicate {
@ -57,9 +78,43 @@ func HasAttribute(name string) SequencePredicate {
return f
}
func MoreAbundantThan(count int) SequencePredicate {
func IsAttributeMatch(name string, pattern string) SequencePredicate {
pat, err := regexp.Compile(pattern)
if err != nil {
log.Fatalf("error in atribute %s regular pattern syntax : %v", name, err)
}
f := func(sequence *BioSequence) bool {
return sequence.Count() > count
if sequence.HasAnnotation() {
val, ok := (sequence.Annotations())[name]
if ok {
switch val := val.(type) {
case string:
return pat.MatchString(val)
default:
return pat.MatchString(fmt.Sprint(val))
}
}
}
return false
}
return f
}
func IsMoreAbundantOrEqualTo(count int) SequencePredicate {
f := func(sequence *BioSequence) bool {
return sequence.Count() >= count
}
return f
}
func IsLessAbundantOrEqualTo(count int) SequencePredicate {
f := func(sequence *BioSequence) bool {
return sequence.Count() <= count
}
return f
@ -81,7 +136,64 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
return f
}
func ExrpessionPredicat(expression string) SequencePredicate {
func IsSequenceMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile("(?i)" + pattern)
if err != nil {
log.Fatalf("error in sequence regular pattern syntax : %v", err)
}
f := func(sequence *BioSequence) bool {
return pat.Match(sequence.Sequence())
}
return f
}
func IsDefinitionMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile(pattern)
if err != nil {
log.Fatalf("error in definition regular pattern syntax : %v", err)
}
f := func(sequence *BioSequence) bool {
return pat.MatchString(sequence.Definition())
}
return f
}
func IsIdMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile(pattern)
if err != nil {
log.Fatalf("error in identifier regular pattern syntax : %v", err)
}
f := func(sequence *BioSequence) bool {
return pat.MatchString(sequence.Id())
}
return f
}
func IsIdIn(ids ...string) SequencePredicate {
idset := make(map[string]bool)
for _, v := range ids {
idset[v] = true
}
f := func(sequence *BioSequence) bool {
_, ok := idset[sequence.Id()]
return ok
}
return f
}
func ExpressionPredicat(expression string) SequencePredicate {
exp, err := gval.Full().NewEvaluable(expression)

View File

@ -1,2 +0,0 @@
package obiseq

View File

@ -2,6 +2,8 @@ package obitax
import (
"regexp"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type TaxNode struct {
@ -64,3 +66,23 @@ func (node *TaxNode) IsNameMatching(pattern *regexp.Regexp) bool {
return false
}
func (node *TaxNode) HasRankDefined(rank string) bool {
for node.rank != rank && node.parent != node.taxid {
node = node.pparent
}
return node.rank == rank
}
func HasRankDefined(taxonomy Taxonomy, rank string) obiseq.SequencePredicate {
f := func(sequence *obiseq.BioSequence) bool {
taxon, err := taxonomy.Taxon(sequence.Taxid())
return err == nil && taxon.HasRankDefined(rank)
}
return f
}

View File

@ -38,9 +38,10 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
}
options.BoolVar(&__rank_list__, "rank-list", false,
options.Alias("l"),
options.Description("List every taxonomic rank available iin the taxonomy."))
options.IntSliceVar(&__taxonomical_restriction__, "subclade-of", 1, 1,
options.Alias("s"),
options.Description("List every taxonomic rank available in the taxonomy."))
options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1,
options.Alias("r"),
options.Description("Restrict output to some subclades."))
}

View File

@ -0,0 +1,47 @@
package obigrep
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
)
func IFilterSequence(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
var newIter obiiter.IBioSequenceBatch
predicate := CLISequenceSelectionPredicate()
if predicate != nil {
if CLISaveDiscardedSequences() {
var discarded obiiter.IBioSequenceBatch
log.Printf("Discarded sequences saved in file: %s\n", CLIDiscardedFileName())
newIter, discarded = iterator.DivideOn(predicate,
obioptions.CLIBatchSize())
go func() {
_, err := obiconvert.WriteBioSequencesBatch(discarded,
true,
CLIDiscardedFileName())
if err != nil {
log.Fatalf("%v", err)
}
}()
} else {
newIter = iterator.FilterOn(predicate,
obioptions.CLIBatchSize(),
obioptions.CLIParallelWorkers(),
obioptions.CLIBufferSize(),
)
}
} else {
newIter = iterator
}
return newIter
}

View File

@ -0,0 +1,375 @@
package obigrep
import (
"log"
"strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats/ncbitaxdump"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _BelongTaxa = make([]int, 0)
var _NotBelongTaxa = make([]int, 0)
var _RequiredRanks = make([]string, 0)
var _MinimumLength = 1
var _MaximumLength = int(2e9)
var _MinimumCount = 1
var _MaximumCount = int(2e9)
var _SequencePatterns = make([]string, 0)
var _DefinitionPatterns = make([]string, 0)
var _IdPatterns = make([]string, 0)
var _Predicats = make([]string, 0)
var _IdList = ""
var _Taxdump = ""
var _Taxonomy = (*obitax.Taxonomy)(nil)
var _RequiredAttributes = make([]string, 0)
var _AttributePatterns = make(map[string]string, 0)
var _InvertMatch = false
var _SaveRejected = ""
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Taxdump, "taxdump", _Taxdump,
options.Alias("t"),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
options.Alias("r"),
options.ArgName("TAXID"),
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1,
options.Alias("i"),
options.ArgName("TAXID"),
options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid."))
options.StringSliceVar(&_RequiredRanks, "require-rank", 1, 1,
options.ArgName("RANK_NAME"),
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
}
func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
TaxonomySelectionOptionSet(options)
options.StringVar(&_SaveRejected, "save-discarded", _SaveRejected,
options.ArgName("FILENAME"),
options.Description("Indicates in which file not selected sequences must be saved."))
options.StringVar(&_IdList, "id-list", _IdList,
options.ArgName("FILENAME"),
options.Description("<FILENAME> points to a text file containing the list of sequence record identifiers to be selected."+
" The file format consists in a single identifier per line."))
options.BoolVar(&_InvertMatch, "inverse-match", _InvertMatch,
options.Alias("v"),
options.Description("Inverts the sequence record selection."))
options.IntVar(&_MinimumLength, "min-length", _MinimumLength,
options.ArgName("LENGTH"),
options.Alias("l"),
options.Description("Selects sequence records whose sequence length is equal or longer than lmin."))
options.IntVar(&_MaximumLength, "max-length", _MaximumLength,
options.ArgName("LENGTH"),
options.Alias("L"),
options.Description("Selects sequence records whose sequence length is equal or shorter than lmax."))
options.IntVar(&_MinimumCount, "min-count", _MinimumCount,
options.ArgName("COUNT"),
options.Alias("c"),
options.Description("Selects sequence records occuring at least min count time in the data set."))
options.IntVar(&_MaximumCount, "max-count", _MaximumCount,
options.ArgName("COUNT"),
options.Alias("C"),
options.Description("Selects sequence records occuring no more than max count time in the data set."))
options.StringSliceVar(&_Predicats, "predicate", 1, 1,
options.Alias("p"),
options.ArgName("EXPRESSION"),
options.Description("boolean expression to be evaluated for each sequence record. "+
"The attribute keys defined for each sequence record can be used in the "+
"expression as variable names. An extra variable named sequence refers "+
"to the sequence record itself. Several -p options can be used on the same "+
"command line and in this last case, the selected sequence records will match "+
"all constraints."))
options.StringSliceVar(&_SequencePatterns, "sequence", 1, 1,
options.Alias("s"),
options.ArgName("PATTERN"),
options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive."))
options.StringSliceVar(&_DefinitionPatterns, "definition", 1, 1,
options.Alias("D"),
options.ArgName("PATTERN"),
options.Description("Regular expression pattern to be tested against the sequence definition. The pattern is case insensitive."))
options.StringSliceVar(&_IdPatterns, "identifier", 1, 1,
options.Alias("I"),
options.ArgName("PATTERN"),
options.Description("Regular expression pattern to be tested against the sequence identifier. The pattern is case insensitive."))
options.StringSliceVar(&_RequiredAttributes, "has-attribute", 1, 1,
options.Alias("A"),
options.ArgName("KEY"),
options.Description("Selects sequence records having an attribute whose key = <KEY>."))
options.StringMapVar(&_AttributePatterns, "attribute", 1, 1,
options.Alias("a"),
options.ArgName("KEY=VALUE"),
options.Description("Regular expression pattern matched against the attributes of the sequence record. "+
"the value of this attribute is of the form : key:regular_pattern. The pattern is case sensitive. "+
"Several -a options can be used on the same command line and in this last case, the selected "+
"sequence records will match all constraints."))
}
// OptionSet adds to the basic option set every options declared for
// the obipcr command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
SequenceSelectionOptionSet(options)
}
func CLIMinSequenceLength() int {
return _MinimumLength
}
func CLIMaxSequenceLength() int {
return _MaximumLength
}
func CLISequenceSizePredicate() obiseq.SequencePredicate {
if _MinimumLength > 1 {
p := obiseq.IsLongerOrEqualTo(_MinimumLength)
if _MaximumLength != int(2e9) {
p = p.And(obiseq.IsShorterOrEqualTo(_MaximumLength))
}
return p
}
if _MaximumLength != int(2e9) {
return obiseq.IsShorterOrEqualTo(_MaximumLength)
}
return nil
}
func CLIMinSequenceCount() int {
return _MinimumCount
}
func CLIMaxSequenceCount() int {
return _MaximumCount
}
func CLISequenceCountPredicate() obiseq.SequencePredicate {
if _MinimumCount > 1 {
p := obiseq.IsMoreAbundantOrEqualTo(_MinimumCount)
if _MaximumCount != int(2e9) {
p = p.And(obiseq.IsLessAbundantOrEqualTo(_MaximumCount))
}
return p
}
if _MaximumLength != int(2e9) {
return obiseq.IsLessAbundantOrEqualTo(_MaximumCount)
}
return nil
}
func CLISelectedNCBITaxDump() string {
return _Taxdump
}
func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
if CLISelectedNCBITaxDump() != "" {
if _Taxonomy == nil {
var err error
_Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true)
if err != nil {
log.Fatalf("cannot load taxonomy %s : %v",
CLISelectedNCBITaxDump(), err)
return nil
}
}
return _Taxonomy
}
log.Fatalln("no NCBI taxdump selected using option -t|--taxdump")
return nil
}
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
if len(_BelongTaxa) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := obitax.IsSubCladeOf(*taxonomy, _BelongTaxa[0])
for _, taxid := range _BelongTaxa[1:] {
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
}
return p
}
return nil
}
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
if len(_NotBelongTaxa) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := obitax.IsSubCladeOf(*taxonomy, _NotBelongTaxa[0])
for _, taxid := range _NotBelongTaxa[1:] {
p.Or(obitax.IsSubCladeOf(*taxonomy, taxid))
}
return p.Not()
}
return nil
}
func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
if len(_RequiredRanks) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := obitax.HasRankDefined(*taxonomy, _RequiredRanks[0])
for _, rank := range _RequiredRanks[1:] {
p.And(obitax.HasRankDefined(*taxonomy, rank))
}
return p
}
return nil
}
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate())
}
func CLIPredicatesPredicate() obiseq.SequencePredicate {
if len(_Predicats) > 0 {
p := obiseq.ExpressionPredicat(_Predicats[0])
for _, expression := range _Predicats[1:] {
p.And(obiseq.ExpressionPredicat(expression))
}
return p
}
return nil
}
func CLISequencePatternPredicate() obiseq.SequencePredicate {
if len(_SequencePatterns) > 0 {
p := obiseq.IsSequenceMatch(_SequencePatterns[0])
for _, pattern := range _SequencePatterns[1:] {
p.And(obiseq.IsSequenceMatch(pattern))
}
return p
}
return nil
}
func CLIDefinitionPatternPredicate() obiseq.SequencePredicate {
if len(_DefinitionPatterns) > 0 {
p := obiseq.IsDefinitionMatch(_DefinitionPatterns[0])
for _, pattern := range _DefinitionPatterns[1:] {
p.And(obiseq.IsDefinitionMatch(pattern))
}
return p
}
return nil
}
func CLIIdPatternPredicate() obiseq.SequencePredicate {
if len(_IdPatterns) > 0 {
p := obiseq.IsIdMatch(_IdPatterns[0])
for _, pattern := range _IdPatterns[1:] {
p.And(obiseq.IsIdMatch(pattern))
}
return p
}
return nil
}
func CLIIdListPredicate() obiseq.SequencePredicate {
if _IdList != "" {
ids, err := goutils.ReadLines(_IdList)
if err != nil {
log.Fatalf("cannot read the id file %s : %v", _IdList, err)
}
for i, v := range ids {
ids[i] = strings.TrimSpace(v)
}
p := obiseq.IsIdIn(ids...)
return p
}
return nil
}
func CLISequenceSelectionPredicate() obiseq.SequencePredicate {
p := CLISequenceSizePredicate()
p = p.And(CLISequenceCountPredicate())
p = p.And(CLITaxonomyFilterPredicate())
p = p.And(CLIPredicatesPredicate())
p = p.And(CLISequencePatternPredicate())
p = p.And(CLIDefinitionPatternPredicate())
p = p.And(CLIIdPatternPredicate())
p = p.And(CLIIdListPredicate())
if _InvertMatch {
p = p.Not()
}
return p
}
func CLISaveDiscardedSequences() bool {
return _SaveRejected != ""
}
func CLIDiscardedFileName() string {
return _SaveRejected
}