diff --git a/cmd/obitools/obigrep/main.go b/cmd/obitools/obigrep/main.go new file mode 100644 index 0000000..46e4557 --- /dev/null +++ b/cmd/obitools/obigrep/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "os" + "runtime/pprof" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + f, err := os.Create("cpu.pprof") + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obigrep.OptionSet) + + _, args, _ := optionParser(os.Args) + + sequences, _ := obiconvert.ReadBioSequencesBatch(args...) + selected := obigrep.IFilterSequence(sequences) + obiconvert.WriteBioSequencesBatch(selected, true) +} diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index cadbed3..097b70d 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -1,8 +1,11 @@ package goutils import ( + "bufio" "bytes" "encoding/gob" + "io" + "os" ) // NotAnInteger defines a new type of Error : "NotAnInteger" @@ -73,3 +76,33 @@ func CopyMap(dest, src map[string]interface{}) { gob.NewEncoder(buf).Encode(src) gob.NewDecoder(buf).Decode(&dest) } + +// Read a whole file into the memory and store it as array of lines +func ReadLines(path string) (lines []string, err error) { + var ( + file *os.File + part []byte + prefix bool + ) + if file, err = os.Open(path); err != nil { + return + } + defer file.Close() + + reader := bufio.NewReader(file) + buffer := bytes.NewBuffer(make([]byte, 0)) + for { + if part, prefix, err = reader.ReadLine(); err != nil { + break + } + buffer.Write(part) + if !prefix { + lines = append(lines, buffer.String()) + buffer.Reset() + } + } + if err == io.EOF { + err = nil + } + return +} diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index aa913cf..da67db4 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -559,3 +559,56 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate, return trueIter, falseIter } + +func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate, + size int, sizes ...int) IBioSequenceBatch { + buffsize := iterator.BufferSize() + nworkers := 4 + + if len(sizes) > 0 { + nworkers = sizes[0] + } + + if len(sizes) > 1 { + buffsize = sizes[1] + } + + trueIter := MakeIBioSequenceBatch(buffsize) + + trueIter.Add(nworkers) + + go func() { + trueIter.WaitAndClose() + }() + + ff := func(iterator IBioSequenceBatch) { + iterator = iterator.SortBatches() + + for iterator.Next() { + seqs := iterator.Get() + slice := seqs.slice + j := 0 + for _, s := range slice { + if predicate(s) { + slice[j] = s + j++ + } else { + s.Recycle() + } + } + + seqs.slice = slice[:j] + trueIter.Push(seqs) + } + + trueIter.Done() + } + + for i := 1; i < nworkers; i++ { + go ff(iterator.Split()) + } + + go ff(iterator) + + return trueIter.Rebatch(size) +} diff --git a/pkg/obiseq/batchiterator.go.old b/pkg/obiseq/batchiterator.go.old deleted file mode 100644 index 1be3a50..0000000 --- a/pkg/obiseq/batchiterator.go.old +++ /dev/null @@ -1,533 +0,0 @@ -package obiseq - -import ( - "fmt" - "log" - "sync" - "sync/atomic" - "time" - - "github.com/tevino/abool/v2" -) - -type BioSequenceBatch struct { - slice BioSequenceSlice - order int -} - -var NilBioSequenceBatch = BioSequenceBatch{nil, -1} - -func MakeBioSequenceBatch(order int, sequences BioSequenceSlice) BioSequenceBatch { - return BioSequenceBatch{ - slice: sequences, - order: order, - } -} - -func (batch BioSequenceBatch) Order() int { - return batch.order -} - -func (batch BioSequenceBatch) Reorder(newOrder int) BioSequenceBatch { - batch.order = newOrder - return batch -} - -func (batch BioSequenceBatch) Slice() BioSequenceSlice { - return batch.slice -} - -func (batch BioSequenceBatch) Length() int { - return len(batch.slice) -} - -func (batch BioSequenceBatch) NotEmpty() bool { - return batch.slice.NotEmpty() -} - -func (batch BioSequenceBatch) Pop0() *BioSequence { - return batch.slice.Pop0() -} - -func (batch BioSequenceBatch) IsNil() bool { - return batch.slice == nil -} - -func (batch BioSequenceBatch) Recycle() { - batch.slice.Recycle() - batch.slice = nil -} - -// Structure implementing an iterator over bioseq.BioSequenceBatch -// based on a channel. -type _IBioSequenceBatch struct { - channel chan BioSequenceBatch - current BioSequenceBatch - pushBack *abool.AtomicBool - all_done *sync.WaitGroup - lock *sync.RWMutex - buffer_size int32 - batch_size int32 - sequence_format string - finished *abool.AtomicBool -} - -type IBioSequenceBatch struct { - pointer *_IBioSequenceBatch -} - -var NilIBioSequenceBatch = IBioSequenceBatch{pointer: nil} - -func MakeIBioSequenceBatch(sizes ...int) IBioSequenceBatch { - buffsize := int32(1) - - if len(sizes) > 0 { - buffsize = int32(sizes[0]) - } - - i := _IBioSequenceBatch{ - channel: make(chan BioSequenceBatch, buffsize), - current: NilBioSequenceBatch, - pushBack: abool.New(), - buffer_size: buffsize, - batch_size: -1, - sequence_format: "", - finished: abool.New(), - } - - waiting := sync.WaitGroup{} - i.all_done = &waiting - lock := sync.RWMutex{} - i.lock = &lock - ii := IBioSequenceBatch{&i} - return ii -} - -func (iterator IBioSequenceBatch) Add(n int) { - iterator.pointer.all_done.Add(n) -} - -func (iterator IBioSequenceBatch) Done() { - iterator.pointer.all_done.Done() -} - -func (iterator IBioSequenceBatch) Unlock() { - iterator.pointer.lock.Unlock() -} - -func (iterator IBioSequenceBatch) Lock() { - iterator.pointer.lock.Lock() -} - -func (iterator IBioSequenceBatch) RLock() { - iterator.pointer.lock.RLock() -} - -func (iterator IBioSequenceBatch) RUnlock() { - iterator.pointer.lock.RUnlock() -} - -func (iterator IBioSequenceBatch) Wait() { - iterator.pointer.all_done.Wait() -} - -func (iterator IBioSequenceBatch) Channel() chan BioSequenceBatch { - return iterator.pointer.channel -} - -func (iterator IBioSequenceBatch) IsNil() bool { - return iterator.pointer == nil -} - -func (iterator IBioSequenceBatch) BufferSize() int { - return int(atomic.LoadInt32(&iterator.pointer.buffer_size)) -} - -func (iterator IBioSequenceBatch) BatchSize() int { - return int(atomic.LoadInt32(&iterator.pointer.batch_size)) -} - -func (iterator IBioSequenceBatch) SetBatchSize(size int) error { - if size >= 0 { - atomic.StoreInt32(&iterator.pointer.batch_size, int32(size)) - return nil - } - - return fmt.Errorf("size (%d) cannot be negative", size) -} - -func (iterator IBioSequenceBatch) Split() IBioSequenceBatch { - iterator.pointer.lock.RLock() - defer iterator.pointer.lock.RUnlock() - i := _IBioSequenceBatch{ - channel: iterator.pointer.channel, - current: NilBioSequenceBatch, - pushBack: abool.New(), - all_done: iterator.pointer.all_done, - buffer_size: iterator.pointer.buffer_size, - batch_size: iterator.pointer.batch_size, - sequence_format: iterator.pointer.sequence_format, - finished: iterator.pointer.finished} - lock := sync.RWMutex{} - i.lock = &lock - - newIter := IBioSequenceBatch{&i} - return newIter -} - -func (iterator IBioSequenceBatch) Next() bool { - if iterator.pointer.pushBack.IsSet() { - iterator.pointer.pushBack.UnSet() - return true - } - - if iterator.pointer.finished.IsSet() { - return false - } - - next, ok := (<-iterator.pointer.channel) - - if ok { - iterator.pointer.current = next - return true - } - - iterator.pointer.current = NilBioSequenceBatch - iterator.pointer.finished.Set() - return false -} - -func (iterator IBioSequenceBatch) PushBack() { - if !iterator.pointer.current.IsNil() { - iterator.pointer.pushBack.Set() - } -} - -// The 'Get' method returns the instance of BioSequenceBatch -// currently pointed by the iterator. You have to use the -// 'Next' method to move to the next entry before calling -// 'Get' to retreive the following instance. -func (iterator IBioSequenceBatch) Get() BioSequenceBatch { - return iterator.pointer.current -} - -func (iterator IBioSequenceBatch) Push(batch BioSequenceBatch) { - if batch.IsNil() { - log.Panicln("An Nil batch is pushed on the channel") - } - if batch.Length() == 0 { - log.Panicln("An empty batch is pushed on the channel") - } - - iterator.pointer.channel <- batch -} - -func (iterator IBioSequenceBatch) Close() { - close(iterator.pointer.channel) -} - -func (iterator IBioSequenceBatch) WaitAndClose() { - iterator.Wait() - - for len(iterator.Channel()) > 0 { - time.Sleep(time.Millisecond) - } - iterator.Close() -} - -// Finished returns 'true' value if no more data is available -// from the iterator. -func (iterator IBioSequenceBatch) Finished() bool { - return iterator.pointer.finished.IsSet() -} - -func (iterator IBioSequenceBatch) IBioSequence(sizes ...int) IBioSequence { - buffsize := iterator.BufferSize() - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequence(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.pointer.channel) - }() - - go func() { - for iterator.Next() { - batch := iterator.Get() - - for batch.NotEmpty() { - newIter.pointer.channel <- batch.Pop0() - } - batch.Recycle() - } - newIter.Done() - }() - - return newIter -} - -func (iterator IBioSequenceBatch) SortBatches(sizes ...int) IBioSequenceBatch { - buffsize := iterator.BufferSize() - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequenceBatch(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.pointer.channel) - }() - - next_to_send := 0 - received := make(map[int]BioSequenceBatch) - go func() { - for iterator.Next() { - batch := iterator.Get() - if batch.order == next_to_send { - newIter.pointer.channel <- batch - next_to_send++ - batch, ok := received[next_to_send] - for ok { - newIter.pointer.channel <- batch - delete(received, next_to_send) - next_to_send++ - batch, ok = received[next_to_send] - } - } else { - received[batch.order] = batch - } - } - newIter.Done() - }() - - return newIter - -} - -func (iterator IBioSequenceBatch) Concat(iterators ...IBioSequenceBatch) IBioSequenceBatch { - - if len(iterators) == 0 { - return iterator - } - - buffsize := iterator.BufferSize() - newIter := MakeIBioSequenceBatch(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.Channel()) - }() - - go func() { - previous_max := 0 - max_order := 0 - - for iterator.Next() { - s := iterator.Get() - if s.order > max_order { - max_order = s.order - } - newIter.Push(s.Reorder(s.order + previous_max)) - } - - previous_max = max_order + 1 - for _, iter := range iterators { - for iter.Next() { - s := iter.Get() - if (s.order + previous_max) > max_order { - max_order = s.order + previous_max - } - - newIter.Push(s.Reorder(s.order + previous_max)) - } - previous_max = max_order + 1 - } - newIter.Done() - }() - - return newIter -} - -// Redistributes sequences from a IBioSequenceBatch into a new -// IBioSequenceBatch with every batches having the same size -// indicated in parameter. Rebatching implies to sort the -// source IBioSequenceBatch. -func (iterator IBioSequenceBatch) Rebatch(size int, sizes ...int) IBioSequenceBatch { - buffsize := iterator.BufferSize() - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIBioSequenceBatch(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.pointer.channel) - }() - - go func() { - order := 0 - iterator = iterator.SortBatches() - buffer := MakeBioSequenceSlice() - - for iterator.Next() { - seqs := iterator.Get() - for _, s := range seqs.slice { - buffer = append(buffer, s) - if len(buffer) == size { - newIter.Push(MakeBioSequenceBatch(order, buffer)) - order++ - buffer = MakeBioSequenceSlice() - } - } - seqs.Recycle() - } - - if len(buffer) > 0 { - newIter.Push(MakeBioSequenceBatch(order, buffer)) - } - - newIter.Done() - - }() - - return newIter -} - -func (iterator IBioSequenceBatch) Recycle() { - - log.Println("Start recycling of Bioseq objects") - recycled := 0 - for iterator.Next() { - // iterator.Get() - batch := iterator.Get() - for _, seq := range batch.Slice() { - seq.Recycle() - recycled++ - } - batch.Recycle() - } - log.Printf("End of the recycling of %d Bioseq objects", recycled) -} - -func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...int) IPairedBioSequenceBatch { - buffsize := iterator.BufferSize() - batchsize := 5000 - - if len(sizes) > 0 { - batchsize = sizes[0] - } - - if len(sizes) > 1 { - buffsize = sizes[1] - } - - iterator = iterator.Rebatch(batchsize) - reverse = reverse.Rebatch(batchsize) - - newIter := MakeIPairedBioSequenceBatch(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.pointer.channel) - log.Println("End of association of paired reads") - }() - - log.Println("Start association of paired reads") - go func() { - for iterator.Next() { - if !reverse.Next() { - log.Panicln("Etrange reverse pas prêt") - } - newIter.Channel() <- MakePairedBioSequenceBatch(iterator.Get(), - reverse.Get()) - } - - newIter.Done() - }() - - return newIter -} - -func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate, - size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) { - buffsize := iterator.BufferSize() - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - trueIter := MakeIBioSequenceBatch(buffsize) - falseIter := MakeIBioSequenceBatch(buffsize) - - trueIter.Add(1) - falseIter.Add(1) - - go func() { - trueIter.WaitAndClose() - falseIter.WaitAndClose() - }() - - go func() { - trueOrder := 0 - falseOrder := 0 - iterator = iterator.SortBatches() - - trueSlice := MakeBioSequenceSlice() - falseSlice := MakeBioSequenceSlice() - - for iterator.Next() { - seqs := iterator.Get() - for _, s := range seqs.slice { - if predicate(s) { - trueSlice = append(trueSlice, s) - } else { - falseSlice = append(falseSlice, s) - } - - if len(trueSlice) == size { - trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice)) - trueOrder++ - trueSlice = MakeBioSequenceSlice() - } - - if len(falseSlice) == size { - falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice)) - falseOrder++ - falseSlice = MakeBioSequenceSlice() - } - } - seqs.Recycle() - } - - if len(trueSlice) > 0 { - trueIter.Push(MakeBioSequenceBatch(trueOrder, trueSlice)) - } - - if len(falseSlice) > 0 { - falseIter.Push(MakeBioSequenceBatch(falseOrder, falseSlice)) - } - - trueIter.Done() - falseIter.Done() - }() - - return trueIter, falseIter -} diff --git a/pkg/obiseq/pairedbatchiterator.go.old b/pkg/obiseq/pairedbatchiterator.go.old deleted file mode 100644 index 973492a..0000000 --- a/pkg/obiseq/pairedbatchiterator.go.old +++ /dev/null @@ -1,213 +0,0 @@ -package obiseq - -import ( - "log" - "sync" -) - -type PairedBioSequenceBatch struct { - forward BioSequenceSlice - reverse BioSequenceSlice - order int -} - -var NilPairedBioSequenceBatch = PairedBioSequenceBatch{nil, nil, -1} - -func MakePairedBioSequenceBatch(forward, reverse BioSequenceBatch) PairedBioSequenceBatch { - if forward.order != reverse.order { - log.Fatalf("Forward order : %d and reverse order : %d are not matching", - forward.order, reverse.order) - } - - for i := range reverse.slice { - reverse.slice[i].ReverseComplement(true) - } - - return PairedBioSequenceBatch{ - forward: forward.slice, - reverse: reverse.slice, - order: forward.order, - } -} - -func (batch PairedBioSequenceBatch) Order() int { - return batch.order -} - -func (batch PairedBioSequenceBatch) Length() int { - return len(batch.forward) -} - -func (batch PairedBioSequenceBatch) Forward() BioSequenceSlice { - return batch.forward -} - -func (batch PairedBioSequenceBatch) Reverse() BioSequenceSlice { - return batch.reverse -} - -func (batch PairedBioSequenceBatch) IsNil() bool { - return batch.forward == nil -} - -// Structure implementing an iterator over bioseq.BioSequenceBatch -// based on a channel. -type __ipairedbiosequencebatch__ struct { - channel chan PairedBioSequenceBatch - current PairedBioSequenceBatch - pushBack bool - all_done *sync.WaitGroup - buffer_size int - finished bool - p_finished *bool -} - -type IPairedBioSequenceBatch struct { - pointer *__ipairedbiosequencebatch__ -} - -var NilIPairedBioSequenceBatch = IPairedBioSequenceBatch{pointer: nil} - -func MakeIPairedBioSequenceBatch(sizes ...int) IPairedBioSequenceBatch { - buffsize := 1 - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - i := __ipairedbiosequencebatch__{ - channel: make(chan PairedBioSequenceBatch, buffsize), - current: NilPairedBioSequenceBatch, - pushBack: false, - buffer_size: buffsize, - finished: false, - p_finished: nil, - } - - i.p_finished = &i.finished - waiting := sync.WaitGroup{} - i.all_done = &waiting - ii := IPairedBioSequenceBatch{&i} - return ii -} - -func (iterator IPairedBioSequenceBatch) Add(n int) { - iterator.pointer.all_done.Add(n) -} - -func (iterator IPairedBioSequenceBatch) Done() { - iterator.pointer.all_done.Done() -} - -func (iterator IPairedBioSequenceBatch) Wait() { - iterator.pointer.all_done.Wait() -} - -func (iterator IPairedBioSequenceBatch) Channel() chan PairedBioSequenceBatch { - return iterator.pointer.channel -} - -func (iterator IPairedBioSequenceBatch) IsNil() bool { - return iterator.pointer == nil -} - -func (iterator IPairedBioSequenceBatch) BufferSize() int { - return iterator.pointer.buffer_size -} - -func (iterator IPairedBioSequenceBatch) Split() IPairedBioSequenceBatch { - i := __ipairedbiosequencebatch__{ - channel: iterator.pointer.channel, - current: NilPairedBioSequenceBatch, - pushBack: false, - all_done: iterator.pointer.all_done, - buffer_size: iterator.pointer.buffer_size, - finished: false, - p_finished: iterator.pointer.p_finished} - newIter := IPairedBioSequenceBatch{&i} - return newIter -} - -func (iterator IPairedBioSequenceBatch) Next() bool { - if *(iterator.pointer.p_finished) { - return false - } - - if iterator.pointer.pushBack { - iterator.pointer.pushBack = false - return true - } - - next, ok := (<-iterator.pointer.channel) - - if ok { - iterator.pointer.current = next - return true - } - - iterator.pointer.current = NilPairedBioSequenceBatch - *iterator.pointer.p_finished = true - return false -} - -func (iterator IPairedBioSequenceBatch) PushBack() { - if !iterator.pointer.current.IsNil() { - iterator.pointer.pushBack = true - } -} - -// The 'Get' method returns the instance of BioSequenceBatch -// currently pointed by the iterator. You have to use the -// 'Next' method to move to the next entry before calling -// 'Get' to retreive the following instance. -func (iterator IPairedBioSequenceBatch) Get() PairedBioSequenceBatch { - return iterator.pointer.current -} - -// Finished returns 'true' value if no more data is available -// from the iterator. -func (iterator IPairedBioSequenceBatch) Finished() bool { - return *iterator.pointer.p_finished -} - -func (iterator IPairedBioSequenceBatch) SortBatches(sizes ...int) IPairedBioSequenceBatch { - buffsize := iterator.BufferSize() - - if len(sizes) > 0 { - buffsize = sizes[0] - } - - newIter := MakeIPairedBioSequenceBatch(buffsize) - - newIter.Add(1) - - go func() { - newIter.Wait() - close(newIter.pointer.channel) - }() - - next_to_send := 0 - received := make(map[int]PairedBioSequenceBatch) - go func() { - for iterator.Next() { - batch := iterator.Get() - if batch.order == next_to_send { - newIter.pointer.channel <- batch - next_to_send++ - batch, ok := received[next_to_send] - for ok { - newIter.pointer.channel <- batch - delete(received, next_to_send) - next_to_send++ - batch, ok = received[next_to_send] - } - } else { - received[batch.order] = batch - } - } - newIter.Done() - }() - - return newIter - -} diff --git a/pkg/obiseq/predicate.go b/pkg/obiseq/predicate.go index 3b786ca..fc2adce 100644 --- a/pkg/obiseq/predicate.go +++ b/pkg/obiseq/predicate.go @@ -2,6 +2,9 @@ package obiseq import ( "context" + "fmt" + "regexp" + log "github.com/sirupsen/logrus" "github.com/PaesslerAG/gval" @@ -10,37 +13,55 @@ import ( type SequencePredicate func(*BioSequence) bool func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate { - f := func(sequence *BioSequence) bool { - return predicate1(sequence) && predicate2(sequence) + switch { + case predicate1 == nil: + return predicate2 + case predicate2 == nil: + return predicate1 + default: + return func(sequence *BioSequence) bool { + return predicate1(sequence) && predicate2(sequence) + } } - - return f } func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePredicate { - f := func(sequence *BioSequence) bool { - return predicate1(sequence) || predicate2(sequence) + switch { + case predicate1 == nil: + return predicate2 + case predicate2 == nil: + return predicate1 + default: + return func(sequence *BioSequence) bool { + return predicate1(sequence) || predicate2(sequence) + } } - - return f } func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePredicate { - f := func(sequence *BioSequence) bool { - p1 := predicate1(sequence) - p2 := predicate2(sequence) - return (p1 && !p2) || (p2 && !p1) + switch { + case predicate1 == nil: + return predicate2 + case predicate2 == nil: + return predicate1 + default: + return func(sequence *BioSequence) bool { + p1 := predicate1(sequence) + p2 := predicate2(sequence) + return (p1 && !p2) || (p2 && !p1) + } } - - return f } func (predicate1 SequencePredicate) Not() SequencePredicate { - f := func(sequence *BioSequence) bool { - return !predicate1(sequence) + switch { + case predicate1 == nil: + return nil + default: + return func(sequence *BioSequence) bool { + return !predicate1(sequence) + } } - - return f } func HasAttribute(name string) SequencePredicate { @@ -57,9 +78,43 @@ func HasAttribute(name string) SequencePredicate { return f } -func MoreAbundantThan(count int) SequencePredicate { +func IsAttributeMatch(name string, pattern string) SequencePredicate { + pat, err := regexp.Compile(pattern) + + if err != nil { + log.Fatalf("error in atribute %s regular pattern syntax : %v", name, err) + } + f := func(sequence *BioSequence) bool { - return sequence.Count() > count + if sequence.HasAnnotation() { + val, ok := (sequence.Annotations())[name] + if ok { + switch val := val.(type) { + case string: + return pat.MatchString(val) + default: + return pat.MatchString(fmt.Sprint(val)) + } + } + } + + return false + } + + return f +} + +func IsMoreAbundantOrEqualTo(count int) SequencePredicate { + f := func(sequence *BioSequence) bool { + return sequence.Count() >= count + } + + return f +} + +func IsLessAbundantOrEqualTo(count int) SequencePredicate { + f := func(sequence *BioSequence) bool { + return sequence.Count() <= count } return f @@ -81,7 +136,64 @@ func IsShorterOrEqualTo(length int) SequencePredicate { return f } -func ExrpessionPredicat(expression string) SequencePredicate { +func IsSequenceMatch(pattern string) SequencePredicate { + pat, err := regexp.Compile("(?i)" + pattern) + + if err != nil { + log.Fatalf("error in sequence regular pattern syntax : %v", err) + } + + f := func(sequence *BioSequence) bool { + return pat.Match(sequence.Sequence()) + } + + return f +} + +func IsDefinitionMatch(pattern string) SequencePredicate { + pat, err := regexp.Compile(pattern) + + if err != nil { + log.Fatalf("error in definition regular pattern syntax : %v", err) + } + + f := func(sequence *BioSequence) bool { + return pat.MatchString(sequence.Definition()) + } + + return f +} + +func IsIdMatch(pattern string) SequencePredicate { + pat, err := regexp.Compile(pattern) + + if err != nil { + log.Fatalf("error in identifier regular pattern syntax : %v", err) + } + + f := func(sequence *BioSequence) bool { + return pat.MatchString(sequence.Id()) + } + + return f +} + +func IsIdIn(ids ...string) SequencePredicate { + idset := make(map[string]bool) + + for _, v := range ids { + idset[v] = true + } + + f := func(sequence *BioSequence) bool { + _, ok := idset[sequence.Id()] + return ok + } + + return f +} + +func ExpressionPredicat(expression string) SequencePredicate { exp, err := gval.Full().NewEvaluable(expression) diff --git a/pkg/obiseq/worker.go b/pkg/obiseq/worker.go deleted file mode 100644 index cff335f..0000000 --- a/pkg/obiseq/worker.go +++ /dev/null @@ -1,2 +0,0 @@ -package obiseq - diff --git a/pkg/obitax/taxon.go b/pkg/obitax/taxon.go index 7f6af81..bf41ec2 100644 --- a/pkg/obitax/taxon.go +++ b/pkg/obitax/taxon.go @@ -2,6 +2,8 @@ package obitax import ( "regexp" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) type TaxNode struct { @@ -64,3 +66,23 @@ func (node *TaxNode) IsNameMatching(pattern *regexp.Regexp) bool { return false } + +func (node *TaxNode) HasRankDefined(rank string) bool { + + for node.rank != rank && node.parent != node.taxid { + node = node.pparent + } + + return node.rank == rank + +} + +func HasRankDefined(taxonomy Taxonomy, rank string) obiseq.SequencePredicate { + + f := func(sequence *obiseq.BioSequence) bool { + taxon, err := taxonomy.Taxon(sequence.Taxid()) + return err == nil && taxon.HasRankDefined(rank) + } + + return f +} diff --git a/pkg/obitools/obifind/options.go b/pkg/obitools/obifind/options.go index 6a114be..6976af8 100644 --- a/pkg/obitools/obifind/options.go +++ b/pkg/obitools/obifind/options.go @@ -38,9 +38,10 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo } options.BoolVar(&__rank_list__, "rank-list", false, options.Alias("l"), - options.Description("List every taxonomic rank available iin the taxonomy.")) - options.IntSliceVar(&__taxonomical_restriction__, "subclade-of", 1, 1, - options.Alias("s"), + options.Description("List every taxonomic rank available in the taxonomy.")) + + options.IntSliceVar(&__taxonomical_restriction__, "restrict-to-taxon", 1, 1, + options.Alias("r"), options.Description("Restrict output to some subclades.")) } diff --git a/pkg/obitools/obigrep/grep.go b/pkg/obitools/obigrep/grep.go new file mode 100644 index 0000000..a406189 --- /dev/null +++ b/pkg/obitools/obigrep/grep.go @@ -0,0 +1,47 @@ +package obigrep + +import ( + "log" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" +) + +func IFilterSequence(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { + var newIter obiiter.IBioSequenceBatch + + predicate := CLISequenceSelectionPredicate() + + if predicate != nil { + if CLISaveDiscardedSequences() { + var discarded obiiter.IBioSequenceBatch + + log.Printf("Discarded sequences saved in file: %s\n", CLIDiscardedFileName()) + newIter, discarded = iterator.DivideOn(predicate, + obioptions.CLIBatchSize()) + + go func() { + _, err := obiconvert.WriteBioSequencesBatch(discarded, + true, + CLIDiscardedFileName()) + + if err != nil { + log.Fatalf("%v", err) + } + }() + + } else { + newIter = iterator.FilterOn(predicate, + obioptions.CLIBatchSize(), + obioptions.CLIParallelWorkers(), + obioptions.CLIBufferSize(), + ) + } + } else { + newIter = iterator + } + + return newIter + +} diff --git a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go new file mode 100644 index 0000000..bde3f2a --- /dev/null +++ b/pkg/obitools/obigrep/options.go @@ -0,0 +1,375 @@ +package obigrep + +import ( + "log" + "strings" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats/ncbitaxdump" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _BelongTaxa = make([]int, 0) +var _NotBelongTaxa = make([]int, 0) +var _RequiredRanks = make([]string, 0) + +var _MinimumLength = 1 +var _MaximumLength = int(2e9) + +var _MinimumCount = 1 +var _MaximumCount = int(2e9) + +var _SequencePatterns = make([]string, 0) +var _DefinitionPatterns = make([]string, 0) +var _IdPatterns = make([]string, 0) + +var _Predicats = make([]string, 0) + +var _IdList = "" + +var _Taxdump = "" +var _Taxonomy = (*obitax.Taxonomy)(nil) +var _RequiredAttributes = make([]string, 0) +var _AttributePatterns = make(map[string]string, 0) + +var _InvertMatch = false +var _SaveRejected = "" + +func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { + + options.StringVar(&_Taxdump, "taxdump", _Taxdump, + options.Alias("t"), + options.Description("Points to the directory containing the NCBI Taxonomy database dump.")) + + options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1, + options.Alias("r"), + options.ArgName("TAXID"), + options.Description("Require that the actual taxon of the sequence belongs the provided taxid.")) + + options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1, + options.Alias("i"), + options.ArgName("TAXID"), + options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid.")) + + options.StringSliceVar(&_RequiredRanks, "require-rank", 1, 1, + options.ArgName("RANK_NAME"), + options.Description("Select sequences belonging a taxon with a rank ")) + +} + +func SequenceSelectionOptionSet(options *getoptions.GetOpt) { + TaxonomySelectionOptionSet(options) + options.StringVar(&_SaveRejected, "save-discarded", _SaveRejected, + options.ArgName("FILENAME"), + options.Description("Indicates in which file not selected sequences must be saved.")) + + options.StringVar(&_IdList, "id-list", _IdList, + options.ArgName("FILENAME"), + options.Description(" points to a text file containing the list of sequence record identifiers to be selected."+ + " The file format consists in a single identifier per line.")) + + options.BoolVar(&_InvertMatch, "inverse-match", _InvertMatch, + options.Alias("v"), + options.Description("Inverts the sequence record selection.")) + + options.IntVar(&_MinimumLength, "min-length", _MinimumLength, + options.ArgName("LENGTH"), + options.Alias("l"), + options.Description("Selects sequence records whose sequence length is equal or longer than lmin.")) + + options.IntVar(&_MaximumLength, "max-length", _MaximumLength, + options.ArgName("LENGTH"), + options.Alias("L"), + options.Description("Selects sequence records whose sequence length is equal or shorter than lmax.")) + + options.IntVar(&_MinimumCount, "min-count", _MinimumCount, + options.ArgName("COUNT"), + options.Alias("c"), + options.Description("Selects sequence records occuring at least min count time in the data set.")) + + options.IntVar(&_MaximumCount, "max-count", _MaximumCount, + options.ArgName("COUNT"), + options.Alias("C"), + options.Description("Selects sequence records occuring no more than max count time in the data set.")) + + options.StringSliceVar(&_Predicats, "predicate", 1, 1, + options.Alias("p"), + options.ArgName("EXPRESSION"), + options.Description("boolean expression to be evaluated for each sequence record. "+ + "The attribute keys defined for each sequence record can be used in the "+ + "expression as variable names. An extra variable named ‘sequence’ refers "+ + "to the sequence record itself. Several -p options can be used on the same "+ + "command line and in this last case, the selected sequence records will match "+ + "all constraints.")) + + options.StringSliceVar(&_SequencePatterns, "sequence", 1, 1, + options.Alias("s"), + options.ArgName("PATTERN"), + options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive.")) + + options.StringSliceVar(&_DefinitionPatterns, "definition", 1, 1, + options.Alias("D"), + options.ArgName("PATTERN"), + options.Description("Regular expression pattern to be tested against the sequence definition. The pattern is case insensitive.")) + + options.StringSliceVar(&_IdPatterns, "identifier", 1, 1, + options.Alias("I"), + options.ArgName("PATTERN"), + options.Description("Regular expression pattern to be tested against the sequence identifier. The pattern is case insensitive.")) + + options.StringSliceVar(&_RequiredAttributes, "has-attribute", 1, 1, + options.Alias("A"), + options.ArgName("KEY"), + options.Description("Selects sequence records having an attribute whose key = .")) + + options.StringMapVar(&_AttributePatterns, "attribute", 1, 1, + options.Alias("a"), + options.ArgName("KEY=VALUE"), + options.Description("Regular expression pattern matched against the attributes of the sequence record. "+ + "the value of this attribute is of the form : key:regular_pattern. The pattern is case sensitive. "+ + "Several -a options can be used on the same command line and in this last case, the selected "+ + "sequence records will match all constraints.")) + +} + +// OptionSet adds to the basic option set every options declared for +// the obipcr command +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + SequenceSelectionOptionSet(options) +} + +func CLIMinSequenceLength() int { + return _MinimumLength +} + +func CLIMaxSequenceLength() int { + return _MaximumLength +} + +func CLISequenceSizePredicate() obiseq.SequencePredicate { + if _MinimumLength > 1 { + p := obiseq.IsLongerOrEqualTo(_MinimumLength) + + if _MaximumLength != int(2e9) { + p = p.And(obiseq.IsShorterOrEqualTo(_MaximumLength)) + } + + return p + } + + if _MaximumLength != int(2e9) { + return obiseq.IsShorterOrEqualTo(_MaximumLength) + } + + return nil +} + +func CLIMinSequenceCount() int { + return _MinimumCount +} + +func CLIMaxSequenceCount() int { + return _MaximumCount +} + +func CLISequenceCountPredicate() obiseq.SequencePredicate { + if _MinimumCount > 1 { + p := obiseq.IsMoreAbundantOrEqualTo(_MinimumCount) + + if _MaximumCount != int(2e9) { + p = p.And(obiseq.IsLessAbundantOrEqualTo(_MaximumCount)) + } + + return p + } + + if _MaximumLength != int(2e9) { + return obiseq.IsLessAbundantOrEqualTo(_MaximumCount) + } + + return nil +} + +func CLISelectedNCBITaxDump() string { + return _Taxdump +} + +func CLILoadSelectedTaxonomy() *obitax.Taxonomy { + if CLISelectedNCBITaxDump() != "" { + if _Taxonomy == nil { + var err error + _Taxonomy, err = ncbitaxdump.LoadNCBITaxDump(CLISelectedNCBITaxDump(), true) + if err != nil { + log.Fatalf("cannot load taxonomy %s : %v", + CLISelectedNCBITaxDump(), err) + return nil + } + } + return _Taxonomy + } + + log.Fatalln("no NCBI taxdump selected using option -t|--taxdump") + + return nil +} + +func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { + + if len(_BelongTaxa) > 0 { + taxonomy := CLILoadSelectedTaxonomy() + p := obitax.IsSubCladeOf(*taxonomy, _BelongTaxa[0]) + + for _, taxid := range _BelongTaxa[1:] { + p.Or(obitax.IsSubCladeOf(*taxonomy, taxid)) + } + + return p + } + + return nil +} + +func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { + + if len(_NotBelongTaxa) > 0 { + taxonomy := CLILoadSelectedTaxonomy() + p := obitax.IsSubCladeOf(*taxonomy, _NotBelongTaxa[0]) + + for _, taxid := range _NotBelongTaxa[1:] { + p.Or(obitax.IsSubCladeOf(*taxonomy, taxid)) + } + + return p.Not() + } + + return nil +} + +func CLIHasRankDefinedPredicate() obiseq.SequencePredicate { + + if len(_RequiredRanks) > 0 { + taxonomy := CLILoadSelectedTaxonomy() + p := obitax.HasRankDefined(*taxonomy, _RequiredRanks[0]) + + for _, rank := range _RequiredRanks[1:] { + p.And(obitax.HasRankDefined(*taxonomy, rank)) + } + + return p + } + + return nil +} + +func CLITaxonomyFilterPredicate() obiseq.SequencePredicate { + return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate()) +} + +func CLIPredicatesPredicate() obiseq.SequencePredicate { + + if len(_Predicats) > 0 { + p := obiseq.ExpressionPredicat(_Predicats[0]) + + for _, expression := range _Predicats[1:] { + p.And(obiseq.ExpressionPredicat(expression)) + } + + return p + } + + return nil +} + +func CLISequencePatternPredicate() obiseq.SequencePredicate { + + if len(_SequencePatterns) > 0 { + p := obiseq.IsSequenceMatch(_SequencePatterns[0]) + + for _, pattern := range _SequencePatterns[1:] { + p.And(obiseq.IsSequenceMatch(pattern)) + } + + return p + } + + return nil +} + +func CLIDefinitionPatternPredicate() obiseq.SequencePredicate { + + if len(_DefinitionPatterns) > 0 { + p := obiseq.IsDefinitionMatch(_DefinitionPatterns[0]) + + for _, pattern := range _DefinitionPatterns[1:] { + p.And(obiseq.IsDefinitionMatch(pattern)) + } + + return p + } + + return nil +} + +func CLIIdPatternPredicate() obiseq.SequencePredicate { + + if len(_IdPatterns) > 0 { + p := obiseq.IsIdMatch(_IdPatterns[0]) + + for _, pattern := range _IdPatterns[1:] { + p.And(obiseq.IsIdMatch(pattern)) + } + + return p + } + + return nil +} + +func CLIIdListPredicate() obiseq.SequencePredicate { + + if _IdList != "" { + ids, err := goutils.ReadLines(_IdList) + + if err != nil { + log.Fatalf("cannot read the id file %s : %v", _IdList, err) + } + + for i, v := range ids { + ids[i] = strings.TrimSpace(v) + } + + p := obiseq.IsIdIn(ids...) + + return p + } + return nil +} + +func CLISequenceSelectionPredicate() obiseq.SequencePredicate { + p := CLISequenceSizePredicate() + p = p.And(CLISequenceCountPredicate()) + p = p.And(CLITaxonomyFilterPredicate()) + p = p.And(CLIPredicatesPredicate()) + p = p.And(CLISequencePatternPredicate()) + p = p.And(CLIDefinitionPatternPredicate()) + p = p.And(CLIIdPatternPredicate()) + p = p.And(CLIIdListPredicate()) + + if _InvertMatch { + p = p.Not() + } + + return p +} + +func CLISaveDiscardedSequences() bool { + return _SaveRejected != "" +} + +func CLIDiscardedFileName() string { + return _SaveRejected +}