First version of obidemerge, obijoin and a new filter for obicleandb, but still to be finished

Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
This commit is contained in:
Eric Coissac
2024-07-10 15:21:42 +02:00
parent bd855c4965
commit c7ed47e110
24 changed files with 2712 additions and 19 deletions

View File

@@ -0,0 +1,132 @@
package obijoin
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// IndexedSequenceSlice couples a slice of sequences with one lookup table
// per join key.
type IndexedSequenceSlice struct {
	// Sequences holds the indexed sequences.
	Sequences obiseq.BioSequenceSlice
	// Indices holds one map per key, in the order the keys were given to
	// BuildIndexedSequenceSlice. Each map associates an attribute value
	// with the set of positions in Sequences carrying that value.
	Indices []map[any]*obiutils.Set[int]
}
// Len returns the number of sequences stored in s.Sequences.
func (s IndexedSequenceSlice) Len() int {
return len(s.Sequences)
}
// Get returns the indexed sequences matching every supplied key value.
//
// keys[i] is looked up in s.Indices[i]; the result contains the sequences
// whose positions belong to all of the per-key sets found that way.
//
// An empty slice is returned when fewer keys than indices are supplied or
// when one of the key values is absent from its index. Without that check
// the lookup yields a nil *Set and dereferencing it panics, which happened
// for every query value not present in the indexed data.
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
	if len(keys) < len(s.Indices) {
		none := obiseq.MakeBioSequenceSlice(0)
		return &none
	}

	var keeps obiutils.Set[int]

	for i, idx := range s.Indices {
		matches, found := idx[keys[i]]
		if !found || matches == nil {
			// No indexed sequence carries this value: the overall
			// intersection is necessarily empty.
			none := obiseq.MakeBioSequenceSlice(0)
			return &none
		}

		if i == 0 {
			keeps = *matches
		} else {
			keeps = keeps.Intersection(*matches)
		}
	}

	rep := obiseq.MakeBioSequenceSlice(len(keeps))
	for i, pos := range keeps.Members() {
		rep[i] = s.Sequences[pos]
	}

	return &rep
}
// BuildIndexedSequenceSlice indexes seqs on each attribute named in keys.
//
// For every key, a map is built from each attribute value observed to the
// set of positions (in seqs) of the sequences carrying that value.
// Sequences for which GetAttribute reports the key as missing are left out
// of that key's map. The returned Indices follow the order of keys.
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
	indices := make([]map[interface{}]*obiutils.Set[int], 0, len(keys))

	for _, key := range keys {
		byValue := make(map[interface{}]*obiutils.Set[int])

		for pos, seq := range seqs {
			value, found := seq.GetAttribute(key)
			if !found {
				continue
			}

			positions, known := byValue[value]
			if !known {
				positions = obiutils.NewSet[int]()
				byValue[value] = positions
			}
			positions.Add(pos)
		}

		indices = append(indices, byValue)
	}

	return IndexedSequenceSlice{Sequences: seqs, Indices: indices}
}
// MakeJoinWorker builds a SeqWorker joining each processed sequence with the
// matching sequences of index.
//
// by lists the attribute names read on the processed sequence; their values
// are passed, in order, to index.Get. The worker returns:
//   - the sequence itself, unchanged, when one of the by attributes is
//     missing or when index.Get yields no match;
//   - otherwise one copy of the sequence per match, onto which the
//     annotations of the match are copied (overwriting existing keys).
//
// updateId, updateSequence and updateQuality additionally replace the id,
// the sequence and the qualities of the copy by those of the match; the
// sequence and qualities are only replaced when the match has a non-empty
// one.
func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
	f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		keys := make([]interface{}, len(by))
		for i, name := range by {
			value, ok := sequence.GetAttribute(name)
			if !ok {
				return obiseq.BioSequenceSlice{sequence}, nil
			}
			keys[i] = value
		}

		joinWith := index.Get(keys...)
		if joinWith.Len() == 0 {
			return obiseq.BioSequenceSlice{sequence}, nil
		}

		// Allocate the result only once we know there is something to join,
		// so unmatched sequences do not cost a useless slice.
		rep := obiseq.MakeBioSequenceSlice(joinWith.Len())

		for i, partner := range *joinWith {
			joined := sequence.Copy()

			annot := joined.Annotations()
			for key, value := range partner.Annotations() {
				annot[key] = value
			}

			if updateId {
				joined.SetId(partner.Id())
			}

			if updateSequence && len(partner.Sequence()) > 0 {
				joined.SetSequence(partner.Sequence())
			}

			if updateQuality && len(partner.Qualities()) > 0 {
				joined.SetQualities(partner.Qualities())
			}

			rep[i] = joined
		}

		return rep, nil
	}

	return obiseq.SeqWorker(f)
}
// CLIJoinSequences joins the sequences produced by iterator with the
// sequences read from the file returned by CLIJoinWith.
//
// The join file is read and loaded, indexed on the right-hand keys of
// CLIBy, and a join worker using the left-hand keys and the CLI update
// flags is attached to iterator. The program terminates through
// log.Fatalf when the join file cannot be read.
func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	joinFile := CLIJoinWith()

	partners, err := obiformats.ReadSequencesFromFile(joinFile)
	if err != nil {
		log.Fatalf("Cannot read the data file to merge with: %s %v", joinFile, err)
	}

	loaded := partners.Load()
	by := CLIBy()

	worker := MakeJoinWorker(
		by.Left,
		BuildIndexedSequenceSlice(loaded, by.Right),
		CLIUpdateId(),
		CLIUpdateSequence(),
		CLIUpdateQuality(),
	)

	return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
}

View File

@@ -0,0 +1,90 @@
package obijoin
import (
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage for the values of the join command-line options registered by
// JoinOptionSet.
var (
	// _by collects the values given to the "by" option.
	_by = []string{}

	// _join is the value of the "join-with" option.
	_join = ""

	// _UpdateID mirrors the "update-id" flag.
	_UpdateID = false

	// _UpdateSequence mirrors the "update-sequence" flag.
	_UpdateSequence = false

	// _UpdateQuality mirrors the "update-quality" flag.
	_UpdateQuality = false
)
// By holds the attribute names used as join keys.
//
// Left lists the names read on the sequences being processed and Right the
// names indexed on the sequences of the join file (see CLIJoinSequences).
// Entries at the same position form one key pair.
type By struct {
	Left, Right []string
}
// JoinOptionSet registers on options the command-line options specific to
// the join operation and binds them to the package-level option variables:
//
//   - --by / -b: join key declaration, stored in _by and decoded by CLIBy
//   - --join-with / -j: name of the file to join with (required)
//   - --update-id / -i, --update-sequence / -s, --update-quality / -q:
//     boolean flags, false unless given
func JoinOptionSet(options *getoptions.GetOpt) {
// Each occurrence of the option takes exactly one value (min 1, max 1).
options.StringSliceVar(&_by, "by", 1, 1,
options.Alias("b"),
options.Description("to declare join keys."))
options.StringVar(&_join, "join-with", _join,
options.Alias("j"),
options.Description("file name of the file to join with."),
options.Required("You must provide a file name to join with."))
options.BoolVar(&_UpdateID, "update-id", _UpdateID,
options.Alias("i"),
options.Description("Update the sequence IDs in the joined file."))
options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
options.Alias("s"),
options.Description("Update the sequence in the joined file."))
options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
options.Alias("q"),
options.Description("Update the quality in the joined file."))
}
// OptionSet adds to the basic option set every option declared for
// the obijoin command: the obiconvert options are registered first,
// followed by the join specific options of JoinOptionSet.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
JoinOptionSet(options)
}
// CLIBy decodes the values of the "by" option into join key pairs.
//
// When no value was given, both sides default to the single key "id".
// Otherwise each value is split on "=": the first field is the left key
// and the second field, when present, is the right key; a value without
// "=" uses the same name on both sides. Fields after the second one are
// ignored.
func CLIBy() By {
	if len(_by) == 0 {
		return By{Left: []string{"id"}, Right: []string{"id"}}
	}

	keys := By{
		Left:  make([]string, 0, len(_by)),
		Right: make([]string, 0, len(_by)),
	}

	for _, spec := range _by {
		fields := strings.Split(spec, "=")

		leftKey, rightKey := fields[0], fields[0]
		if len(fields) > 1 {
			rightKey = fields[1]
		}

		keys.Left = append(keys.Left, leftKey)
		keys.Right = append(keys.Right, rightKey)
	}

	return keys
}
// CLIJoinWith returns the file name given to the "join-with" option.
func CLIJoinWith() string {
return _join
}
// CLIUpdateId reports whether the "update-id" flag was set.
func CLIUpdateId() bool {
return _UpdateID
}
// CLIUpdateSequence reports whether the "update-sequence" flag was set.
func CLIUpdateSequence() bool {
return _UpdateSequence
}
// CLIUpdateQuality reports whether the "update-quality" flag was set.
func CLIUpdateQuality() bool {
return _UpdateQuality
}