mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-10 01:30:27 +00:00
First version of obidemerge, obijoin and a new filter for obicleandb, still to be finished
Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
This commit is contained in:
132
pkg/obitools/obijoin/join.go
Normal file
132
pkg/obitools/obijoin/join.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package obijoin
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type IndexedSequenceSlice struct {
|
||||
Sequences obiseq.BioSequenceSlice
|
||||
Indices []map[interface{}]*obiutils.Set[int]
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Len() int {
|
||||
return len(s.Sequences)
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
|
||||
var keeps obiutils.Set[int]
|
||||
|
||||
for i, v := range s.Indices {
|
||||
if i == 0 {
|
||||
keeps = *v[keys[0]]
|
||||
} else {
|
||||
keeps = keeps.Intersection(*v[keys[i]])
|
||||
}
|
||||
}
|
||||
|
||||
rep := obiseq.MakeBioSequenceSlice(len(keeps))
|
||||
for i, v := range keeps.Members() {
|
||||
rep[i] = s.Sequences[v]
|
||||
}
|
||||
|
||||
return &rep
|
||||
}
|
||||
|
||||
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
|
||||
indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
|
||||
|
||||
for i, k := range keys {
|
||||
idx := make(map[interface{}]*obiutils.Set[int])
|
||||
|
||||
for j, seq := range seqs {
|
||||
|
||||
if value, ok := seq.GetAttribute(k); ok {
|
||||
goods, ok := idx[value]
|
||||
if !ok {
|
||||
goods = obiutils.NewSet[int]()
|
||||
idx[value] = goods
|
||||
}
|
||||
|
||||
goods.Add(j)
|
||||
}
|
||||
}
|
||||
|
||||
indices[i] = idx
|
||||
}
|
||||
|
||||
return IndexedSequenceSlice{seqs, indices}
|
||||
}
|
||||
|
||||
func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
var ok bool
|
||||
|
||||
keys := make([]interface{}, len(by))
|
||||
|
||||
for i, v := range by {
|
||||
keys[i], ok = sequence.GetAttribute(v)
|
||||
if !ok {
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
}
|
||||
|
||||
join_with := index.Get(keys...)
|
||||
|
||||
rep := obiseq.MakeBioSequenceSlice(join_with.Len())
|
||||
|
||||
if join_with.Len() == 0 {
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
for i, v := range *join_with {
|
||||
rep[i] = sequence.Copy()
|
||||
annot := rep[i].Annotations()
|
||||
new_annot := v.Annotations()
|
||||
|
||||
for k, v := range new_annot {
|
||||
annot[k] = v
|
||||
}
|
||||
|
||||
if updateId {
|
||||
rep[i].SetId(v.Id())
|
||||
}
|
||||
if updateSequence && len(v.Sequence()) > 0 {
|
||||
rep[i].SetSequence(v.Sequence())
|
||||
}
|
||||
if updateQuality && len(v.Qualities()) > 0 {
|
||||
rep[i].SetQualities(v.Qualities())
|
||||
}
|
||||
}
|
||||
|
||||
return rep, nil
|
||||
}
|
||||
|
||||
return obiseq.SeqWorker(f)
|
||||
}
|
||||
|
||||
func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
data_iter, err := obiformats.ReadSequencesFromFile(CLIJoinWith())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
|
||||
}
|
||||
|
||||
data := data_iter.Load()
|
||||
|
||||
keys := CLIBy()
|
||||
|
||||
index := BuildIndexedSequenceSlice(data, keys.Right)
|
||||
|
||||
worker := MakeJoinWorker(keys.Left, index, CLIUpdateId(), CLIUpdateSequence(), CLIUpdateQuality())
|
||||
|
||||
iterator = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return iterator
|
||||
}
|
||||
90
pkg/obitools/obijoin/options.go
Normal file
90
pkg/obitools/obijoin/options.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package obijoin
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Storage for the obijoin command-line options registered by JoinOptionSet.
var (
	_by             = []string{} // values of the --by option
	_join           = ""         // value of the --join-with option
	_UpdateID       = false      // --update-id flag
	_UpdateSequence = false      // --update-sequence flag
	_UpdateQuality  = false      // --update-quality flag
)
|
||||
|
||||
// By holds the parsed join keys as two parallel slices: Left[i] is the
// attribute name used on the processed sequences and Right[i] the attribute
// name used on the sequences of the joined file.
type By struct {
	Left, Right []string
}
|
||||
|
||||
// JoinOptionSet registers on options the command-line options specific to
// obijoin:
//   - --by / -b: join key declaration, stored in _by;
//   - --join-with / -j: name of the file to join with (required), stored in _join;
//   - --update-id / -i, --update-sequence / -s, --update-quality / -q:
//     boolean flags stored in _UpdateID, _UpdateSequence and _UpdateQuality.
func JoinOptionSet(options *getoptions.GetOpt) {

	// Each occurrence of --by takes exactly one argument (min 1, max 1).
	options.StringSliceVar(&_by, "by", 1, 1,
		options.Alias("b"),
		options.Description("to declare join keys."))

	options.StringVar(&_join, "join-with", _join,
		options.Alias("j"),
		options.Description("file name of the file to join with."),
		options.Required("You must provide a file name to join with."))

	options.BoolVar(&_UpdateID, "update-id", _UpdateID,
		options.Alias("i"),
		options.Description("Update the sequence IDs in the joined file."))

	options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
		options.Alias("s"),
		options.Description("Update the sequence in the joined file."))

	options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
		options.Alias("q"),
		options.Description("Update the quality in the joined file."))

}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
// the obijoin command: the obiconvert options first, then the join-specific
// ones registered by JoinOptionSet.
func OptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(options)
	JoinOptionSet(options)
}
|
||||
|
||||
func CLIBy() By {
|
||||
if len(_by) == 0 {
|
||||
return By{
|
||||
Left: []string{"id"},
|
||||
Right: []string{"id"},
|
||||
}
|
||||
}
|
||||
|
||||
left := make([]string, len(_by))
|
||||
right := make([]string, len(_by))
|
||||
|
||||
for i, v := range _by {
|
||||
vals := strings.Split(v, "=")
|
||||
left[i] = vals[0]
|
||||
right[i] = vals[0]
|
||||
if len(vals) > 1 {
|
||||
right[i] = vals[1]
|
||||
}
|
||||
}
|
||||
|
||||
return By{Left: left, Right: right}
|
||||
}
|
||||
|
||||
func CLIJoinWith() string {
|
||||
return _join
|
||||
}
|
||||
|
||||
func CLIUpdateId() bool {
|
||||
return _UpdateID
|
||||
}
|
||||
|
||||
func CLIUpdateSequence() bool {
|
||||
return _UpdateSequence
|
||||
}
|
||||
|
||||
func CLIUpdateQuality() bool {
|
||||
return _UpdateQuality
|
||||
}
|
||||
Reference in New Issue
Block a user