mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-10 17:50:26 +00:00
first version of obidemerge, obijoin and a new filter for obicleandb, but still to be finished
Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
package obicleandb
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
@@ -18,6 +20,114 @@ func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
func MakeSequenceFamilyGenusWorker(references obiseq.BioSequenceSlice) obiseq.SeqWorker {
|
||||
|
||||
genus := make(map[int]*obiseq.BioSequenceSlice)
|
||||
family := make(map[int]*obiseq.BioSequenceSlice)
|
||||
|
||||
for _, ref := range references {
|
||||
g, ok := ref.GetIntAttribute("genus_taxid")
|
||||
f, ok := ref.GetIntAttribute("family_taxid")
|
||||
|
||||
gs, ok := genus[g]
|
||||
if !ok {
|
||||
gs = obiseq.NewBioSequenceSlice(0)
|
||||
genus[g] = gs
|
||||
}
|
||||
|
||||
*gs = append(*gs, ref)
|
||||
|
||||
fs, ok := family[f]
|
||||
if !ok {
|
||||
fs = obiseq.NewBioSequenceSlice(0)
|
||||
family[f] = fs
|
||||
}
|
||||
|
||||
*fs = append(*fs, ref)
|
||||
}
|
||||
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
g, _ := sequence.GetIntAttribute("genus_taxid")
|
||||
sequence.SetAttribute("obicleandb_level", "genus")
|
||||
|
||||
gs := genus[g]
|
||||
|
||||
indist := make([]float64, 0, gs.Len())
|
||||
for _, s := range *gs {
|
||||
if s != sequence {
|
||||
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
|
||||
indist = append(indist, float64(lali-lca))
|
||||
}
|
||||
}
|
||||
nindist := len(indist)
|
||||
|
||||
pval := 0.0
|
||||
|
||||
f, _ := sequence.GetIntAttribute("family_taxid")
|
||||
fs := family[f]
|
||||
|
||||
if nindist < 5 {
|
||||
sequence.SetAttribute("obicleandb_level", "family")
|
||||
|
||||
for _, s := range *fs {
|
||||
gf, _ := s.GetIntAttribute("genus_taxid")
|
||||
if g != gf {
|
||||
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
|
||||
indist = append(indist, float64(lali-lca))
|
||||
}
|
||||
}
|
||||
|
||||
nindist = len(indist)
|
||||
}
|
||||
|
||||
if nindist > 0 {
|
||||
|
||||
next := nindist
|
||||
if next <= 20 {
|
||||
next = 20
|
||||
}
|
||||
|
||||
outdist := make([]float64, 0, nindist)
|
||||
p := rand.Perm(references.Len())
|
||||
i := 0
|
||||
for _, ir := range p {
|
||||
s := references[ir]
|
||||
ff, _ := s.GetIntAttribute("family_taxid")
|
||||
|
||||
if ff != f {
|
||||
lca, lali := obialign.FastLCSScore(sequence, s, -1, nil)
|
||||
outdist = append(outdist, float64(lali-lca))
|
||||
i += 1
|
||||
if i >= next {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res, err := obistats.MannWhitneyUTest(outdist, indist, obistats.LocationGreater)
|
||||
|
||||
if err == nil {
|
||||
pval = res.P
|
||||
}
|
||||
|
||||
level, _ := sequence.GetAttribute("obicleandb_level")
|
||||
log.Warnf("%s - level: %v", sequence.Id(), level)
|
||||
log.Warnf("%s - gdist: %v", sequence.Id(), indist)
|
||||
log.Warnf("%s - fdist: %v", sequence.Id(), outdist)
|
||||
log.Warnf("%s - pval: %f", sequence.Id(), pval)
|
||||
} else {
|
||||
sequence.SetAttribute("obicleandb_level", "none")
|
||||
}
|
||||
|
||||
sequence.SetAttribute("obicleandb_trusted", pval)
|
||||
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
return f
|
||||
|
||||
}
|
||||
|
||||
func diagCoord(x, y, n int) int {
|
||||
if x > y {
|
||||
x, y = y, x
|
||||
@@ -160,19 +270,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
obioptions.CLIParallelWorkers(),
|
||||
)
|
||||
|
||||
genera_iterator, err := obichunk.ISequenceChunk(
|
||||
annotated,
|
||||
obiseq.AnnotationClassifier("genus_taxid", "NA"),
|
||||
)
|
||||
references := annotated.Load()
|
||||
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
mannwithney := MakeSequenceFamilyGenusWorker(references)
|
||||
|
||||
trusted := genera_iterator.MakeISliceWorker(
|
||||
SequenceTrustSlice,
|
||||
false,
|
||||
)
|
||||
partof := obiiter.IBatchOver(references,
|
||||
obioptions.CLIBatchSize()).Speed("Testing belonging to genus")
|
||||
|
||||
return trusted
|
||||
// genera_iterator, err := obichunk.ISequenceChunk(
|
||||
// annotated,
|
||||
// obiseq.AnnotationClassifier("genus_taxid", "NA"),
|
||||
// )
|
||||
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
|
||||
// trusted := genera_iterator.MakeISliceWorker(
|
||||
// SequenceTrustSlice,
|
||||
// false,
|
||||
// )
|
||||
|
||||
return partof.MakeIWorker(mannwithney, true)
|
||||
}
|
||||
|
||||
44
pkg/obitools/obidemerge/demerge.go
Normal file
44
pkg/obitools/obidemerge/demerge.go
Normal file
@@ -0,0 +1,44 @@
|
||||
package obidemerge
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func MakeDemergeWorker(key string) obiseq.SeqWorker {
|
||||
desc := obiseq.MakeStatsOnDescription(key)
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
if sequence.HasStatsOn(key) {
|
||||
stats := sequence.StatsOn(desc, "NA")
|
||||
sequence.DeleteAttribute(obiseq.StatsOnSlotName(key))
|
||||
slice := obiseq.NewBioSequenceSlice(len(stats))
|
||||
i := 0
|
||||
|
||||
for k, v := range stats {
|
||||
(*slice)[i] = sequence.Copy()
|
||||
(*slice)[i].SetAttribute(key, k)
|
||||
(*slice)[i].SetCount(v)
|
||||
i++
|
||||
}
|
||||
|
||||
return *slice, nil
|
||||
}
|
||||
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
return obiseq.SeqWorker(f)
|
||||
}
|
||||
|
||||
func CLIDemergeSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
if CLIHasSlotToDemerge() {
|
||||
|
||||
worker := MakeDemergeWorker(CLIDemergeSlot())
|
||||
return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
|
||||
}
|
||||
|
||||
return iterator
|
||||
}
|
||||
30
pkg/obitools/obidemerge/options.go
Normal file
30
pkg/obitools/obidemerge/options.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package obidemerge
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// _Demerge holds the name of the slot to demerge, as set by the --demerge
// option; the empty string means that no slot was requested.
var _Demerge string
|
||||
|
||||
func DemergeOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringVar(&_Demerge, "demerge", _Demerge,
|
||||
options.Alias("d"),
|
||||
options.Description("Indicates which slot has to be demerged."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obipcr command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
DemergeOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIDemergeSlot() string {
|
||||
return _Demerge
|
||||
}
|
||||
|
||||
func CLIHasSlotToDemerge() bool {
|
||||
return _Demerge != ""
|
||||
}
|
||||
132
pkg/obitools/obijoin/join.go
Normal file
132
pkg/obitools/obijoin/join.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package obijoin
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type IndexedSequenceSlice struct {
|
||||
Sequences obiseq.BioSequenceSlice
|
||||
Indices []map[interface{}]*obiutils.Set[int]
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Len() int {
|
||||
return len(s.Sequences)
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
|
||||
var keeps obiutils.Set[int]
|
||||
|
||||
for i, v := range s.Indices {
|
||||
if i == 0 {
|
||||
keeps = *v[keys[0]]
|
||||
} else {
|
||||
keeps = keeps.Intersection(*v[keys[i]])
|
||||
}
|
||||
}
|
||||
|
||||
rep := obiseq.MakeBioSequenceSlice(len(keeps))
|
||||
for i, v := range keeps.Members() {
|
||||
rep[i] = s.Sequences[v]
|
||||
}
|
||||
|
||||
return &rep
|
||||
}
|
||||
|
||||
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
|
||||
indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
|
||||
|
||||
for i, k := range keys {
|
||||
idx := make(map[interface{}]*obiutils.Set[int])
|
||||
|
||||
for j, seq := range seqs {
|
||||
|
||||
if value, ok := seq.GetAttribute(k); ok {
|
||||
goods, ok := idx[value]
|
||||
if !ok {
|
||||
goods = obiutils.NewSet[int]()
|
||||
idx[value] = goods
|
||||
}
|
||||
|
||||
goods.Add(j)
|
||||
}
|
||||
}
|
||||
|
||||
indices[i] = idx
|
||||
}
|
||||
|
||||
return IndexedSequenceSlice{seqs, indices}
|
||||
}
|
||||
|
||||
func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
var ok bool
|
||||
|
||||
keys := make([]interface{}, len(by))
|
||||
|
||||
for i, v := range by {
|
||||
keys[i], ok = sequence.GetAttribute(v)
|
||||
if !ok {
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
}
|
||||
|
||||
join_with := index.Get(keys...)
|
||||
|
||||
rep := obiseq.MakeBioSequenceSlice(join_with.Len())
|
||||
|
||||
if join_with.Len() == 0 {
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
for i, v := range *join_with {
|
||||
rep[i] = sequence.Copy()
|
||||
annot := rep[i].Annotations()
|
||||
new_annot := v.Annotations()
|
||||
|
||||
for k, v := range new_annot {
|
||||
annot[k] = v
|
||||
}
|
||||
|
||||
if updateId {
|
||||
rep[i].SetId(v.Id())
|
||||
}
|
||||
if updateSequence && len(v.Sequence()) > 0 {
|
||||
rep[i].SetSequence(v.Sequence())
|
||||
}
|
||||
if updateQuality && len(v.Qualities()) > 0 {
|
||||
rep[i].SetQualities(v.Qualities())
|
||||
}
|
||||
}
|
||||
|
||||
return rep, nil
|
||||
}
|
||||
|
||||
return obiseq.SeqWorker(f)
|
||||
}
|
||||
|
||||
func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
data_iter, err := obiformats.ReadSequencesFromFile(CLIJoinWith())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
|
||||
}
|
||||
|
||||
data := data_iter.Load()
|
||||
|
||||
keys := CLIBy()
|
||||
|
||||
index := BuildIndexedSequenceSlice(data, keys.Right)
|
||||
|
||||
worker := MakeJoinWorker(keys.Left, index, CLIUpdateId(), CLIUpdateSequence(), CLIUpdateQuality())
|
||||
|
||||
iterator = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return iterator
|
||||
}
|
||||
90
pkg/obitools/obijoin/options.go
Normal file
90
pkg/obitools/obijoin/options.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package obijoin
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Command-line state of the join options, filled in through JoinOptionSet.
var (
	// _by holds the raw values of the "by" option (name or left=right pairs).
	_by = []string{}
	// _join holds the name of the file to join with.
	_join = ""
	// _UpdateID is set by the "update-id" flag.
	_UpdateID = false
	// _UpdateSequence is set by the "update-sequence" flag.
	_UpdateSequence = false
	// _UpdateQuality is set by the "update-quality" flag.
	_UpdateQuality = false
)
|
||||
|
||||
// By describes the pairs of attribute names a join is made on: Left[i] is
// paired with Right[i].
type By struct {
	// Left lists the attribute names of the left-hand side of each pair.
	Left []string
	// Right lists the attribute names of the right-hand side of each pair.
	Right []string
}
|
||||
|
||||
func JoinOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.StringSliceVar(&_by, "by", 1, 1,
|
||||
options.Alias("b"),
|
||||
options.Description("to declare join keys."))
|
||||
|
||||
options.StringVar(&_join, "join-with", _join,
|
||||
options.Alias("j"),
|
||||
options.Description("file name of the file to join with."),
|
||||
options.Required("You must provide a file name to join with."))
|
||||
|
||||
options.BoolVar(&_UpdateID, "update-id", _UpdateID,
|
||||
options.Alias("i"),
|
||||
options.Description("Update the sequence IDs in the joined file."))
|
||||
|
||||
options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
|
||||
options.Alias("s"),
|
||||
options.Description("Update the sequence in the joined file."))
|
||||
|
||||
options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
|
||||
options.Alias("q"),
|
||||
options.Description("Update the quality in the joined file."))
|
||||
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obipcr command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
JoinOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIBy() By {
|
||||
if len(_by) == 0 {
|
||||
return By{
|
||||
Left: []string{"id"},
|
||||
Right: []string{"id"},
|
||||
}
|
||||
}
|
||||
|
||||
left := make([]string, len(_by))
|
||||
right := make([]string, len(_by))
|
||||
|
||||
for i, v := range _by {
|
||||
vals := strings.Split(v, "=")
|
||||
left[i] = vals[0]
|
||||
right[i] = vals[0]
|
||||
if len(vals) > 1 {
|
||||
right[i] = vals[1]
|
||||
}
|
||||
}
|
||||
|
||||
return By{Left: left, Right: right}
|
||||
}
|
||||
|
||||
func CLIJoinWith() string {
|
||||
return _join
|
||||
}
|
||||
|
||||
func CLIUpdateId() bool {
|
||||
return _UpdateID
|
||||
}
|
||||
|
||||
func CLIUpdateSequence() bool {
|
||||
return _UpdateSequence
|
||||
}
|
||||
|
||||
func CLIUpdateQuality() bool {
|
||||
return _UpdateQuality
|
||||
}
|
||||
Reference in New Issue
Block a user