First version of obidemerge and obijoin, plus a new filter for obicleandb that is still to be finished

Former-commit-id: 8a1ed26e5548c30db75644c294d478ec4d753f19
This commit is contained in:
Eric Coissac
2024-07-10 15:21:42 +02:00
parent bd855c4965
commit c7ed47e110
24 changed files with 2712 additions and 19 deletions

View File

@@ -1,6 +1,8 @@
package obicleandb
import (
"math/rand"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
@@ -18,6 +20,114 @@ func SequenceTrust(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error
return obiseq.BioSequenceSlice{sequence}, nil
}
// MakeSequenceFamilyGenusWorker builds a SeqWorker that tests whether a
// sequence is closer to the references of its own genus (or family) than to
// references taken from other families.
//
// The references are first indexed by their "genus_taxid" and "family_taxid"
// integer attributes. For every processed sequence the worker then:
//
//   - collects the distances (alignment length minus LCS score, both returned
//     by obialign.FastLCSScore) to the other references of the same genus;
//   - when fewer than 5 such distances exist, adds the distances to the
//     references of the same family that have a different genus_taxid;
//   - collects the distances to randomly picked references of other families,
//     up to max(20, number of in-group distances);
//   - compares both samples with obistats.MannWhitneyUTest using the
//     obistats.LocationGreater alternative.
//
// The p-value is stored in the "obicleandb_trusted" attribute and the level
// used ("genus", "family" or "none") in "obicleandb_level". The p-value stays
// at 0 when no in-group distance is available or when the test returns an
// error.
//
// A sequence whose genus or family has no entry in the reference index is
// treated as having no in-group at that level; it no longer triggers a nil
// pointer dereference.
func MakeSequenceFamilyGenusWorker(references obiseq.BioSequenceSlice) obiseq.SeqWorker {
	genus := make(map[int]*obiseq.BioSequenceSlice)
	family := make(map[int]*obiseq.BioSequenceSlice)

	// register appends ref to the group stored under taxid, creating the
	// group the first time the taxid is met.
	register := func(index map[int]*obiseq.BioSequenceSlice, taxid int, ref *obiseq.BioSequence) {
		group, ok := index[taxid]
		if !ok {
			group = obiseq.NewBioSequenceSlice(0)
			index[taxid] = group
		}
		*group = append(*group, ref)
	}

	// distance converts the LCS alignment result into a difference count.
	distance := func(a, b *obiseq.BioSequence) float64 {
		lcs, alignmentLength := obialign.FastLCSScore(a, b, -1, nil)
		return float64(alignmentLength - lcs)
	}

	for _, ref := range references {
		// The boolean results are deliberately ignored: a reference for which
		// the attribute cannot be read is indexed under whatever value
		// GetIntAttribute returns in that case.
		// NOTE(review): presumably 0 — confirm, and decide whether such
		// references should be indexed at all.
		g, _ := ref.GetIntAttribute("genus_taxid")
		f, _ := ref.GetIntAttribute("family_taxid")

		register(genus, g, ref)
		register(family, f, ref)
	}

	worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		g, _ := sequence.GetIntAttribute("genus_taxid")
		sequence.SetAttribute("obicleandb_level", "genus")

		// In-group distances at the genus level. The genus may be unknown to
		// the index when the sequence is not one of the references.
		var indist []float64
		if gs := genus[g]; gs != nil {
			indist = make([]float64, 0, len(*gs))
			for _, s := range *gs {
				// Pointer comparison: skip the sequence itself.
				if s != sequence {
					indist = append(indist, distance(sequence, s))
				}
			}
		}

		pval := 0.0
		f, _ := sequence.GetIntAttribute("family_taxid")

		// Too few congeneric references: enlarge the in-group with the
		// references of the same family belonging to another genus.
		if len(indist) < 5 {
			sequence.SetAttribute("obicleandb_level", "family")
			if fs := family[f]; fs != nil {
				for _, s := range *fs {
					gf, _ := s.GetIntAttribute("genus_taxid")
					if g != gf {
						indist = append(indist, distance(sequence, s))
					}
				}
			}
		}
		nindist := len(indist)

		if nindist > 0 {
			// Size of the out-group sample: at least 20 distances.
			next := nindist
			if next < 20 {
				next = 20
			}

			outdist := make([]float64, 0, next)

			// Visit the references in random order and keep the first `next`
			// ones belonging to another family.
			for _, ir := range rand.Perm(references.Len()) {
				s := references[ir]
				ff, _ := s.GetIntAttribute("family_taxid")
				if ff != f {
					outdist = append(outdist, distance(sequence, s))
					if len(outdist) >= next {
						break
					}
				}
			}

			res, err := obistats.MannWhitneyUTest(outdist, indist, obistats.LocationGreater)
			if err == nil {
				pval = res.P
			}

			level, _ := sequence.GetAttribute("obicleandb_level")

			log.Warnf("%s - level: %v", sequence.Id(), level)
			log.Warnf("%s - gdist: %v", sequence.Id(), indist)
			log.Warnf("%s - fdist: %v", sequence.Id(), outdist)
			log.Warnf("%s - pval: %f", sequence.Id(), pval)
		} else {
			sequence.SetAttribute("obicleandb_level", "none")
		}

		sequence.SetAttribute("obicleandb_trusted", pval)

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return worker
}
func diagCoord(x, y, n int) int {
if x > y {
x, y = y, x
@@ -160,19 +270,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
obioptions.CLIParallelWorkers(),
)
genera_iterator, err := obichunk.ISequenceChunk(
annotated,
obiseq.AnnotationClassifier("genus_taxid", "NA"),
)
references := annotated.Load()
if err != nil {
log.Fatal(err)
}
mannwithney := MakeSequenceFamilyGenusWorker(references)
trusted := genera_iterator.MakeISliceWorker(
SequenceTrustSlice,
false,
)
partof := obiiter.IBatchOver(references,
obioptions.CLIBatchSize()).Speed("Testing belonging to genus")
return trusted
// genera_iterator, err := obichunk.ISequenceChunk(
// annotated,
// obiseq.AnnotationClassifier("genus_taxid", "NA"),
// )
// if err != nil {
// log.Fatal(err)
// }
// trusted := genera_iterator.MakeISliceWorker(
// SequenceTrustSlice,
// false,
// )
return partof.MakeIWorker(mannwithney, true)
}

View File

@@ -0,0 +1,44 @@
package obidemerge
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
// MakeDemergeWorker returns a SeqWorker that splits a sequence according to
// the statistics recorded on the attribute key.
//
// When the sequence carries statistics on key, the statistics slot is deleted
// from the sequence and one copy of the sequence is emitted per entry of the
// statistics: the copy receives the entry's key as its key attribute and the
// entry's value as its count. A sequence without statistics on key is
// returned untouched as a single-element slice.
func MakeDemergeWorker(key string) obiseq.SeqWorker {
	desc := obiseq.MakeStatsOnDescription(key)

	demerge := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		if !sequence.HasStatsOn(key) {
			return obiseq.BioSequenceSlice{sequence}, nil
		}

		stats := sequence.StatsOn(desc, "NA")
		// Remove the slot before copying so that the copies do not carry it.
		sequence.DeleteAttribute(obiseq.StatsOnSlotName(key))

		demerged := obiseq.NewBioSequenceSlice(len(stats))
		next := 0
		for category, count := range stats {
			duplicate := sequence.Copy()
			duplicate.SetAttribute(key, category)
			duplicate.SetCount(count)
			(*demerged)[next] = duplicate
			next++
		}

		return *demerged, nil
	}

	return obiseq.SeqWorker(demerge)
}
// CLIDemergeSequences applies the demerge worker to iterator when a slot to
// demerge was given on the command line; otherwise the iterator is returned
// as is.
func CLIDemergeSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	if !CLIHasSlotToDemerge() {
		return iterator
	}

	return iterator.MakeIWorker(
		MakeDemergeWorker(CLIDemergeSlot()),
		false,
		obioptions.CLIParallelWorkers(),
		0,
	)
}

View File

@@ -0,0 +1,30 @@
package obidemerge
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// _Demerge holds the name of the slot to demerge; empty means none was given.
var _Demerge string
// DemergeOptionSet registers on options the string option "demerge"
// (alias "d") naming the slot to demerge. The parsed value is stored in the
// package variable _Demerge, whose current value is used as the default.
func DemergeOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Demerge, "demerge", _Demerge,
options.Alias("d"),
options.Description("Indicates which slot has to be demerged."))
}
// OptionSet adds to the basic option set every option declared for
// the obidemerge command: the obiconvert options are registered first,
// followed by the demerge-specific ones.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
DemergeOptionSet(options)
}
// CLIDemergeSlot returns the name of the slot to demerge as stored in
// _Demerge; the empty string means that no slot was specified.
func CLIDemergeSlot() string {
return _Demerge
}
// CLIHasSlotToDemerge reports whether a non-empty slot name to demerge is
// currently stored in _Demerge.
func CLIHasSlotToDemerge() bool {
	return len(_Demerge) > 0
}

View File

@@ -0,0 +1,132 @@
package obijoin
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// IndexedSequenceSlice couples a slice of sequences with one lookup index
// per join key.
type IndexedSequenceSlice struct {
// Sequences holds the indexed sequences.
Sequences obiseq.BioSequenceSlice
// Indices holds one map per key; each map associates an attribute value
// with the set of positions in Sequences of the sequences carrying it.
Indices []map[interface{}]*obiutils.Set[int]
}
// Len returns the number of sequences stored in s.Sequences.
func (s IndexedSequenceSlice) Len() int {
return len(s.Sequences)
}
// Get returns the sequences matching every given key value.
//
// keys[i] is looked up in s.Indices[i]; the result contains the sequences
// whose positions belong to the intersection of all the matched position
// sets. When one of the key values has no entry in its index, no sequence
// can match and an empty slice is returned (previously this case
// dereferenced a nil pointer).
//
// keys must provide one value per index: supplying fewer values than
// len(s.Indices) panics with an index out of range.
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
	var keeps obiutils.Set[int]

	for i, index := range s.Indices {
		matches, ok := index[keys[i]]
		if !ok || matches == nil {
			// Unknown value for this key: the intersection is empty.
			return &obiseq.BioSequenceSlice{}
		}

		if i == 0 {
			keeps = *matches
		} else {
			keeps = keeps.Intersection(*matches)
		}
	}

	rep := obiseq.MakeBioSequenceSlice(len(keeps))
	for i, v := range keeps.Members() {
		rep[i] = s.Sequences[v]
	}

	return &rep
}
// BuildIndexedSequenceSlice indexes seqs on each attribute named in keys.
//
// For every key, a map is built that associates each attribute value met with
// the set of positions in seqs of the sequences carrying that value.
// Sequences lacking the attribute are left out of the corresponding map. The
// maps are stored in the same order as keys.
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
	indices := make([]map[interface{}]*obiutils.Set[int], 0, len(keys))

	for _, key := range keys {
		byValue := make(map[interface{}]*obiutils.Set[int])

		for position, seq := range seqs {
			value, found := seq.GetAttribute(key)
			if !found {
				continue
			}

			positions, known := byValue[value]
			if !known {
				positions = obiutils.NewSet[int]()
				byValue[value] = positions
			}
			positions.Add(position)
		}

		indices = append(indices, byValue)
	}

	return IndexedSequenceSlice{Sequences: seqs, Indices: indices}
}
// MakeJoinWorker returns a SeqWorker joining each processed sequence with
// the matching entries of index.
//
// The values of the attributes listed in by are read from the sequence and
// used, in order, as lookup keys in index. The sequence is returned unchanged
// as a single-element slice when one of these attributes is missing or when
// the lookup yields no entry. Otherwise one copy of the sequence is produced
// per matching entry, and the entry's annotations are written over the
// copy's annotations. The flags updateId, updateSequence and updateQuality
// additionally replace the copy's identifier, sequence and qualities by the
// entry's ones; sequence and qualities are only replaced when the entry's
// value is not empty.
func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSequence, updateQuality bool) obiseq.SeqWorker {
	join := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		keys := make([]interface{}, 0, len(by))
		for _, attribute := range by {
			value, found := sequence.GetAttribute(attribute)
			if !found {
				return obiseq.BioSequenceSlice{sequence}, nil
			}
			keys = append(keys, value)
		}

		matches := index.Get(keys...)
		joined := obiseq.MakeBioSequenceSlice(matches.Len())

		if matches.Len() == 0 {
			return obiseq.BioSequenceSlice{sequence}, nil
		}

		for position, partner := range *matches {
			merged := sequence.Copy()

			annotations := merged.Annotations()
			for name, value := range partner.Annotations() {
				annotations[name] = value
			}

			if updateId {
				merged.SetId(partner.Id())
			}
			if updateSequence && len(partner.Sequence()) > 0 {
				merged.SetSequence(partner.Sequence())
			}
			if updateQuality && len(partner.Qualities()) > 0 {
				merged.SetQualities(partner.Qualities())
			}

			joined[position] = merged
		}

		return joined, nil
	}

	return obiseq.SeqWorker(join)
}
// CLIJoinSequences joins the sequences of iterator with the records of the
// file given by CLIJoinWith, using the command line settings.
//
// The join file is loaded and indexed on the right-hand keys of CLIBy; the
// left-hand keys are read on the sequences of iterator. The program is
// terminated through log.Fatalf when the join file cannot be read.
func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	partners, err := obiformats.ReadSequencesFromFile(CLIJoinWith())
	if err != nil {
		log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
	}

	records := partners.Load()
	joinKeys := CLIBy()

	return iterator.MakeIWorker(
		MakeJoinWorker(
			joinKeys.Left,
			BuildIndexedSequenceSlice(records, joinKeys.Right),
			CLIUpdateId(),
			CLIUpdateSequence(),
			CLIUpdateQuality(),
		),
		false,
		obioptions.CLIParallelWorkers(),
	)
}

View File

@@ -0,0 +1,90 @@
package obijoin
import (
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage for the values of the join command line options.
var (
	// _by collects the raw values given to the "by" option.
	_by = []string{}
	// _join is the name of the file to join with.
	_join = ""
	// _UpdateID is set by the "update-id" option.
	_UpdateID = false
	// _UpdateSequence is set by the "update-sequence" option.
	_UpdateSequence = false
	// _UpdateQuality is set by the "update-quality" option.
	_UpdateQuality = false
)
// By describes the join keys as two parallel lists of attribute names:
// Left[i] is paired with Right[i].
type By struct {
	Left, Right []string
}
// JoinOptionSet registers on options the command line options specific to
// the join operation; each parsed value is stored in the corresponding
// package variable.
func JoinOptionSet(options *getoptions.GetOpt) {
// "by" (alias "b"): join key declarations, accumulated in _by.
// NOTE(review): the two 1 arguments are presumably the minimum and maximum
// number of values accepted per occurrence — confirm against go-getoptions.
options.StringSliceVar(&_by, "by", 1, 1,
options.Alias("b"),
options.Description("to declare join keys."))
// "join-with" (alias "j"): mandatory name of the file to join with.
options.StringVar(&_join, "join-with", _join,
options.Alias("j"),
options.Description("file name of the file to join with."),
options.Required("You must provide a file name to join with."))
// "update-id" (alias "i"): boolean flag stored in _UpdateID.
options.BoolVar(&_UpdateID, "update-id", _UpdateID,
options.Alias("i"),
options.Description("Update the sequence IDs in the joined file."))
// "update-sequence" (alias "s"): boolean flag stored in _UpdateSequence.
options.BoolVar(&_UpdateSequence, "update-sequence", _UpdateSequence,
options.Alias("s"),
options.Description("Update the sequence in the joined file."))
// "update-quality" (alias "q"): boolean flag stored in _UpdateQuality.
options.BoolVar(&_UpdateQuality, "update-quality", _UpdateQuality,
options.Alias("q"),
options.Description("Update the quality in the joined file."))
}
// OptionSet adds to the basic option set every option declared for
// the obijoin command: the obiconvert options are registered first,
// followed by the join-specific ones.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
JoinOptionSet(options)
}
// CLIBy converts the raw values collected in _by into a By description.
//
// Without any value, both sides default to the single key "id". Otherwise
// each value is split on "=": the first part is the left key and the second
// part, when present, is the right key; a value without "=" uses the same
// name on both sides. Parts following a second "=" are ignored.
func CLIBy() By {
	if len(_by) == 0 {
		return By{Left: []string{"id"}, Right: []string{"id"}}
	}

	keys := By{
		Left:  make([]string, 0, len(_by)),
		Right: make([]string, 0, len(_by)),
	}

	for _, declaration := range _by {
		parts := strings.Split(declaration, "=")

		leftKey, rightKey := parts[0], parts[0]
		if len(parts) > 1 {
			rightKey = parts[1]
		}

		keys.Left = append(keys.Left, leftKey)
		keys.Right = append(keys.Right, rightKey)
	}

	return keys
}
// CLIJoinWith returns the name of the file to join with, as stored in _join.
func CLIJoinWith() string {
return _join
}
// CLIUpdateId reports the value of the "update-id" flag stored in _UpdateID.
func CLIUpdateId() bool {
return _UpdateID
}
// CLIUpdateSequence reports the value of the "update-sequence" flag stored
// in _UpdateSequence.
func CLIUpdateSequence() bool {
return _UpdateSequence
}
// CLIUpdateQuality reports the value of the "update-quality" flag stored
// in _UpdateQuality.
func CLIUpdateQuality() bool {
return _UpdateQuality
}