From f3ddac0f5019a157e6daf5fb1f374d95ce77b54d Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 27 Oct 2022 11:18:44 +0200 Subject: [PATCH] Small adjustment --- cmd/obitools/obitag/main.go | 19 +++ pkg/goutils/goutils.go | 2 +- pkg/goutils/minmax.go | 8 + pkg/goutils/ranks.go | 27 +++- pkg/obiiter/speed.go | 4 +- pkg/obiseq/biosequence.go | 6 +- pkg/obitools/obicleandb/options.go | 27 ++++ pkg/obitools/obirefidx/obirefidx.go | 29 +--- pkg/obitools/obitag/obitag.go | 233 ++++++++++++++-------------- pkg/obitools/obitag/options.go | 18 ++- 10 files changed, 218 insertions(+), 155 deletions(-) create mode 100644 pkg/obitools/obicleandb/options.go diff --git a/cmd/obitools/obitag/main.go b/cmd/obitools/obitag/main.go index 713cdda..b8af4a1 100644 --- a/cmd/obitools/obitag/main.go +++ b/cmd/obitools/obitag/main.go @@ -1,7 +1,9 @@ package main import ( + "log" "os" + "runtime/pprof" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag" @@ -10,6 +12,23 @@ import ( ) func main() { + + // go tool pprof -http=":8000" ./build/obitag ./cpu.pprof + f, err := os.Create("cpu.pprof") + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + optionParser := obioptions.GenerateOptionParser(obitag.OptionSet) _, args, _ := optionParser(os.Args) diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index e42c5db..d819f42 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -18,7 +18,7 @@ import ( // return value and val is set to 0. func InterfaceToString(i interface{}) (val string, err error) { err = nil - val = fmt.Sprintf("%V", i) + val = fmt.Sprintf("%v", i) return } diff --git a/pkg/goutils/minmax.go b/pkg/goutils/minmax.go index e99cf1f..a467b6f 100644 --- a/pkg/goutils/minmax.go +++ b/pkg/goutils/minmax.go @@ -14,6 +14,14 @@ func MaxInt(x, y int) int { return x } +func MinMaxInt(x, y int) (int,int) { + if x < y { + return x,y + } + return y,x +} + + func MinUInt16(x, y uint16) uint16 { if x < y { return x diff --git a/pkg/goutils/ranks.go b/pkg/goutils/ranks.go index af94aac..a68a109 100644 --- a/pkg/goutils/ranks.go +++ b/pkg/goutils/ranks.go @@ -18,9 +18,10 @@ func IntOrder(data []int) []int { return nil } + r := make([]int, len(data)) rk := intRanker{ x: data, - r: make([]int, len(data)), + r: r, } for i := 0; i < len(data); i++ { @@ -28,6 +29,26 @@ func IntOrder(data []int) []int { } sort.Sort(rk) - - return rk.r + + return r +} + +func ReverseIntOrder(data []int) []int { + if len(data) == 0 { + return nil + } + + r := make([]int, len(data)) + rk := intRanker{ + x: data, + r: r, + } + + for i := 0; i < len(data); i++ { + rk.r[i] = i + } + + sort.Sort(sort.Reverse(rk)) + + return r } diff --git a/pkg/obiiter/speed.go b/pkg/obiiter/speed.go index 0b49fd2..a00a428 100644 --- a/pkg/obiiter/speed.go +++ b/pkg/obiiter/speed.go @@ -50,9 +50,9 @@ func (iterator IBioSequenceBatch) Speed(message ...string) IBioSequenceBatch { return newIter } -func SpeedPipe() Pipeable { +func SpeedPipe(message ...string) Pipeable { f := func(iterator IBioSequenceBatch) IBioSequenceBatch { - return iterator.Speed() + return iterator.Speed(message...) } return f diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index b383bdd..4d66d78 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -299,11 +299,11 @@ func (s *BioSequence) Taxid() int { return taxid } -func (s *BioSequence) EcotagRefIndex() map[int]string { +func (s *BioSequence) OBITagRefIndex() map[int]string { var val map[int]string - i, ok := s.GetAttribute("ecotag_ref_index") + i, ok := s.GetAttribute("obitag_ref_index") if !ok { return nil @@ -336,7 +336,7 @@ func (s *BioSequence) EcotagRefIndex() map[int]string { } default: - log.Panicln("value of attribute ecotag_ref_index cannot be casted to a map[int]string") + log.Panicln("value of attribute obitag_ref_index cannot be casted to a map[int]string") } return val diff --git a/pkg/obitools/obicleandb/options.go b/pkg/obitools/obicleandb/options.go new file mode 100644 index 0000000..6b7d35f --- /dev/null +++ b/pkg/obitools/obicleandb/options.go @@ -0,0 +1,27 @@ +package obicleandb + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep" + "github.com/DavidGamba/go-getoptions" +) + + +var _UpdateTaxids = false + +func ObicleanDBOptionSet(options *getoptions.GetOpt) { + + options.BoolVar(&_UpdateTaxids, "update-taxids", _UpdateTaxids, + options.Description("Indicates if decrecated Taxids must be corrected.")) +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.InputOptionSet(options) + obiconvert.OutputOptionSet(options) + obigrep.TaxonomySelectionOptionSet(options) + ObicleanDBOptionSet(options) +} + +func CLIUpdateTaxids() bool { + return _UpdateTaxids +} \ No newline at end of file diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index 7ee8edc..bc555d3 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -18,7 +18,6 @@ import ( func IndexSequence(seqidx int, references obiseq.BioSequenceSlice, - refcounts []*obikmer.Table4mer, taxo *obitax.Taxonomy) map[int]string { sequence := references[seqidx] @@ -28,23 +27,7 @@ func IndexSequence(seqidx int, score := make([]int, len(references)) for i, ref := range references { - // maxe := goutils.MaxInt(sequence.Length(), ref.Length()) - // mine := 0 - // if refcounts != nil { - // mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i]) - // } - - lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix) - - // if lcs < 0 { - // log.Print("Max error wrongly estimated", mine, maxe) - // log.Println(string(sequence.Sequence())) - // log.Fatalln(string(ref.Sequence())) - - // maxe := goutils.MaxInt(sequence.Length(), ref.Length()) - // lcs, alilength = obialign.LCSScore(sequence, ref, matrix) - // } score[i] = alilength - lcs } @@ -58,7 +41,7 @@ func IndexSequence(seqidx int, log.Panicln(err) } - ecotag_index := make(map[int]string) + obitag_index := make(map[int]string) for _, idx := range o { new_taxid, err := taxo.Taxon(references[idx].Taxid()) @@ -76,7 +59,7 @@ func IndexSequence(seqidx int, if current_taxid.Taxid() != new_taxid.Taxid() { if new_score > current_score { - ecotag_index[score[current_idx]] = fmt.Sprintf( + obitag_index[score[current_idx]] = fmt.Sprintf( "%d@%s@%s", current_taxid.Taxid(), current_taxid.ScientificName(), @@ -89,15 +72,15 @@ func IndexSequence(seqidx int, } } - ecotag_index[score[current_idx]] = fmt.Sprintf( + obitag_index[score[current_idx]] = fmt.Sprintf( "%d@%s@%s", current_taxid.Taxid(), current_taxid.ScientificName(), current_taxid.Rank()) - sequence.SetAttribute("ecotag_ref_index", ecotag_index) + sequence.SetAttribute("obitag_ref_index", obitag_index) - return ecotag_index + return obitag_index } func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { @@ -142,7 +125,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBa for l := range limits { sl := obiseq.MakeBioSequenceSlice() for i := l[0]; i < l[1]; i++ { - IndexSequence(i, references, refcounts, taxo) + IndexSequence(i, references, taxo) sl = append(sl, references[i]) } indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index f63ed28..1a65ce3 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -1,8 +1,8 @@ package obitag import ( - "fmt" "log" + "math" "strconv" "strings" @@ -14,118 +14,93 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx" ) -func IndexSequence(seqidx int, +func FindClosests(sequence *obiseq.BioSequence, references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, - taxo *obitax.Taxonomy) map[int]string { + runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) { - sequence := references[seqidx] matrix := obialign.NewLCSMatrix(nil, sequence.Length(), sequence.Length(), sequence.Length()) - score := make([]int, len(references)) - for i, ref := range references { - maxe := goutils.MaxInt(sequence.Length(), ref.Length()) - mine := 0 - if refcounts != nil { - mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i]) - } - lcs, alilength := obialign.LCSScore(sequence, ref, (maxe+1)*2, matrix) + seqwords := obikmer.Count4Mer(sequence, nil, nil) + cw := make([]int, len(refcounts)) - if lcs < 0 { - log.Print("Max error wrongly estimated", mine, maxe) - log.Println(string(sequence.Sequence())) - log.Fatalln(string(ref.Sequence())) + for i, ref := range refcounts { + cw[i] = obikmer.Common4Mer(seqwords, ref) + // if i < 50 { + // print(cw[i]) + // print(";") + // } + } + // print("\n") - maxe := goutils.MaxInt(sequence.Length(), ref.Length()) - lcs, alilength = obialign.LCSScore(sequence, ref, maxe, matrix) + o := goutils.ReverseIntOrder(cw) + + mcw := 100000 + for _, i := range o { + if cw[i] < mcw { + mcw = cw[i] + } + if cw[i] > mcw { + log.Panicln("wrong order") } - score[i] = alilength - lcs } - o := goutils.IntOrder(score) + bests := obiseq.MakeBioSequenceSlice() + bests = append(bests, references[o[0]]) + bestidxs := make([]int, 0) + bestidxs = append(bestidxs, o[0]) + bestId := 0.0 + bestmatch := references[o[0]].Id() - current_taxid, err := taxo.Taxon(references[o[0]].Taxid()) - current_score := score[o[0]] - current_idx := o[0] + maxe := 0 + n := 0 + nf := 0 - if err != nil { - log.Panicln(err) - } + for i, j := range o { + ref := references[j] - ecotag_index := make(map[int]string) + lmin, lmax := goutils.MinMaxInt(sequence.Length(), ref.Length()) + atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2 - for _, idx := range o { - new_taxid, err := taxo.Taxon(references[idx].Taxid()) - if err != nil { - log.Panicln(err) + if i == 0 { + maxe = goutils.MaxInt(sequence.Length(), ref.Length()) } - new_taxid, err = current_taxid.LCA(new_taxid) - if err != nil { - log.Panicln(err) - } - - new_score := score[idx] - - if current_taxid.Taxid() != new_taxid.Taxid() { - - if new_score > current_score { - ecotag_index[score[current_idx]] = fmt.Sprintf( - "%d@%s@%s", - current_taxid.Taxid(), - current_taxid.ScientificName(), - current_taxid.Rank()) - current_score = new_score + // log.Println(sequence.Id(),cw[j], maxe) + if runExact || (atMost <= (maxe + 1)) { + lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) + n++ + if lcs == -1 { + nf++ + // That aligment is worst than maxe, go to the next sequence + continue } - current_taxid = new_taxid - current_idx = idx - } - } + score := alilength - lcs + if score < maxe { + bests = bests[:0] + bestidxs = bestidxs[:0] + maxe = score + bestId = float64(lcs) / float64(alilength) + // log.Println(best.Id(), maxe, bestId) + } - ecotag_index[score[current_idx]] = fmt.Sprintf( - "%d@%s@%s", - current_taxid.Taxid(), - current_taxid.ScientificName(), - current_taxid.Rank()) + if score == maxe { + bests = append(bests, ref) + bestidxs = append(bestidxs, j) + id := float64(lcs) / float64(alilength) + if id > bestId { + bestId = id + bestmatch = ref.Id() + } + } - sequence.SetAttribute("ecotag_ref_index", ecotag_index) - - return ecotag_index -} - -func FindClosest(sequence *obiseq.BioSequence, - references obiseq.BioSequenceSlice) (*obiseq.BioSequence, int, float64, int) { - - matrix := obialign.NewLCSMatrix(nil, - sequence.Length(), - sequence.Length(), - sequence.Length()) - - maxe := goutils.MaxInt(sequence.Length(), references[0].Length()) - best := references[0] - bestidx := 0 - bestId := 0.0 - - for i, ref := range references { - lcs, alilength := obialign.LCSScore(sequence, ref, maxe, matrix) - if lcs == -1 { - // That aligment is worst than maxe, go to the next sequence - continue - } - - score := alilength - lcs - if score < maxe { - best = references[i] - bestidx = i - maxe = score - bestId = float64(lcs) / float64(alilength) - // log.Println(best.Id(), maxe, bestId) } if maxe == 0 { @@ -133,49 +108,71 @@ func FindClosest(sequence *obiseq.BioSequence, break } } - return best, maxe, bestId, bestidx + // log.Println("that's all falks", n, nf, maxe, bestId, bestidx) + return bests, maxe, bestId, bestmatch, bestidxs } func Identify(sequence *obiseq.BioSequence, references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, - taxo *obitax.Taxonomy) *obiseq.BioSequence { - best, differences, identity, seqidx := FindClosest(sequence, references) + taxo *obitax.Taxonomy, + runExact bool) *obiseq.BioSequence { + bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) + + taxon := (*obitax.TaxNode)(nil) + + for i, best := range bests { + idx := best.OBITagRefIndex() + if idx == nil { + // log.Fatalln("Need of indexing") + idx = obirefidx.IndexSequence(seqidxs[i], references, taxo) + } + + d := differences + identification, ok := idx[d] + for !ok && d >= 0 { + identification, ok = idx[d] + d-- + } + + parts := strings.Split(identification, "@") + match_taxid, err := strconv.Atoi(parts[0]) + + if err != nil { + log.Panicln("Cannot extract taxid from :", identification) + } + + match_taxon, err := taxo.Taxon(match_taxid) + + if err != nil { + log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid) + } + + if taxon != nil { + taxon, _ = taxon.LCA(match_taxon) + } else { + taxon = match_taxon + } - idx := best.EcotagRefIndex() - if idx == nil { - idx = IndexSequence(seqidx, references, refcounts, taxo) } - d := differences - identification, ok := idx[d] - for !ok && d >= 0 { - identification, ok = idx[d] - d-- - } - - parts := strings.Split(identification, "@") - taxid, err := strconv.Atoi(parts[0]) - - if err != nil { - log.Panicln("Cannot extract taxid from :", identification) - } - - sequence.SetTaxid(taxid) - sequence.SetAttribute("scientific_name", parts[1]) - sequence.SetAttribute("ecotag_rank", parts[2]) - sequence.SetAttribute("ecotag_id", identity) - sequence.SetAttribute("ecotag_difference", differences) - sequence.SetAttribute("ecotag_match", best.Id()) + sequence.SetTaxid(taxon.Taxid()) + sequence.SetAttribute("scientific_name", taxon.ScientificName()) + sequence.SetAttribute("obitag_rank", taxon.Rank()) + sequence.SetAttribute("obitag_bestid", identity) + sequence.SetAttribute("obitag_difference", differences) + sequence.SetAttribute("obitag_bestmatch", bestmatch) + sequence.SetAttribute("obitag_match_count", len(bests)) return sequence } func IdentifySeqWorker(references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, - taxo *obitax.Taxonomy) obiiter.SeqWorker { + taxo *obitax.Taxonomy, + runExact bool) obiiter.SeqWorker { return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { - return Identify(sequence, references, refcounts, taxo) + return Identify(sequence, references, refcounts, taxo, runExact) } } @@ -198,7 +195,7 @@ func AssignTaxonomy(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatc log.Panicln(error) } - worker := IdentifySeqWorker(references, refcounts, taxo) + worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact()) - return iterator.Rebatch(10).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) + return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Speed("Annotated sequences").Rebatch(1000) } diff --git a/pkg/obitools/obitag/options.go b/pkg/obitools/obitag/options.go index 5f7e07d..68d9136 100644 --- a/pkg/obitools/obitag/options.go +++ b/pkg/obitools/obitag/options.go @@ -11,14 +11,18 @@ import ( ) var _RefDB = "" +var _RunExact = false func TagOptionSet(options *getoptions.GetOpt) { - options.StringVar(&_RefDB, "reference-db",_RefDB, + options.StringVar(&_RefDB, "reference-db", _RefDB, options.Alias("R"), options.Required(), options.ArgName("FILENAME"), options.Description("The name of the file containing the reference DB")) + options.BoolVar(&_RunExact, "exact", _RunExact, + options.Alias("E"), + options.Description("Unactivate the heuristic limatitating the sequence comparisons")) } @@ -26,7 +30,7 @@ func TagOptionSet(options *getoptions.GetOpt) { // the obiuniq command func OptionSet(options *getoptions.GetOpt) { obiconvert.OptionSet(options) - obifind.LoadTaxonomyOptionSet(options,true,false) + obifind.LoadTaxonomyOptionSet(options, true, false) TagOptionSet(options) } @@ -35,11 +39,15 @@ func CLIRefDBName() string { } func CLIRefDB() obiseq.BioSequenceSlice { - refdb,err := obiformats.ReadSequencesBatchFromFile(_RefDB) + refdb, err := obiformats.ReadSequencesBatchFromFile(_RefDB) if err != nil { - log.Panicf("Cannot open the reference library file : %s\n",_RefDB) + log.Panicf("Cannot open the reference library file : %s\n", _RefDB) } return refdb.Load() -} \ No newline at end of file +} + +func CLIRunExact() bool { + return _RunExact +}