Small adjustment

This commit is contained in:
2022-10-27 11:18:44 +02:00
parent 8aa323dad5
commit f3ddac0f50
10 changed files with 218 additions and 155 deletions

View File

@ -1,7 +1,9 @@
package main package main
import ( import (
"log"
"os" "os"
"runtime/pprof"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag"
@ -10,6 +12,23 @@ import (
) )
func main() { func main() {
// go tool pprof -http=":8000" ./build/obitag ./cpu.pprof
f, err := os.Create("cpu.pprof")
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
// go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace")
// if err != nil {
// log.Fatal(err)
// }
// trace.Start(ftrace)
// defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obitag.OptionSet) optionParser := obioptions.GenerateOptionParser(obitag.OptionSet)
_, args, _ := optionParser(os.Args) _, args, _ := optionParser(os.Args)

View File

@ -18,7 +18,7 @@ import (
// return value and val is set to 0. // return value and val is set to 0.
func InterfaceToString(i interface{}) (val string, err error) { func InterfaceToString(i interface{}) (val string, err error) {
err = nil err = nil
val = fmt.Sprintf("%V", i) val = fmt.Sprintf("%v", i)
return return
} }

View File

@ -14,6 +14,14 @@ func MaxInt(x, y int) int {
return x return x
} }
func MinMaxInt(x, y int) (int,int) {
if x < y {
return x,y
}
return y,x
}
func MinUInt16(x, y uint16) uint16 { func MinUInt16(x, y uint16) uint16 {
if x < y { if x < y {
return x return x

View File

@ -18,9 +18,10 @@ func IntOrder(data []int) []int {
return nil return nil
} }
r := make([]int, len(data))
rk := intRanker{ rk := intRanker{
x: data, x: data,
r: make([]int, len(data)), r: r,
} }
for i := 0; i < len(data); i++ { for i := 0; i < len(data); i++ {
@ -29,5 +30,25 @@ func IntOrder(data []int) []int {
sort.Sort(rk) sort.Sort(rk)
return rk.r return r
}
func ReverseIntOrder(data []int) []int {
if len(data) == 0 {
return nil
}
r := make([]int, len(data))
rk := intRanker{
x: data,
r: r,
}
for i := 0; i < len(data); i++ {
rk.r[i] = i
}
sort.Sort(sort.Reverse(rk))
return r
} }

View File

@ -50,9 +50,9 @@ func (iterator IBioSequenceBatch) Speed(message ...string) IBioSequenceBatch {
return newIter return newIter
} }
func SpeedPipe() Pipeable { func SpeedPipe(message ...string) Pipeable {
f := func(iterator IBioSequenceBatch) IBioSequenceBatch { f := func(iterator IBioSequenceBatch) IBioSequenceBatch {
return iterator.Speed() return iterator.Speed(message...)
} }
return f return f

View File

@ -299,11 +299,11 @@ func (s *BioSequence) Taxid() int {
return taxid return taxid
} }
func (s *BioSequence) EcotagRefIndex() map[int]string { func (s *BioSequence) OBITagRefIndex() map[int]string {
var val map[int]string var val map[int]string
i, ok := s.GetAttribute("ecotag_ref_index") i, ok := s.GetAttribute("obitag_ref_index")
if !ok { if !ok {
return nil return nil
@ -336,7 +336,7 @@ func (s *BioSequence) EcotagRefIndex() map[int]string {
} }
default: default:
log.Panicln("value of attribute ecotag_ref_index cannot be casted to a map[int]string") log.Panicln("value of attribute obitag_ref_index cannot be casted to a map[int]string")
} }
return val return val

View File

@ -0,0 +1,27 @@
package obicleandb
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
"github.com/DavidGamba/go-getoptions"
)
var _UpdateTaxids = false
func ObicleanDBOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_UpdateTaxids, "update-taxids", _UpdateTaxids,
options.Description("Indicates if decrecated Taxids must be corrected."))
}
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
obigrep.TaxonomySelectionOptionSet(options)
ObicleanDBOptionSet(options)
}
func CLIUpdateTaxids() bool {
return _UpdateTaxids
}

View File

@ -18,7 +18,6 @@ import (
func IndexSequence(seqidx int, func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice, references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) map[int]string { taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx] sequence := references[seqidx]
@ -28,23 +27,7 @@ func IndexSequence(seqidx int,
score := make([]int, len(references)) score := make([]int, len(references))
for i, ref := range references { for i, ref := range references {
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
// mine := 0
// if refcounts != nil {
// mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
// }
lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix) lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix)
// if lcs < 0 {
// log.Print("Max error wrongly estimated", mine, maxe)
// log.Println(string(sequence.Sequence()))
// log.Fatalln(string(ref.Sequence()))
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
// lcs, alilength = obialign.LCSScore(sequence, ref, matrix)
// }
score[i] = alilength - lcs score[i] = alilength - lcs
} }
@ -58,7 +41,7 @@ func IndexSequence(seqidx int,
log.Panicln(err) log.Panicln(err)
} }
ecotag_index := make(map[int]string) obitag_index := make(map[int]string)
for _, idx := range o { for _, idx := range o {
new_taxid, err := taxo.Taxon(references[idx].Taxid()) new_taxid, err := taxo.Taxon(references[idx].Taxid())
@ -76,7 +59,7 @@ func IndexSequence(seqidx int,
if current_taxid.Taxid() != new_taxid.Taxid() { if current_taxid.Taxid() != new_taxid.Taxid() {
if new_score > current_score { if new_score > current_score {
ecotag_index[score[current_idx]] = fmt.Sprintf( obitag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s", "%d@%s@%s",
current_taxid.Taxid(), current_taxid.Taxid(),
current_taxid.ScientificName(), current_taxid.ScientificName(),
@ -89,15 +72,15 @@ func IndexSequence(seqidx int,
} }
} }
ecotag_index[score[current_idx]] = fmt.Sprintf( obitag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s", "%d@%s@%s",
current_taxid.Taxid(), current_taxid.Taxid(),
current_taxid.ScientificName(), current_taxid.ScientificName(),
current_taxid.Rank()) current_taxid.Rank())
sequence.SetAttribute("ecotag_ref_index", ecotag_index) sequence.SetAttribute("obitag_ref_index", obitag_index)
return ecotag_index return obitag_index
} }
func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
@ -142,7 +125,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBa
for l := range limits { for l := range limits {
sl := obiseq.MakeBioSequenceSlice() sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ { for i := l[0]; i < l[1]; i++ {
IndexSequence(i, references, refcounts, taxo) IndexSequence(i, references, taxo)
sl = append(sl, references[i]) sl = append(sl, references[i])
} }
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))

View File

@ -1,8 +1,8 @@
package obitag package obitag
import ( import (
"fmt"
"log" "log"
"math"
"strconv" "strconv"
"strings" "strings"
@ -14,118 +14,93 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
) )
func IndexSequence(seqidx int, func FindClosests(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice, references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) map[int]string { runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
sequence := references[seqidx]
matrix := obialign.NewLCSMatrix(nil, matrix := obialign.NewLCSMatrix(nil,
sequence.Length(), sequence.Length(),
sequence.Length(), sequence.Length(),
sequence.Length()) sequence.Length())
score := make([]int, len(references)) seqwords := obikmer.Count4Mer(sequence, nil, nil)
for i, ref := range references { cw := make([]int, len(refcounts))
maxe := goutils.MaxInt(sequence.Length(), ref.Length())
mine := 0
if refcounts != nil {
mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
}
lcs, alilength := obialign.LCSScore(sequence, ref, (maxe+1)*2, matrix)
if lcs < 0 { for i, ref := range refcounts {
log.Print("Max error wrongly estimated", mine, maxe) cw[i] = obikmer.Common4Mer(seqwords, ref)
log.Println(string(sequence.Sequence())) // if i < 50 {
log.Fatalln(string(ref.Sequence())) // print(cw[i])
// print(";")
// }
}
// print("\n")
maxe := goutils.MaxInt(sequence.Length(), ref.Length()) o := goutils.ReverseIntOrder(cw)
lcs, alilength = obialign.LCSScore(sequence, ref, maxe, matrix)
mcw := 100000
for _, i := range o {
if cw[i] < mcw {
mcw = cw[i]
}
if cw[i] > mcw {
log.Panicln("wrong order")
} }
score[i] = alilength - lcs
} }
o := goutils.IntOrder(score) bests := obiseq.MakeBioSequenceSlice()
bests = append(bests, references[o[0]])
bestidxs := make([]int, 0)
bestidxs = append(bestidxs, o[0])
bestId := 0.0
bestmatch := references[o[0]].Id()
current_taxid, err := taxo.Taxon(references[o[0]].Taxid()) maxe := 0
current_score := score[o[0]] n := 0
current_idx := o[0] nf := 0
if err != nil { for i, j := range o {
log.Panicln(err) ref := references[j]
}
ecotag_index := make(map[int]string) lmin, lmax := goutils.MinMaxInt(sequence.Length(), ref.Length())
atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2
for _, idx := range o { if i == 0 {
new_taxid, err := taxo.Taxon(references[idx].Taxid()) maxe = goutils.MaxInt(sequence.Length(), ref.Length())
if err != nil {
log.Panicln(err)
} }
new_taxid, err = current_taxid.LCA(new_taxid) // log.Println(sequence.Id(),cw[j], maxe)
if err != nil { if runExact || (atMost <= (maxe + 1)) {
log.Panicln(err) lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
} n++
if lcs == -1 {
new_score := score[idx] nf++
// That aligment is worst than maxe, go to the next sequence
if current_taxid.Taxid() != new_taxid.Taxid() { continue
if new_score > current_score {
ecotag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
current_score = new_score
} }
current_taxid = new_taxid score := alilength - lcs
current_idx = idx if score < maxe {
} bests = bests[:0]
} bestidxs = bestidxs[:0]
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(best.Id(), maxe, bestId)
}
ecotag_index[score[current_idx]] = fmt.Sprintf( if score == maxe {
"%d@%s@%s", bests = append(bests, ref)
current_taxid.Taxid(), bestidxs = append(bestidxs, j)
current_taxid.ScientificName(), id := float64(lcs) / float64(alilength)
current_taxid.Rank()) if id > bestId {
bestId = id
bestmatch = ref.Id()
}
}
sequence.SetAttribute("ecotag_ref_index", ecotag_index)
return ecotag_index
}
func FindClosest(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice) (*obiseq.BioSequence, int, float64, int) {
matrix := obialign.NewLCSMatrix(nil,
sequence.Length(),
sequence.Length(),
sequence.Length())
maxe := goutils.MaxInt(sequence.Length(), references[0].Length())
best := references[0]
bestidx := 0
bestId := 0.0
for i, ref := range references {
lcs, alilength := obialign.LCSScore(sequence, ref, maxe, matrix)
if lcs == -1 {
// That aligment is worst than maxe, go to the next sequence
continue
}
score := alilength - lcs
if score < maxe {
best = references[i]
bestidx = i
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(best.Id(), maxe, bestId)
} }
if maxe == 0 { if maxe == 0 {
@ -133,49 +108,71 @@ func FindClosest(sequence *obiseq.BioSequence,
break break
} }
} }
return best, maxe, bestId, bestidx // log.Println("that's all falks", n, nf, maxe, bestId, bestidx)
return bests, maxe, bestId, bestmatch, bestidxs
} }
func Identify(sequence *obiseq.BioSequence, func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice, references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) *obiseq.BioSequence { taxo *obitax.Taxonomy,
best, differences, identity, seqidx := FindClosest(sequence, references) runExact bool) *obiseq.BioSequence {
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
taxon := (*obitax.TaxNode)(nil)
for i, best := range bests {
idx := best.OBITagRefIndex()
if idx == nil {
// log.Fatalln("Need of indexing")
idx = obirefidx.IndexSequence(seqidxs[i], references, taxo)
}
d := differences
identification, ok := idx[d]
for !ok && d >= 0 {
identification, ok = idx[d]
d--
}
parts := strings.Split(identification, "@")
match_taxid, err := strconv.Atoi(parts[0])
if err != nil {
log.Panicln("Cannot extract taxid from :", identification)
}
match_taxon, err := taxo.Taxon(match_taxid)
if err != nil {
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
}
if taxon != nil {
taxon, _ = taxon.LCA(match_taxon)
} else {
taxon = match_taxon
}
idx := best.EcotagRefIndex()
if idx == nil {
idx = IndexSequence(seqidx, references, refcounts, taxo)
} }
d := differences sequence.SetTaxid(taxon.Taxid())
identification, ok := idx[d] sequence.SetAttribute("scientific_name", taxon.ScientificName())
for !ok && d >= 0 { sequence.SetAttribute("obitag_rank", taxon.Rank())
identification, ok = idx[d] sequence.SetAttribute("obitag_bestid", identity)
d-- sequence.SetAttribute("obitag_difference", differences)
} sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests))
parts := strings.Split(identification, "@")
taxid, err := strconv.Atoi(parts[0])
if err != nil {
log.Panicln("Cannot extract taxid from :", identification)
}
sequence.SetTaxid(taxid)
sequence.SetAttribute("scientific_name", parts[1])
sequence.SetAttribute("ecotag_rank", parts[2])
sequence.SetAttribute("ecotag_id", identity)
sequence.SetAttribute("ecotag_difference", differences)
sequence.SetAttribute("ecotag_match", best.Id())
return sequence return sequence
} }
func IdentifySeqWorker(references obiseq.BioSequenceSlice, func IdentifySeqWorker(references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) obiiter.SeqWorker { taxo *obitax.Taxonomy,
runExact bool) obiiter.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxo) return Identify(sequence, references, refcounts, taxo, runExact)
} }
} }
@ -198,7 +195,7 @@ func AssignTaxonomy(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatc
log.Panicln(error) log.Panicln(error)
} }
worker := IdentifySeqWorker(references, refcounts, taxo) worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact())
return iterator.Rebatch(10).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Speed("Annotated sequences").Rebatch(1000)
} }

View File

@ -11,14 +11,18 @@ import (
) )
var _RefDB = "" var _RefDB = ""
var _RunExact = false
func TagOptionSet(options *getoptions.GetOpt) { func TagOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_RefDB, "reference-db",_RefDB, options.StringVar(&_RefDB, "reference-db", _RefDB,
options.Alias("R"), options.Alias("R"),
options.Required(), options.Required(),
options.ArgName("FILENAME"), options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB")) options.Description("The name of the file containing the reference DB"))
options.BoolVar(&_RunExact, "exact", _RunExact,
options.Alias("E"),
options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
} }
@ -26,7 +30,7 @@ func TagOptionSet(options *getoptions.GetOpt) {
// the obiuniq command // the obiuniq command
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options) obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options,true,false) obifind.LoadTaxonomyOptionSet(options, true, false)
TagOptionSet(options) TagOptionSet(options)
} }
@ -35,11 +39,15 @@ func CLIRefDBName() string {
} }
func CLIRefDB() obiseq.BioSequenceSlice { func CLIRefDB() obiseq.BioSequenceSlice {
refdb,err := obiformats.ReadSequencesBatchFromFile(_RefDB) refdb, err := obiformats.ReadSequencesBatchFromFile(_RefDB)
if err != nil { if err != nil {
log.Panicf("Cannot open the reference library file : %s\n",_RefDB) log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
} }
return refdb.Load() return refdb.Load()
} }
func CLIRunExact() bool {
return _RunExact
}