mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Small adjustment
This commit is contained in:
@ -1,7 +1,9 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"log"
|
||||||
"os"
|
"os"
|
||||||
|
"runtime/pprof"
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag"
|
||||||
@ -10,6 +12,23 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
|
// go tool pprof -http=":8000" ./build/obitag ./cpu.pprof
|
||||||
|
f, err := os.Create("cpu.pprof")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
pprof.StartCPUProfile(f)
|
||||||
|
defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
|
// go tool trace cpu.trace
|
||||||
|
// ftrace, err := os.Create("cpu.trace")
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// }
|
||||||
|
// trace.Start(ftrace)
|
||||||
|
// defer trace.Stop()
|
||||||
|
|
||||||
optionParser := obioptions.GenerateOptionParser(obitag.OptionSet)
|
optionParser := obioptions.GenerateOptionParser(obitag.OptionSet)
|
||||||
|
|
||||||
_, args, _ := optionParser(os.Args)
|
_, args, _ := optionParser(os.Args)
|
||||||
|
@ -18,7 +18,7 @@ import (
|
|||||||
// return value and val is set to 0.
|
// return value and val is set to 0.
|
||||||
func InterfaceToString(i interface{}) (val string, err error) {
|
func InterfaceToString(i interface{}) (val string, err error) {
|
||||||
err = nil
|
err = nil
|
||||||
val = fmt.Sprintf("%V", i)
|
val = fmt.Sprintf("%v", i)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,6 +14,14 @@ func MaxInt(x, y int) int {
|
|||||||
return x
|
return x
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func MinMaxInt(x, y int) (int,int) {
|
||||||
|
if x < y {
|
||||||
|
return x,y
|
||||||
|
}
|
||||||
|
return y,x
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func MinUInt16(x, y uint16) uint16 {
|
func MinUInt16(x, y uint16) uint16 {
|
||||||
if x < y {
|
if x < y {
|
||||||
return x
|
return x
|
||||||
|
@ -18,9 +18,10 @@ func IntOrder(data []int) []int {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
r := make([]int, len(data))
|
||||||
rk := intRanker{
|
rk := intRanker{
|
||||||
x: data,
|
x: data,
|
||||||
r: make([]int, len(data)),
|
r: r,
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := 0; i < len(data); i++ {
|
for i := 0; i < len(data); i++ {
|
||||||
@ -29,5 +30,25 @@ func IntOrder(data []int) []int {
|
|||||||
|
|
||||||
sort.Sort(rk)
|
sort.Sort(rk)
|
||||||
|
|
||||||
return rk.r
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func ReverseIntOrder(data []int) []int {
|
||||||
|
if len(data) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
r := make([]int, len(data))
|
||||||
|
rk := intRanker{
|
||||||
|
x: data,
|
||||||
|
r: r,
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := 0; i < len(data); i++ {
|
||||||
|
rk.r[i] = i
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Sort(sort.Reverse(rk))
|
||||||
|
|
||||||
|
return r
|
||||||
}
|
}
|
||||||
|
@ -50,9 +50,9 @@ func (iterator IBioSequenceBatch) Speed(message ...string) IBioSequenceBatch {
|
|||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
func SpeedPipe() Pipeable {
|
func SpeedPipe(message ...string) Pipeable {
|
||||||
f := func(iterator IBioSequenceBatch) IBioSequenceBatch {
|
f := func(iterator IBioSequenceBatch) IBioSequenceBatch {
|
||||||
return iterator.Speed()
|
return iterator.Speed(message...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return f
|
return f
|
||||||
|
@ -299,11 +299,11 @@ func (s *BioSequence) Taxid() int {
|
|||||||
return taxid
|
return taxid
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *BioSequence) EcotagRefIndex() map[int]string {
|
func (s *BioSequence) OBITagRefIndex() map[int]string {
|
||||||
|
|
||||||
var val map[int]string
|
var val map[int]string
|
||||||
|
|
||||||
i, ok := s.GetAttribute("ecotag_ref_index")
|
i, ok := s.GetAttribute("obitag_ref_index")
|
||||||
|
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
@ -336,7 +336,7 @@ func (s *BioSequence) EcotagRefIndex() map[int]string {
|
|||||||
|
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
log.Panicln("value of attribute ecotag_ref_index cannot be casted to a map[int]string")
|
log.Panicln("value of attribute obitag_ref_index cannot be casted to a map[int]string")
|
||||||
}
|
}
|
||||||
|
|
||||||
return val
|
return val
|
||||||
|
27
pkg/obitools/obicleandb/options.go
Normal file
27
pkg/obitools/obicleandb/options.go
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
package obicleandb
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obigrep"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
var _UpdateTaxids = false
|
||||||
|
|
||||||
|
func ObicleanDBOptionSet(options *getoptions.GetOpt) {
|
||||||
|
|
||||||
|
options.BoolVar(&_UpdateTaxids, "update-taxids", _UpdateTaxids,
|
||||||
|
options.Description("Indicates if decrecated Taxids must be corrected."))
|
||||||
|
}
|
||||||
|
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.InputOptionSet(options)
|
||||||
|
obiconvert.OutputOptionSet(options)
|
||||||
|
obigrep.TaxonomySelectionOptionSet(options)
|
||||||
|
ObicleanDBOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIUpdateTaxids() bool {
|
||||||
|
return _UpdateTaxids
|
||||||
|
}
|
@ -18,7 +18,6 @@ import (
|
|||||||
|
|
||||||
func IndexSequence(seqidx int,
|
func IndexSequence(seqidx int,
|
||||||
references obiseq.BioSequenceSlice,
|
references obiseq.BioSequenceSlice,
|
||||||
refcounts []*obikmer.Table4mer,
|
|
||||||
taxo *obitax.Taxonomy) map[int]string {
|
taxo *obitax.Taxonomy) map[int]string {
|
||||||
|
|
||||||
sequence := references[seqidx]
|
sequence := references[seqidx]
|
||||||
@ -28,23 +27,7 @@ func IndexSequence(seqidx int,
|
|||||||
|
|
||||||
score := make([]int, len(references))
|
score := make([]int, len(references))
|
||||||
for i, ref := range references {
|
for i, ref := range references {
|
||||||
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
|
|
||||||
// mine := 0
|
|
||||||
// if refcounts != nil {
|
|
||||||
// mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
|
|
||||||
// }
|
|
||||||
|
|
||||||
|
|
||||||
lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix)
|
lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix)
|
||||||
|
|
||||||
// if lcs < 0 {
|
|
||||||
// log.Print("Max error wrongly estimated", mine, maxe)
|
|
||||||
// log.Println(string(sequence.Sequence()))
|
|
||||||
// log.Fatalln(string(ref.Sequence()))
|
|
||||||
|
|
||||||
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
|
|
||||||
// lcs, alilength = obialign.LCSScore(sequence, ref, matrix)
|
|
||||||
// }
|
|
||||||
score[i] = alilength - lcs
|
score[i] = alilength - lcs
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,7 +41,7 @@ func IndexSequence(seqidx int,
|
|||||||
log.Panicln(err)
|
log.Panicln(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
ecotag_index := make(map[int]string)
|
obitag_index := make(map[int]string)
|
||||||
|
|
||||||
for _, idx := range o {
|
for _, idx := range o {
|
||||||
new_taxid, err := taxo.Taxon(references[idx].Taxid())
|
new_taxid, err := taxo.Taxon(references[idx].Taxid())
|
||||||
@ -76,7 +59,7 @@ func IndexSequence(seqidx int,
|
|||||||
if current_taxid.Taxid() != new_taxid.Taxid() {
|
if current_taxid.Taxid() != new_taxid.Taxid() {
|
||||||
|
|
||||||
if new_score > current_score {
|
if new_score > current_score {
|
||||||
ecotag_index[score[current_idx]] = fmt.Sprintf(
|
obitag_index[score[current_idx]] = fmt.Sprintf(
|
||||||
"%d@%s@%s",
|
"%d@%s@%s",
|
||||||
current_taxid.Taxid(),
|
current_taxid.Taxid(),
|
||||||
current_taxid.ScientificName(),
|
current_taxid.ScientificName(),
|
||||||
@ -89,15 +72,15 @@ func IndexSequence(seqidx int,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ecotag_index[score[current_idx]] = fmt.Sprintf(
|
obitag_index[score[current_idx]] = fmt.Sprintf(
|
||||||
"%d@%s@%s",
|
"%d@%s@%s",
|
||||||
current_taxid.Taxid(),
|
current_taxid.Taxid(),
|
||||||
current_taxid.ScientificName(),
|
current_taxid.ScientificName(),
|
||||||
current_taxid.Rank())
|
current_taxid.Rank())
|
||||||
|
|
||||||
sequence.SetAttribute("ecotag_ref_index", ecotag_index)
|
sequence.SetAttribute("obitag_ref_index", obitag_index)
|
||||||
|
|
||||||
return ecotag_index
|
return obitag_index
|
||||||
}
|
}
|
||||||
|
|
||||||
func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||||
@ -142,7 +125,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBa
|
|||||||
for l := range limits {
|
for l := range limits {
|
||||||
sl := obiseq.MakeBioSequenceSlice()
|
sl := obiseq.MakeBioSequenceSlice()
|
||||||
for i := l[0]; i < l[1]; i++ {
|
for i := l[0]; i < l[1]; i++ {
|
||||||
IndexSequence(i, references, refcounts, taxo)
|
IndexSequence(i, references, taxo)
|
||||||
sl = append(sl, references[i])
|
sl = append(sl, references[i])
|
||||||
}
|
}
|
||||||
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
|
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
|
||||||
|
@ -1,8 +1,8 @@
|
|||||||
package obitag
|
package obitag
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
|
||||||
"log"
|
"log"
|
||||||
|
"math"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -14,118 +14,93 @@ import (
|
|||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
|
||||||
)
|
)
|
||||||
|
|
||||||
func IndexSequence(seqidx int,
|
func FindClosests(sequence *obiseq.BioSequence,
|
||||||
references obiseq.BioSequenceSlice,
|
references obiseq.BioSequenceSlice,
|
||||||
refcounts []*obikmer.Table4mer,
|
refcounts []*obikmer.Table4mer,
|
||||||
taxo *obitax.Taxonomy) map[int]string {
|
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
|
||||||
|
|
||||||
sequence := references[seqidx]
|
|
||||||
matrix := obialign.NewLCSMatrix(nil,
|
matrix := obialign.NewLCSMatrix(nil,
|
||||||
sequence.Length(),
|
sequence.Length(),
|
||||||
sequence.Length(),
|
sequence.Length(),
|
||||||
sequence.Length())
|
sequence.Length())
|
||||||
|
|
||||||
score := make([]int, len(references))
|
seqwords := obikmer.Count4Mer(sequence, nil, nil)
|
||||||
for i, ref := range references {
|
cw := make([]int, len(refcounts))
|
||||||
maxe := goutils.MaxInt(sequence.Length(), ref.Length())
|
|
||||||
mine := 0
|
|
||||||
if refcounts != nil {
|
|
||||||
mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
|
|
||||||
}
|
|
||||||
lcs, alilength := obialign.LCSScore(sequence, ref, (maxe+1)*2, matrix)
|
|
||||||
|
|
||||||
if lcs < 0 {
|
for i, ref := range refcounts {
|
||||||
log.Print("Max error wrongly estimated", mine, maxe)
|
cw[i] = obikmer.Common4Mer(seqwords, ref)
|
||||||
log.Println(string(sequence.Sequence()))
|
// if i < 50 {
|
||||||
log.Fatalln(string(ref.Sequence()))
|
// print(cw[i])
|
||||||
|
// print(";")
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
// print("\n")
|
||||||
|
|
||||||
maxe := goutils.MaxInt(sequence.Length(), ref.Length())
|
o := goutils.ReverseIntOrder(cw)
|
||||||
lcs, alilength = obialign.LCSScore(sequence, ref, maxe, matrix)
|
|
||||||
|
mcw := 100000
|
||||||
|
for _, i := range o {
|
||||||
|
if cw[i] < mcw {
|
||||||
|
mcw = cw[i]
|
||||||
|
}
|
||||||
|
if cw[i] > mcw {
|
||||||
|
log.Panicln("wrong order")
|
||||||
}
|
}
|
||||||
score[i] = alilength - lcs
|
|
||||||
}
|
}
|
||||||
|
|
||||||
o := goutils.IntOrder(score)
|
bests := obiseq.MakeBioSequenceSlice()
|
||||||
|
bests = append(bests, references[o[0]])
|
||||||
|
bestidxs := make([]int, 0)
|
||||||
|
bestidxs = append(bestidxs, o[0])
|
||||||
|
bestId := 0.0
|
||||||
|
bestmatch := references[o[0]].Id()
|
||||||
|
|
||||||
current_taxid, err := taxo.Taxon(references[o[0]].Taxid())
|
maxe := 0
|
||||||
current_score := score[o[0]]
|
n := 0
|
||||||
current_idx := o[0]
|
nf := 0
|
||||||
|
|
||||||
if err != nil {
|
for i, j := range o {
|
||||||
log.Panicln(err)
|
ref := references[j]
|
||||||
}
|
|
||||||
|
|
||||||
ecotag_index := make(map[int]string)
|
lmin, lmax := goutils.MinMaxInt(sequence.Length(), ref.Length())
|
||||||
|
atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2
|
||||||
|
|
||||||
for _, idx := range o {
|
if i == 0 {
|
||||||
new_taxid, err := taxo.Taxon(references[idx].Taxid())
|
maxe = goutils.MaxInt(sequence.Length(), ref.Length())
|
||||||
if err != nil {
|
|
||||||
log.Panicln(err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
new_taxid, err = current_taxid.LCA(new_taxid)
|
// log.Println(sequence.Id(),cw[j], maxe)
|
||||||
if err != nil {
|
if runExact || (atMost <= (maxe + 1)) {
|
||||||
log.Panicln(err)
|
lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
|
||||||
}
|
n++
|
||||||
|
if lcs == -1 {
|
||||||
new_score := score[idx]
|
nf++
|
||||||
|
// That aligment is worst than maxe, go to the next sequence
|
||||||
if current_taxid.Taxid() != new_taxid.Taxid() {
|
continue
|
||||||
|
|
||||||
if new_score > current_score {
|
|
||||||
ecotag_index[score[current_idx]] = fmt.Sprintf(
|
|
||||||
"%d@%s@%s",
|
|
||||||
current_taxid.Taxid(),
|
|
||||||
current_taxid.ScientificName(),
|
|
||||||
current_taxid.Rank())
|
|
||||||
current_score = new_score
|
|
||||||
}
|
}
|
||||||
|
|
||||||
current_taxid = new_taxid
|
score := alilength - lcs
|
||||||
current_idx = idx
|
if score < maxe {
|
||||||
}
|
bests = bests[:0]
|
||||||
}
|
bestidxs = bestidxs[:0]
|
||||||
|
maxe = score
|
||||||
|
bestId = float64(lcs) / float64(alilength)
|
||||||
|
// log.Println(best.Id(), maxe, bestId)
|
||||||
|
}
|
||||||
|
|
||||||
ecotag_index[score[current_idx]] = fmt.Sprintf(
|
if score == maxe {
|
||||||
"%d@%s@%s",
|
bests = append(bests, ref)
|
||||||
current_taxid.Taxid(),
|
bestidxs = append(bestidxs, j)
|
||||||
current_taxid.ScientificName(),
|
id := float64(lcs) / float64(alilength)
|
||||||
current_taxid.Rank())
|
if id > bestId {
|
||||||
|
bestId = id
|
||||||
|
bestmatch = ref.Id()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sequence.SetAttribute("ecotag_ref_index", ecotag_index)
|
|
||||||
|
|
||||||
return ecotag_index
|
|
||||||
}
|
|
||||||
|
|
||||||
func FindClosest(sequence *obiseq.BioSequence,
|
|
||||||
references obiseq.BioSequenceSlice) (*obiseq.BioSequence, int, float64, int) {
|
|
||||||
|
|
||||||
matrix := obialign.NewLCSMatrix(nil,
|
|
||||||
sequence.Length(),
|
|
||||||
sequence.Length(),
|
|
||||||
sequence.Length())
|
|
||||||
|
|
||||||
maxe := goutils.MaxInt(sequence.Length(), references[0].Length())
|
|
||||||
best := references[0]
|
|
||||||
bestidx := 0
|
|
||||||
bestId := 0.0
|
|
||||||
|
|
||||||
for i, ref := range references {
|
|
||||||
lcs, alilength := obialign.LCSScore(sequence, ref, maxe, matrix)
|
|
||||||
if lcs == -1 {
|
|
||||||
// That aligment is worst than maxe, go to the next sequence
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
score := alilength - lcs
|
|
||||||
if score < maxe {
|
|
||||||
best = references[i]
|
|
||||||
bestidx = i
|
|
||||||
maxe = score
|
|
||||||
bestId = float64(lcs) / float64(alilength)
|
|
||||||
// log.Println(best.Id(), maxe, bestId)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if maxe == 0 {
|
if maxe == 0 {
|
||||||
@ -133,49 +108,71 @@ func FindClosest(sequence *obiseq.BioSequence,
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return best, maxe, bestId, bestidx
|
// log.Println("that's all falks", n, nf, maxe, bestId, bestidx)
|
||||||
|
return bests, maxe, bestId, bestmatch, bestidxs
|
||||||
}
|
}
|
||||||
|
|
||||||
func Identify(sequence *obiseq.BioSequence,
|
func Identify(sequence *obiseq.BioSequence,
|
||||||
references obiseq.BioSequenceSlice,
|
references obiseq.BioSequenceSlice,
|
||||||
refcounts []*obikmer.Table4mer,
|
refcounts []*obikmer.Table4mer,
|
||||||
taxo *obitax.Taxonomy) *obiseq.BioSequence {
|
taxo *obitax.Taxonomy,
|
||||||
best, differences, identity, seqidx := FindClosest(sequence, references)
|
runExact bool) *obiseq.BioSequence {
|
||||||
|
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
|
||||||
|
|
||||||
|
taxon := (*obitax.TaxNode)(nil)
|
||||||
|
|
||||||
|
for i, best := range bests {
|
||||||
|
idx := best.OBITagRefIndex()
|
||||||
|
if idx == nil {
|
||||||
|
// log.Fatalln("Need of indexing")
|
||||||
|
idx = obirefidx.IndexSequence(seqidxs[i], references, taxo)
|
||||||
|
}
|
||||||
|
|
||||||
|
d := differences
|
||||||
|
identification, ok := idx[d]
|
||||||
|
for !ok && d >= 0 {
|
||||||
|
identification, ok = idx[d]
|
||||||
|
d--
|
||||||
|
}
|
||||||
|
|
||||||
|
parts := strings.Split(identification, "@")
|
||||||
|
match_taxid, err := strconv.Atoi(parts[0])
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicln("Cannot extract taxid from :", identification)
|
||||||
|
}
|
||||||
|
|
||||||
|
match_taxon, err := taxo.Taxon(match_taxid)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
|
||||||
|
}
|
||||||
|
|
||||||
|
if taxon != nil {
|
||||||
|
taxon, _ = taxon.LCA(match_taxon)
|
||||||
|
} else {
|
||||||
|
taxon = match_taxon
|
||||||
|
}
|
||||||
|
|
||||||
idx := best.EcotagRefIndex()
|
|
||||||
if idx == nil {
|
|
||||||
idx = IndexSequence(seqidx, references, refcounts, taxo)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
d := differences
|
sequence.SetTaxid(taxon.Taxid())
|
||||||
identification, ok := idx[d]
|
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||||
for !ok && d >= 0 {
|
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||||
identification, ok = idx[d]
|
sequence.SetAttribute("obitag_bestid", identity)
|
||||||
d--
|
sequence.SetAttribute("obitag_difference", differences)
|
||||||
}
|
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||||
|
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||||
parts := strings.Split(identification, "@")
|
|
||||||
taxid, err := strconv.Atoi(parts[0])
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Panicln("Cannot extract taxid from :", identification)
|
|
||||||
}
|
|
||||||
|
|
||||||
sequence.SetTaxid(taxid)
|
|
||||||
sequence.SetAttribute("scientific_name", parts[1])
|
|
||||||
sequence.SetAttribute("ecotag_rank", parts[2])
|
|
||||||
sequence.SetAttribute("ecotag_id", identity)
|
|
||||||
sequence.SetAttribute("ecotag_difference", differences)
|
|
||||||
sequence.SetAttribute("ecotag_match", best.Id())
|
|
||||||
|
|
||||||
return sequence
|
return sequence
|
||||||
}
|
}
|
||||||
|
|
||||||
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
|
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
|
||||||
refcounts []*obikmer.Table4mer,
|
refcounts []*obikmer.Table4mer,
|
||||||
taxo *obitax.Taxonomy) obiiter.SeqWorker {
|
taxo *obitax.Taxonomy,
|
||||||
|
runExact bool) obiiter.SeqWorker {
|
||||||
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
return Identify(sequence, references, refcounts, taxo)
|
return Identify(sequence, references, refcounts, taxo, runExact)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,7 +195,7 @@ func AssignTaxonomy(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatc
|
|||||||
log.Panicln(error)
|
log.Panicln(error)
|
||||||
}
|
}
|
||||||
|
|
||||||
worker := IdentifySeqWorker(references, refcounts, taxo)
|
worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact())
|
||||||
|
|
||||||
return iterator.Rebatch(10).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
|
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Speed("Annotated sequences").Rebatch(1000)
|
||||||
}
|
}
|
||||||
|
@ -11,14 +11,18 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var _RefDB = ""
|
var _RefDB = ""
|
||||||
|
var _RunExact = false
|
||||||
|
|
||||||
func TagOptionSet(options *getoptions.GetOpt) {
|
func TagOptionSet(options *getoptions.GetOpt) {
|
||||||
options.StringVar(&_RefDB, "reference-db",_RefDB,
|
options.StringVar(&_RefDB, "reference-db", _RefDB,
|
||||||
options.Alias("R"),
|
options.Alias("R"),
|
||||||
options.Required(),
|
options.Required(),
|
||||||
options.ArgName("FILENAME"),
|
options.ArgName("FILENAME"),
|
||||||
options.Description("The name of the file containing the reference DB"))
|
options.Description("The name of the file containing the reference DB"))
|
||||||
|
|
||||||
|
options.BoolVar(&_RunExact, "exact", _RunExact,
|
||||||
|
options.Alias("E"),
|
||||||
|
options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -26,7 +30,7 @@ func TagOptionSet(options *getoptions.GetOpt) {
|
|||||||
// the obiuniq command
|
// the obiuniq command
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
obiconvert.OptionSet(options)
|
obiconvert.OptionSet(options)
|
||||||
obifind.LoadTaxonomyOptionSet(options,true,false)
|
obifind.LoadTaxonomyOptionSet(options, true, false)
|
||||||
TagOptionSet(options)
|
TagOptionSet(options)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -35,11 +39,15 @@ func CLIRefDBName() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func CLIRefDB() obiseq.BioSequenceSlice {
|
func CLIRefDB() obiseq.BioSequenceSlice {
|
||||||
refdb,err := obiformats.ReadSequencesBatchFromFile(_RefDB)
|
refdb, err := obiformats.ReadSequencesBatchFromFile(_RefDB)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("Cannot open the reference library file : %s\n",_RefDB)
|
log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
|
||||||
}
|
}
|
||||||
|
|
||||||
return refdb.Load()
|
return refdb.Load()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLIRunExact() bool {
|
||||||
|
return _RunExact
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user