Correction a a bug and transfert of the new matching rules from obirefidx to obitag

Former-commit-id: a28540f72a504ad4e7e8a8b6f7609116913445b4
This commit is contained in:
2023-05-05 07:37:19 +02:00
parent 88a9e38952
commit 3778ae9266
4 changed files with 78 additions and 80 deletions

View File

@ -48,34 +48,46 @@ func IndexSequence(seqidx int,
obiutils.Reverse(*pseq, true)
// score := make([]int, len(references))
mindiff := make([]int, len(*pseq))
nseq := make([]int, len(*pseq))
/* nseq := make([]int, len(*pseq))
nali := make([]int, len(*pseq))
nok := make([]int, len(*pseq))
lseq := sequence.Len()
nfast := make([]int, len(*pseq))
nfastok := make([]int, len(*pseq))
*/ lseq := sequence.Len()
mini := -1
for i, ancestor := range *pseq {
for _, order := range ow {
if lca[order] == ancestor {
nseq[i]++
// nseq[i]++
wordmin := 0
if mini != -1 {
wordmin = obiutils.MaxInt(lseq-3-mini*4, 0)
}
lcs, alilength := -1, -1
errs := int(1e9)
if mini != -1 && mini <= 1 {
// nfast[i]++
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
errs = d
// nfastok[i]++
}
} else {
if cw[order] >= wordmin {
nali[i]++
// nali[i]++
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
nok[i]++
errs := alilength - lcs
// nok[i]++
errs = alilength - lcs
}
}
}
if mini == -1 || errs < mini {
mini = errs
}
}
}
}
}
mindiff[i] = mini
}
@ -94,11 +106,13 @@ func IndexSequence(seqidx int,
}
}
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok)
return obitag_index
/* log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfast)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfastok)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok)
*/ return obitag_index
}
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {

View File

@ -1,7 +1,6 @@
package obitag
import (
"math"
"strconv"
"strings"
@ -32,79 +31,65 @@ func FindClosests(sequence *obiseq.BioSequence,
cw[i] = obikmer.Common4Mer(seqwords, ref)
}
o := obiutils.ReverseIntOrder(cw)
// mcw := 100000
// for _, i := range o {
// if cw[i] < mcw {
// mcw = cw[i]
// }
// if cw[i] > mcw {
// log.Panicln("wrong order")
// }
// }
o := obiutils.Reverse(obiutils.IntOrder(cw), true)
bests := obiseq.MakeBioSequenceSlice()
bests = append(bests, references[o[0]])
// bests = append(bests, references[o[0]])
bestidxs := make([]int, 0)
bestidxs = append(bestidxs, o[0])
// bestidxs = append(bestidxs, o[0])
bestId := 0.0
bestmatch := references[o[0]].Id()
maxe := 0
n := 0
nf := 0
maxe := -1
wordmin := 0
for i, j := range o {
ref := references[j]
for _, order := range o {
ref := references[order]
lmin, lmax := obiutils.MinMaxInt(sequence.Len(), ref.Len())
atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2
if i == 0 {
maxe = obiutils.MaxInt(sequence.Len(), ref.Len())
if maxe != -1 {
wordmin = obiutils.MaxInt(sequence.Len(), ref.Len()) - 4*maxe
}
// log.Println(sequence.Id(),cw[j], maxe)
if runExact || (atMost <= (maxe + 1)) {
// if true {
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
n++
if lcs == -1 {
nf++
// That aligment is worst than maxe, go to the next sequence
continue
lcs, alilength := -1, -1
score := int(1e9)
if maxe == 0 || maxe == 1 {
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
score = d
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
lcs = alilength - score
}
} else {
if cw[order] >= wordmin {
lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix)
if lcs >= 0 {
score = alilength - lcs
}
}
}
score := alilength - lcs
if score < maxe {
if maxe == -1 || score < maxe {
bests = bests[:0]
bestidxs = bestidxs[:0]
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(best.Id(), maxe, bestId)
// log.Println(ref.Id(), maxe, bestId,bestidxs)
}
if score == maxe {
bests = append(bests, ref)
bestidxs = append(bestidxs, j)
bestidxs = append(bestidxs, order)
id := float64(lcs) / float64(alilength)
if id > bestId {
bestId = id
bestmatch = ref.Id()
}
// log.Println(ref.Id(), maxe, bestId,bestidxs)
}
}
if maxe == 0 {
// We have found identity no need to continue to search
break
}
}
// log.Println("that's all falks", n, nf, maxe, bestId, bestidx)
//log.Println("that's all falks", maxe, bestId, bestidxs)
return bests, maxe, bestId, bestmatch, bestidxs
}
@ -170,7 +155,7 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxa,taxo, runExact)
return Identify(sequence, references, refcounts, taxa, taxo, runExact)
}
}
@ -194,10 +179,9 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[i],_= taxo.Taxon(seq.Taxid())
taxa[i], _ = taxo.Taxon(seq.Taxid())
}
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)

View File

@ -20,9 +20,9 @@ func TagOptionSet(options *getoptions.GetOpt) {
options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB"))
options.BoolVar(&_RunExact, "exact", _RunExact,
options.Alias("E"),
options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
// options.BoolVar(&_RunExact, "exact", _RunExact,
// options.Alias("E"),
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
}

View File

@ -80,7 +80,7 @@ func Order[T sort.Interface](data T) []int {
rk.r[i] = i
}
sort.Sort(rk)
sort.Stable(rk)
return r
}