Correction a a bug and transfert of the new matching rules from obirefidx to obitag

Former-commit-id: a28540f72a504ad4e7e8a8b6f7609116913445b4
This commit is contained in:
2023-05-05 07:37:19 +02:00
parent 88a9e38952
commit 3778ae9266
4 changed files with 78 additions and 80 deletions

View File

@ -48,32 +48,44 @@ func IndexSequence(seqidx int,
obiutils.Reverse(*pseq, true) obiutils.Reverse(*pseq, true)
// score := make([]int, len(references)) // score := make([]int, len(references))
mindiff := make([]int, len(*pseq)) mindiff := make([]int, len(*pseq))
nseq := make([]int, len(*pseq)) /* nseq := make([]int, len(*pseq))
nali := make([]int, len(*pseq)) nali := make([]int, len(*pseq))
nok := make([]int, len(*pseq)) nok := make([]int, len(*pseq))
lseq := sequence.Len() nfast := make([]int, len(*pseq))
nfastok := make([]int, len(*pseq))
*/ lseq := sequence.Len()
mini := -1 mini := -1
for i, ancestor := range *pseq { for i, ancestor := range *pseq {
for _, order := range ow { for _, order := range ow {
if lca[order] == ancestor { if lca[order] == ancestor {
nseq[i]++ // nseq[i]++
wordmin := 0 wordmin := 0
if mini != -1 { if mini != -1 {
wordmin = obiutils.MaxInt(lseq-3-mini*4, 0) wordmin = obiutils.MaxInt(lseq-3-mini*4, 0)
} }
lcs, alilength := -1, -1 lcs, alilength := -1, -1
if cw[order] >= wordmin { errs := int(1e9)
nali[i]++ if mini != -1 && mini <= 1 {
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix) // nfast[i]++
if lcs >= 0 { d, _, _, _ := obialign.D1Or0(sequence, references[order])
nok[i]++ if d >= 0 {
errs := alilength - lcs errs = d
if mini == -1 || errs < mini { // nfastok[i]++
mini = errs }
} else {
if cw[order] >= wordmin {
// nali[i]++
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
// nok[i]++
errs = alilength - lcs
} }
} }
} }
if mini == -1 || errs < mini {
mini = errs
}
} }
} }
mindiff[i] = mini mindiff[i] = mini
@ -94,11 +106,13 @@ func IndexSequence(seqidx int,
} }
} }
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index) /* log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq) log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali) log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfast)
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok) log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfastok)
return obitag_index log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali)
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok)
*/ return obitag_index
} }
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {

View File

@ -1,7 +1,6 @@
package obitag package obitag
import ( import (
"math"
"strconv" "strconv"
"strings" "strings"
@ -32,79 +31,65 @@ func FindClosests(sequence *obiseq.BioSequence,
cw[i] = obikmer.Common4Mer(seqwords, ref) cw[i] = obikmer.Common4Mer(seqwords, ref)
} }
o := obiutils.ReverseIntOrder(cw) o := obiutils.Reverse(obiutils.IntOrder(cw), true)
// mcw := 100000
// for _, i := range o {
// if cw[i] < mcw {
// mcw = cw[i]
// }
// if cw[i] > mcw {
// log.Panicln("wrong order")
// }
// }
bests := obiseq.MakeBioSequenceSlice() bests := obiseq.MakeBioSequenceSlice()
bests = append(bests, references[o[0]]) // bests = append(bests, references[o[0]])
bestidxs := make([]int, 0) bestidxs := make([]int, 0)
bestidxs = append(bestidxs, o[0]) // bestidxs = append(bestidxs, o[0])
bestId := 0.0 bestId := 0.0
bestmatch := references[o[0]].Id() bestmatch := references[o[0]].Id()
maxe := 0 maxe := -1
n := 0 wordmin := 0
nf := 0
for i, j := range o { for _, order := range o {
ref := references[j] ref := references[order]
lmin, lmax := obiutils.MinMaxInt(sequence.Len(), ref.Len()) if maxe != -1 {
atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2 wordmin = obiutils.MaxInt(sequence.Len(), ref.Len()) - 4*maxe
if i == 0 {
maxe = obiutils.MaxInt(sequence.Len(), ref.Len())
} }
// log.Println(sequence.Id(),cw[j], maxe) lcs, alilength := -1, -1
if runExact || (atMost <= (maxe + 1)) { score := int(1e9)
// if true { if maxe == 0 || maxe == 1 {
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix) d, _, _, _ := obialign.D1Or0(sequence, references[order])
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs) if d >= 0 {
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) score = d
n++ alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
if lcs == -1 { lcs = alilength - score
nf++
// That aligment is worst than maxe, go to the next sequence
continue
} }
} else {
score := alilength - lcs if cw[order] >= wordmin {
if score < maxe { lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix)
bests = bests[:0] if lcs >= 0 {
bestidxs = bestidxs[:0] score = alilength - lcs
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(best.Id(), maxe, bestId)
}
if score == maxe {
bests = append(bests, ref)
bestidxs = append(bestidxs, j)
id := float64(lcs) / float64(alilength)
if id > bestId {
bestId = id
bestmatch = ref.Id()
} }
} }
} }
if maxe == 0 { if maxe == -1 || score < maxe {
// We have found identity no need to continue to search bests = bests[:0]
break bestidxs = bestidxs[:0]
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(ref.Id(), maxe, bestId,bestidxs)
} }
if score == maxe {
bests = append(bests, ref)
bestidxs = append(bestidxs, order)
id := float64(lcs) / float64(alilength)
if id > bestId {
bestId = id
bestmatch = ref.Id()
}
// log.Println(ref.Id(), maxe, bestId,bestidxs)
}
} }
// log.Println("that's all falks", n, nf, maxe, bestId, bestidx)
//log.Println("that's all falks", maxe, bestId, bestidxs)
return bests, maxe, bestId, bestmatch, bestidxs return bests, maxe, bestId, bestmatch, bestidxs
} }
@ -170,7 +155,7 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy, taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker { runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxa,taxo, runExact) return Identify(sequence, references, refcounts, taxa, taxo, runExact)
} }
} }
@ -194,10 +179,9 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for i, seq := range references { for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[i],_= taxo.Taxon(seq.Taxid()) taxa[i], _ = taxo.Taxon(seq.Taxid())
} }
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact()) worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)

View File

@ -20,9 +20,9 @@ func TagOptionSet(options *getoptions.GetOpt) {
options.ArgName("FILENAME"), options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB")) options.Description("The name of the file containing the reference DB"))
options.BoolVar(&_RunExact, "exact", _RunExact, // options.BoolVar(&_RunExact, "exact", _RunExact,
options.Alias("E"), // options.Alias("E"),
options.Description("Unactivate the heuristic limatitating the sequence comparisons")) // options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
} }

View File

@ -80,7 +80,7 @@ func Order[T sort.Interface](data T) []int {
rk.r[i] = i rk.r[i] = i
} }
sort.Sort(rk) sort.Stable(rk)
return r return r
} }