mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Correction a a bug and transfert of the new matching rules from obirefidx to obitag
Former-commit-id: a28540f72a504ad4e7e8a8b6f7609116913445b4
This commit is contained in:
@ -48,32 +48,44 @@ func IndexSequence(seqidx int,
|
|||||||
obiutils.Reverse(*pseq, true)
|
obiutils.Reverse(*pseq, true)
|
||||||
// score := make([]int, len(references))
|
// score := make([]int, len(references))
|
||||||
mindiff := make([]int, len(*pseq))
|
mindiff := make([]int, len(*pseq))
|
||||||
nseq := make([]int, len(*pseq))
|
/* nseq := make([]int, len(*pseq))
|
||||||
nali := make([]int, len(*pseq))
|
nali := make([]int, len(*pseq))
|
||||||
nok := make([]int, len(*pseq))
|
nok := make([]int, len(*pseq))
|
||||||
lseq := sequence.Len()
|
nfast := make([]int, len(*pseq))
|
||||||
|
nfastok := make([]int, len(*pseq))
|
||||||
|
*/ lseq := sequence.Len()
|
||||||
|
|
||||||
mini := -1
|
mini := -1
|
||||||
for i, ancestor := range *pseq {
|
for i, ancestor := range *pseq {
|
||||||
for _, order := range ow {
|
for _, order := range ow {
|
||||||
if lca[order] == ancestor {
|
if lca[order] == ancestor {
|
||||||
nseq[i]++
|
// nseq[i]++
|
||||||
wordmin := 0
|
wordmin := 0
|
||||||
if mini != -1 {
|
if mini != -1 {
|
||||||
wordmin = obiutils.MaxInt(lseq-3-mini*4, 0)
|
wordmin = obiutils.MaxInt(lseq-3-mini*4, 0)
|
||||||
}
|
}
|
||||||
lcs, alilength := -1, -1
|
lcs, alilength := -1, -1
|
||||||
if cw[order] >= wordmin {
|
errs := int(1e9)
|
||||||
nali[i]++
|
if mini != -1 && mini <= 1 {
|
||||||
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
|
// nfast[i]++
|
||||||
if lcs >= 0 {
|
d, _, _, _ := obialign.D1Or0(sequence, references[order])
|
||||||
nok[i]++
|
if d >= 0 {
|
||||||
errs := alilength - lcs
|
errs = d
|
||||||
if mini == -1 || errs < mini {
|
// nfastok[i]++
|
||||||
mini = errs
|
}
|
||||||
|
} else {
|
||||||
|
if cw[order] >= wordmin {
|
||||||
|
// nali[i]++
|
||||||
|
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
|
||||||
|
if lcs >= 0 {
|
||||||
|
// nok[i]++
|
||||||
|
errs = alilength - lcs
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if mini == -1 || errs < mini {
|
||||||
|
mini = errs
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mindiff[i] = mini
|
mindiff[i] = mini
|
||||||
@ -94,11 +106,13 @@ func IndexSequence(seqidx int,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index)
|
/* log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), obitag_index)
|
||||||
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq)
|
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nseq)
|
||||||
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali)
|
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfast)
|
||||||
// log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok)
|
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nfastok)
|
||||||
return obitag_index
|
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nali)
|
||||||
|
log.Println(sequence.Id(), tseq.Taxid(), tseq.ScientificName(), tseq.Rank(), nok)
|
||||||
|
*/ return obitag_index
|
||||||
}
|
}
|
||||||
|
|
||||||
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
package obitag
|
package obitag
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -32,79 +31,65 @@ func FindClosests(sequence *obiseq.BioSequence,
|
|||||||
cw[i] = obikmer.Common4Mer(seqwords, ref)
|
cw[i] = obikmer.Common4Mer(seqwords, ref)
|
||||||
}
|
}
|
||||||
|
|
||||||
o := obiutils.ReverseIntOrder(cw)
|
o := obiutils.Reverse(obiutils.IntOrder(cw), true)
|
||||||
|
|
||||||
// mcw := 100000
|
|
||||||
// for _, i := range o {
|
|
||||||
// if cw[i] < mcw {
|
|
||||||
// mcw = cw[i]
|
|
||||||
// }
|
|
||||||
// if cw[i] > mcw {
|
|
||||||
// log.Panicln("wrong order")
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
bests := obiseq.MakeBioSequenceSlice()
|
bests := obiseq.MakeBioSequenceSlice()
|
||||||
bests = append(bests, references[o[0]])
|
// bests = append(bests, references[o[0]])
|
||||||
bestidxs := make([]int, 0)
|
bestidxs := make([]int, 0)
|
||||||
bestidxs = append(bestidxs, o[0])
|
// bestidxs = append(bestidxs, o[0])
|
||||||
bestId := 0.0
|
bestId := 0.0
|
||||||
bestmatch := references[o[0]].Id()
|
bestmatch := references[o[0]].Id()
|
||||||
|
|
||||||
maxe := 0
|
maxe := -1
|
||||||
n := 0
|
wordmin := 0
|
||||||
nf := 0
|
|
||||||
|
|
||||||
for i, j := range o {
|
for _, order := range o {
|
||||||
ref := references[j]
|
ref := references[order]
|
||||||
|
|
||||||
lmin, lmax := obiutils.MinMaxInt(sequence.Len(), ref.Len())
|
if maxe != -1 {
|
||||||
atMost := lmax - lmin + int(math.Ceil(float64(lmin-3-cw[j])/4.0)) - 2
|
wordmin = obiutils.MaxInt(sequence.Len(), ref.Len()) - 4*maxe
|
||||||
|
|
||||||
if i == 0 {
|
|
||||||
maxe = obiutils.MaxInt(sequence.Len(), ref.Len())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// log.Println(sequence.Id(),cw[j], maxe)
|
lcs, alilength := -1, -1
|
||||||
if runExact || (atMost <= (maxe + 1)) {
|
score := int(1e9)
|
||||||
// if true {
|
if maxe == 0 || maxe == 1 {
|
||||||
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
|
d, _, _, _ := obialign.D1Or0(sequence, references[order])
|
||||||
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
|
if d >= 0 {
|
||||||
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
|
score = d
|
||||||
n++
|
alilength = obiutils.MaxInt(sequence.Len(), ref.Len())
|
||||||
if lcs == -1 {
|
lcs = alilength - score
|
||||||
nf++
|
|
||||||
// That aligment is worst than maxe, go to the next sequence
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
score := alilength - lcs
|
if cw[order] >= wordmin {
|
||||||
if score < maxe {
|
lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix)
|
||||||
bests = bests[:0]
|
if lcs >= 0 {
|
||||||
bestidxs = bestidxs[:0]
|
score = alilength - lcs
|
||||||
maxe = score
|
|
||||||
bestId = float64(lcs) / float64(alilength)
|
|
||||||
// log.Println(best.Id(), maxe, bestId)
|
|
||||||
}
|
|
||||||
|
|
||||||
if score == maxe {
|
|
||||||
bests = append(bests, ref)
|
|
||||||
bestidxs = append(bestidxs, j)
|
|
||||||
id := float64(lcs) / float64(alilength)
|
|
||||||
if id > bestId {
|
|
||||||
bestId = id
|
|
||||||
bestmatch = ref.Id()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if maxe == 0 {
|
if maxe == -1 || score < maxe {
|
||||||
// We have found identity no need to continue to search
|
bests = bests[:0]
|
||||||
break
|
bestidxs = bestidxs[:0]
|
||||||
|
maxe = score
|
||||||
|
bestId = float64(lcs) / float64(alilength)
|
||||||
|
// log.Println(ref.Id(), maxe, bestId,bestidxs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if score == maxe {
|
||||||
|
bests = append(bests, ref)
|
||||||
|
bestidxs = append(bestidxs, order)
|
||||||
|
id := float64(lcs) / float64(alilength)
|
||||||
|
if id > bestId {
|
||||||
|
bestId = id
|
||||||
|
bestmatch = ref.Id()
|
||||||
|
}
|
||||||
|
// log.Println(ref.Id(), maxe, bestId,bestidxs)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
// log.Println("that's all falks", n, nf, maxe, bestId, bestidx)
|
|
||||||
|
//log.Println("that's all falks", maxe, bestId, bestidxs)
|
||||||
return bests, maxe, bestId, bestmatch, bestidxs
|
return bests, maxe, bestId, bestmatch, bestidxs
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -170,7 +155,7 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
|
|||||||
taxo *obitax.Taxonomy,
|
taxo *obitax.Taxonomy,
|
||||||
runExact bool) obiseq.SeqWorker {
|
runExact bool) obiseq.SeqWorker {
|
||||||
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
return Identify(sequence, references, refcounts, taxa,taxo, runExact)
|
return Identify(sequence, references, refcounts, taxa, taxo, runExact)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,10 +179,9 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
|
|
||||||
for i, seq := range references {
|
for i, seq := range references {
|
||||||
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
||||||
taxa[i],_= taxo.Taxon(seq.Taxid())
|
taxa[i], _ = taxo.Taxon(seq.Taxid())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
|
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
|
||||||
|
|
||||||
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
|
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
|
||||||
|
@ -20,9 +20,9 @@ func TagOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.ArgName("FILENAME"),
|
options.ArgName("FILENAME"),
|
||||||
options.Description("The name of the file containing the reference DB"))
|
options.Description("The name of the file containing the reference DB"))
|
||||||
|
|
||||||
options.BoolVar(&_RunExact, "exact", _RunExact,
|
// options.BoolVar(&_RunExact, "exact", _RunExact,
|
||||||
options.Alias("E"),
|
// options.Alias("E"),
|
||||||
options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ func Order[T sort.Interface](data T) []int {
|
|||||||
rk.r[i] = i
|
rk.r[i] = i
|
||||||
}
|
}
|
||||||
|
|
||||||
sort.Sort(rk)
|
sort.Stable(rk)
|
||||||
|
|
||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user