New refdb indexing algorithm

Former-commit-id: 7ae480cec5d277a08620a2e666b5c9b69bb6cd73
This commit is contained in:
2023-05-03 15:32:31 +02:00
parent e8bd130ffc
commit 870cb1aabb
2 changed files with 123 additions and 24 deletions

View File

@ -2,9 +2,10 @@ package obirefidx
import ( import (
"fmt" "fmt"
"log"
"os" "os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
@ -18,6 +19,8 @@ import (
func IndexSequence(seqidx int, func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice, references obiseq.BioSequenceSlice,
kmers *[]*obikmer.Table4mer,
taxa *obitax.TaxonSet,
taxo *obitax.Taxonomy) map[int]string { taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx] sequence := references[seqidx]
@ -27,8 +30,62 @@ func IndexSequence(seqidx int,
var matrix []uint64 var matrix []uint64
score := make([]int, len(references)) lca := make(obitax.TaxonSet, len(references))
// t := 0 tref := (*taxa)[seqidx]
for i, taxon := range (*taxa) {
lca[i],_ = tref.LCA(taxon)
}
cw := make([]int, len(references))
sw := (*kmers)[seqidx]
for i, ref := range *kmers {
cw[i] = obikmer.Common4Mer(sw,ref)
}
ow := obiutils.Reverse(obiutils.IntOrder(cw),true)
pref,_ := tref.Path()
obiutils.Reverse(*pref,true)
// score := make([]int, len(references))
mindiff := make([]int, len(*pref))
for i,ancestor := range *pref {
mini := -1
for _,order := range ow {
if lca[order] == ancestor {
lcs, alilength := obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
errs := alilength - lcs
if mini== -1 || errs < mini {
mini = errs
}
}
}
}
if mini != -1 {
mindiff[i] = mini
} else {
mindiff[i] = 1e6
}
}
obitag_index := make(map[int]string, len(*pref))
old := sequence.Len()
for i,d := range mindiff {
if d < old {
current_taxid :=(*pref)[i]
obitag_index[d] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
old = d
}
}
/* // t := 0
// r := 0 // r := 0
// w := 0 // w := 0
for i, ref := range references { for i, ref := range references {
@ -84,13 +141,51 @@ func IndexSequence(seqidx int,
current_taxid.Taxid(), current_taxid.Taxid(),
current_taxid.ScientificName(), current_taxid.ScientificName(),
current_taxid.Rank()) current_taxid.Rank())
*/
//log.Println(obitag_index)
return obitag_index return obitag_index
} }
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Loading database...")
references := iterator.Load() references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
log.Infoln("Indexing sequence taxids...")
taxa := make(
obitax.TaxonSet,
len(references))
j := 0
for i, seq := range references {
taxon, err := taxo.Taxon(seq.Taxid())
if err == nil {
taxa[j] = taxon
references[j] = references[i]
j++
}
}
if j < len(references) {
if len(references)-j == 1 {
log.Infoln("1 sequence has no valid taxid and has been discarded")
} else {
log.Infof("%d sequences have no valid taxid and has been discarded", len(references)-j)
}
references = references[0:j]
} else {
log.Infoln("Done.")
}
log.Infoln("Indexing database kmers...")
refcounts := make( refcounts := make(
[]*obikmer.Table4mer, []*obikmer.Table4mer,
len(references)) len(references))
@ -101,10 +196,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
} }
taxo, error := obifind.CLILoadSelectedTaxonomy() log.Info("done")
if error != nil {
log.Panicln(error)
}
pbopt := make([]progressbar.Option, 0, 5) pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt, pbopt = append(pbopt,
@ -130,13 +222,13 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for l := range limits { for l := range limits {
sl := obiseq.MakeBioSequenceSlice() sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ { for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, references, taxo) idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
iref := references[i].Copy() iref := references[i].Copy()
iref.SetAttribute("obitag_ref_index", idx) iref.SetAttribute("obitag_ref_index", idx)
sl = append(sl, iref) sl = append(sl, iref)
bar.Add(1)
} }
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
bar.Add(len(sl))
} }
indexed.Done() indexed.Done()

View File

@ -111,6 +111,7 @@ func FindClosests(sequence *obiseq.BioSequence,
func Identify(sequence *obiseq.BioSequence, func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice, references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxo *obitax.Taxonomy, taxo *obitax.Taxonomy,
runExact bool) *obiseq.BioSequence { runExact bool) *obiseq.BioSequence {
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
@ -121,7 +122,7 @@ func Identify(sequence *obiseq.BioSequence,
idx := best.OBITagRefIndex() idx := best.OBITagRefIndex()
if idx == nil { if idx == nil {
// log.Fatalln("Need of indexing") // log.Fatalln("Need of indexing")
idx = obirefidx.IndexSequence(seqidxs[i], references, taxo) idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
} }
d := differences d := differences
@ -165,33 +166,39 @@ func Identify(sequence *obiseq.BioSequence,
func IdentifySeqWorker(references obiseq.BioSequenceSlice, func IdentifySeqWorker(references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxo *obitax.Taxonomy, taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker { runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxo, runExact) return Identify(sequence, references, refcounts, taxa,taxo, runExact)
} }
} }
func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence { func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
references := CLIRefDB()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
}
taxo, error := obifind.CLILoadSelectedTaxonomy() taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil { if error != nil {
log.Panicln(error) log.Panicln(error)
} }
worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact()) references := CLIRefDB()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
taxa := make(obitax.TaxonSet,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[i],_= taxo.Taxon(seq.Taxid())
}
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
} }