New refdb indexing algorithm

Former-commit-id: 7ae480cec5d277a08620a2e666b5c9b69bb6cd73
This commit is contained in:
2023-05-03 15:32:31 +02:00
parent e8bd130ffc
commit 870cb1aabb
2 changed files with 123 additions and 24 deletions

View File

@ -2,9 +2,10 @@ package obirefidx
import (
"fmt"
"log"
"os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
@ -18,6 +19,8 @@ import (
func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice,
kmers *[]*obikmer.Table4mer,
taxa *obitax.TaxonSet,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
@ -27,8 +30,62 @@ func IndexSequence(seqidx int,
var matrix []uint64
score := make([]int, len(references))
// t := 0
lca := make(obitax.TaxonSet, len(references))
tref := (*taxa)[seqidx]
for i, taxon := range (*taxa) {
lca[i],_ = tref.LCA(taxon)
}
cw := make([]int, len(references))
sw := (*kmers)[seqidx]
for i, ref := range *kmers {
cw[i] = obikmer.Common4Mer(sw,ref)
}
ow := obiutils.Reverse(obiutils.IntOrder(cw),true)
pref,_ := tref.Path()
obiutils.Reverse(*pref,true)
// score := make([]int, len(references))
mindiff := make([]int, len(*pref))
for i,ancestor := range *pref {
mini := -1
for _,order := range ow {
if lca[order] == ancestor {
lcs, alilength := obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
errs := alilength - lcs
if mini== -1 || errs < mini {
mini = errs
}
}
}
}
if mini != -1 {
mindiff[i] = mini
} else {
mindiff[i] = 1e6
}
}
obitag_index := make(map[int]string, len(*pref))
old := sequence.Len()
for i,d := range mindiff {
if d < old {
current_taxid :=(*pref)[i]
obitag_index[d] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
old = d
}
}
/* // t := 0
// r := 0
// w := 0
for i, ref := range references {
@ -84,13 +141,51 @@ func IndexSequence(seqidx int,
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
*/
//log.Println(obitag_index)
return obitag_index
}
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Loading database...")
references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
log.Infoln("Indexing sequence taxids...")
taxa := make(
obitax.TaxonSet,
len(references))
j := 0
for i, seq := range references {
taxon, err := taxo.Taxon(seq.Taxid())
if err == nil {
taxa[j] = taxon
references[j] = references[i]
j++
}
}
if j < len(references) {
if len(references)-j == 1 {
log.Infoln("1 sequence has no valid taxid and has been discarded")
} else {
log.Infof("%d sequences have no valid taxid and has been discarded", len(references)-j)
}
references = references[0:j]
} else {
log.Infoln("Done.")
}
log.Infoln("Indexing database kmers...")
refcounts := make(
[]*obikmer.Table4mer,
len(references))
@ -101,10 +196,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
}
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
log.Info("done")
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
@ -130,13 +222,13 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for l := range limits {
sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, references, taxo)
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
iref := references[i].Copy()
iref.SetAttribute("obitag_ref_index", idx)
sl = append(sl, iref)
bar.Add(1)
}
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
bar.Add(len(sl))
}
indexed.Done()

View File

@ -111,6 +111,7 @@ func FindClosests(sequence *obiseq.BioSequence,
func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxo *obitax.Taxonomy,
runExact bool) *obiseq.BioSequence {
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
@ -121,7 +122,7 @@ func Identify(sequence *obiseq.BioSequence,
idx := best.OBITagRefIndex()
if idx == nil {
// log.Fatalln("Need of indexing")
idx = obirefidx.IndexSequence(seqidxs[i], references, taxo)
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
}
d := differences
@ -165,33 +166,39 @@ func Identify(sequence *obiseq.BioSequence,
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxo, runExact)
return Identify(sequence, references, refcounts, taxa,taxo, runExact)
}
}
func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
references := CLIRefDB()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
}
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact())
references := CLIRefDB()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
taxa := make(obitax.TaxonSet,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[i],_= taxo.Taxon(seq.Taxid())
}
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
}