From 870cb1aabb36cd78312a9c183bf0cbe046dec865 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 3 May 2023 15:32:31 +0200 Subject: [PATCH] New refdb indexing algorithm Former-commit-id: 7ae480cec5d277a08620a2e666b5c9b69bb6cd73 --- pkg/obitools/obirefidx/obirefidx.go | 112 +++++++++++++++++++++++++--- pkg/obitools/obitag/obitag.go | 35 +++++---- 2 files changed, 123 insertions(+), 24 deletions(-) diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index e1e8aa7..6b67e4f 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -2,9 +2,10 @@ package obirefidx import ( "fmt" - "log" "os" + log "github.com/sirupsen/logrus" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer" @@ -18,6 +19,8 @@ import ( func IndexSequence(seqidx int, references obiseq.BioSequenceSlice, + kmers *[]*obikmer.Table4mer, + taxa *obitax.TaxonSet, taxo *obitax.Taxonomy) map[int]string { sequence := references[seqidx] @@ -27,8 +30,62 @@ func IndexSequence(seqidx int, var matrix []uint64 - score := make([]int, len(references)) - // t := 0 + lca := make(obitax.TaxonSet, len(references)) + tref := (*taxa)[seqidx] + + for i, taxon := range (*taxa) { + lca[i],_ = tref.LCA(taxon) + } + + cw := make([]int, len(references)) + sw := (*kmers)[seqidx] + for i, ref := range *kmers { + cw[i] = obikmer.Common4Mer(sw,ref) + } + + ow := obiutils.Reverse(obiutils.IntOrder(cw),true) + pref,_ := tref.Path() + obiutils.Reverse(*pref,true) + // score := make([]int, len(references)) + mindiff := make([]int, len(*pref)) + + + for i,ancestor := range *pref { + mini := -1 + for _,order := range ow { + if lca[order] == ancestor { + lcs, alilength := obialign.FastLCSScore(sequence, references[order], mini, &matrix) + if lcs >= 0 { + errs := alilength - lcs + if mini== -1 || errs < mini { + mini = errs + } + } + } + } + if mini != -1 { + mindiff[i] = mini + } else { + mindiff[i] = 1e6 + } + } + + obitag_index := make(map[int]string, len(*pref)) + + old := sequence.Len() + for i,d := range mindiff { + if d < old { + current_taxid :=(*pref)[i] + obitag_index[d] = fmt.Sprintf( + "%d@%s@%s", + current_taxid.Taxid(), + current_taxid.ScientificName(), + current_taxid.Rank()) + old = d + } + } + + /* // t := 0 // r := 0 // w := 0 for i, ref := range references { @@ -84,13 +141,51 @@ func IndexSequence(seqidx int, current_taxid.Taxid(), current_taxid.ScientificName(), current_taxid.Rank()) - + */ + //log.Println(obitag_index) return obitag_index } func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { + log.Infoln("Loading database...") references := iterator.Load() + log.Infof("Done. Database contains %d sequences", len(references)) + + taxo, error := obifind.CLILoadSelectedTaxonomy() + if error != nil { + log.Panicln(error) + } + + log.Infoln("Indexing sequence taxids...") + + taxa := make( + obitax.TaxonSet, + len(references)) + + j := 0 + for i, seq := range references { + taxon, err := taxo.Taxon(seq.Taxid()) + if err == nil { + taxa[j] = taxon + references[j] = references[i] + j++ + } + } + + if j < len(references) { + if len(references)-j == 1 { + log.Infoln("1 sequence has no valid taxid and has been discarded") + } else { + log.Infof("%d sequences have no valid taxid and has been discarded", len(references)-j) + } + + references = references[0:j] + } else { + log.Infoln("Done.") + } + + log.Infoln("Indexing database kmers...") refcounts := make( []*obikmer.Table4mer, len(references)) @@ -101,10 +196,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) } - taxo, error := obifind.CLILoadSelectedTaxonomy() - if error != nil { - log.Panicln(error) - } + log.Info("done") pbopt := make([]progressbar.Option, 0, 5) pbopt = append(pbopt, @@ -130,13 +222,13 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { for l := range limits { sl := obiseq.MakeBioSequenceSlice() for i := l[0]; i < l[1]; i++ { - idx := IndexSequence(i, references, taxo) + idx := IndexSequence(i, references, &refcounts, &taxa, taxo) iref := references[i].Copy() iref.SetAttribute("obitag_ref_index", idx) sl = append(sl, iref) + bar.Add(1) } indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) - bar.Add(len(sl)) } indexed.Done() diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 460bcdb..54966f0 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -111,6 +111,7 @@ func FindClosests(sequence *obiseq.BioSequence, func Identify(sequence *obiseq.BioSequence, references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, + taxa obitax.TaxonSet, taxo *obitax.Taxonomy, runExact bool) *obiseq.BioSequence { bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) @@ -121,7 +122,7 @@ func Identify(sequence *obiseq.BioSequence, idx := best.OBITagRefIndex() if idx == nil { // log.Fatalln("Need of indexing") - idx = obirefidx.IndexSequence(seqidxs[i], references, taxo) + idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo) } d := differences @@ -165,33 +166,39 @@ func Identify(sequence *obiseq.BioSequence, func IdentifySeqWorker(references obiseq.BioSequenceSlice, refcounts []*obikmer.Table4mer, + taxa obitax.TaxonSet, taxo *obitax.Taxonomy, runExact bool) obiseq.SeqWorker { return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { - return Identify(sequence, references, refcounts, taxo, runExact) + return Identify(sequence, references, refcounts, taxa,taxo, runExact) } } func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence { - references := CLIRefDB() - refcounts := make( - []*obikmer.Table4mer, - len(references)) - - buffer := make([]byte, 0, 1000) - - for i, seq := range references { - refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) - } - taxo, error := obifind.CLILoadSelectedTaxonomy() if error != nil { log.Panicln(error) } - worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact()) + references := CLIRefDB() + refcounts := make( + []*obikmer.Table4mer, + len(references)) + + taxa := make(obitax.TaxonSet, + len(references)) + + buffer := make([]byte, 0, 1000) + + for i, seq := range references { + refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) + taxa[i],_= taxo.Taxon(seq.Taxid()) + } + + + worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact()) return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) }