mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
New refdb indexing algorithm
Former-commit-id: 7ae480cec5d277a08620a2e666b5c9b69bb6cd73
This commit is contained in:
@ -2,9 +2,10 @@ package obirefidx
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
|
||||
@ -18,6 +19,8 @@ import (
|
||||
|
||||
func IndexSequence(seqidx int,
|
||||
references obiseq.BioSequenceSlice,
|
||||
kmers *[]*obikmer.Table4mer,
|
||||
taxa *obitax.TaxonSet,
|
||||
taxo *obitax.Taxonomy) map[int]string {
|
||||
|
||||
sequence := references[seqidx]
|
||||
@ -27,8 +30,62 @@ func IndexSequence(seqidx int,
|
||||
|
||||
var matrix []uint64
|
||||
|
||||
score := make([]int, len(references))
|
||||
// t := 0
|
||||
lca := make(obitax.TaxonSet, len(references))
|
||||
tref := (*taxa)[seqidx]
|
||||
|
||||
for i, taxon := range (*taxa) {
|
||||
lca[i],_ = tref.LCA(taxon)
|
||||
}
|
||||
|
||||
cw := make([]int, len(references))
|
||||
sw := (*kmers)[seqidx]
|
||||
for i, ref := range *kmers {
|
||||
cw[i] = obikmer.Common4Mer(sw,ref)
|
||||
}
|
||||
|
||||
ow := obiutils.Reverse(obiutils.IntOrder(cw),true)
|
||||
pref,_ := tref.Path()
|
||||
obiutils.Reverse(*pref,true)
|
||||
// score := make([]int, len(references))
|
||||
mindiff := make([]int, len(*pref))
|
||||
|
||||
|
||||
for i,ancestor := range *pref {
|
||||
mini := -1
|
||||
for _,order := range ow {
|
||||
if lca[order] == ancestor {
|
||||
lcs, alilength := obialign.FastLCSScore(sequence, references[order], mini, &matrix)
|
||||
if lcs >= 0 {
|
||||
errs := alilength - lcs
|
||||
if mini== -1 || errs < mini {
|
||||
mini = errs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if mini != -1 {
|
||||
mindiff[i] = mini
|
||||
} else {
|
||||
mindiff[i] = 1e6
|
||||
}
|
||||
}
|
||||
|
||||
obitag_index := make(map[int]string, len(*pref))
|
||||
|
||||
old := sequence.Len()
|
||||
for i,d := range mindiff {
|
||||
if d < old {
|
||||
current_taxid :=(*pref)[i]
|
||||
obitag_index[d] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
current_taxid.Taxid(),
|
||||
current_taxid.ScientificName(),
|
||||
current_taxid.Rank())
|
||||
old = d
|
||||
}
|
||||
}
|
||||
|
||||
/* // t := 0
|
||||
// r := 0
|
||||
// w := 0
|
||||
for i, ref := range references {
|
||||
@ -84,13 +141,51 @@ func IndexSequence(seqidx int,
|
||||
current_taxid.Taxid(),
|
||||
current_taxid.ScientificName(),
|
||||
current_taxid.Rank())
|
||||
|
||||
*/
|
||||
//log.Println(obitag_index)
|
||||
return obitag_index
|
||||
}
|
||||
|
||||
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
log.Infoln("Loading database...")
|
||||
references := iterator.Load()
|
||||
log.Infof("Done. Database contains %d sequences", len(references))
|
||||
|
||||
taxo, error := obifind.CLILoadSelectedTaxonomy()
|
||||
if error != nil {
|
||||
log.Panicln(error)
|
||||
}
|
||||
|
||||
log.Infoln("Indexing sequence taxids...")
|
||||
|
||||
taxa := make(
|
||||
obitax.TaxonSet,
|
||||
len(references))
|
||||
|
||||
j := 0
|
||||
for i, seq := range references {
|
||||
taxon, err := taxo.Taxon(seq.Taxid())
|
||||
if err == nil {
|
||||
taxa[j] = taxon
|
||||
references[j] = references[i]
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
if j < len(references) {
|
||||
if len(references)-j == 1 {
|
||||
log.Infoln("1 sequence has no valid taxid and has been discarded")
|
||||
} else {
|
||||
log.Infof("%d sequences have no valid taxid and has been discarded", len(references)-j)
|
||||
}
|
||||
|
||||
references = references[0:j]
|
||||
} else {
|
||||
log.Infoln("Done.")
|
||||
}
|
||||
|
||||
log.Infoln("Indexing database kmers...")
|
||||
refcounts := make(
|
||||
[]*obikmer.Table4mer,
|
||||
len(references))
|
||||
@ -101,10 +196,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
||||
}
|
||||
|
||||
taxo, error := obifind.CLILoadSelectedTaxonomy()
|
||||
if error != nil {
|
||||
log.Panicln(error)
|
||||
}
|
||||
log.Info("done")
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
@ -130,13 +222,13 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
for l := range limits {
|
||||
sl := obiseq.MakeBioSequenceSlice()
|
||||
for i := l[0]; i < l[1]; i++ {
|
||||
idx := IndexSequence(i, references, taxo)
|
||||
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
|
||||
iref := references[i].Copy()
|
||||
iref.SetAttribute("obitag_ref_index", idx)
|
||||
sl = append(sl, iref)
|
||||
bar.Add(1)
|
||||
}
|
||||
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
|
||||
bar.Add(len(sl))
|
||||
}
|
||||
|
||||
indexed.Done()
|
||||
|
@ -111,6 +111,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
func Identify(sequence *obiseq.BioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
taxa obitax.TaxonSet,
|
||||
taxo *obitax.Taxonomy,
|
||||
runExact bool) *obiseq.BioSequence {
|
||||
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
|
||||
@ -121,7 +122,7 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
idx := best.OBITagRefIndex()
|
||||
if idx == nil {
|
||||
// log.Fatalln("Need of indexing")
|
||||
idx = obirefidx.IndexSequence(seqidxs[i], references, taxo)
|
||||
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
|
||||
}
|
||||
|
||||
d := differences
|
||||
@ -165,33 +166,39 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
|
||||
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
taxa obitax.TaxonSet,
|
||||
taxo *obitax.Taxonomy,
|
||||
runExact bool) obiseq.SeqWorker {
|
||||
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||
return Identify(sequence, references, refcounts, taxo, runExact)
|
||||
return Identify(sequence, references, refcounts, taxa,taxo, runExact)
|
||||
}
|
||||
}
|
||||
|
||||
func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
|
||||
references := CLIRefDB()
|
||||
refcounts := make(
|
||||
[]*obikmer.Table4mer,
|
||||
len(references))
|
||||
|
||||
buffer := make([]byte, 0, 1000)
|
||||
|
||||
for i, seq := range references {
|
||||
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
||||
}
|
||||
|
||||
taxo, error := obifind.CLILoadSelectedTaxonomy()
|
||||
|
||||
if error != nil {
|
||||
log.Panicln(error)
|
||||
}
|
||||
|
||||
worker := IdentifySeqWorker(references, refcounts, taxo, CLIRunExact())
|
||||
references := CLIRefDB()
|
||||
refcounts := make(
|
||||
[]*obikmer.Table4mer,
|
||||
len(references))
|
||||
|
||||
taxa := make(obitax.TaxonSet,
|
||||
len(references))
|
||||
|
||||
buffer := make([]byte, 0, 1000)
|
||||
|
||||
for i, seq := range references {
|
||||
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
||||
taxa[i],_= taxo.Taxon(seq.Taxid())
|
||||
}
|
||||
|
||||
|
||||
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
|
||||
|
||||
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
|
||||
}
|
||||
|
Reference in New Issue
Block a user