2022-10-26 13:16:56 +02:00
|
|
|
package obirefidx
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"log"
|
|
|
|
"os"
|
|
|
|
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
|
|
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
|
|
|
"github.com/schollz/progressbar/v3"
|
|
|
|
)
|
|
|
|
|
|
|
|
func IndexSequence(seqidx int,
|
|
|
|
references obiseq.BioSequenceSlice,
|
|
|
|
taxo *obitax.Taxonomy) map[int]string {
|
|
|
|
|
|
|
|
sequence := references[seqidx]
|
2022-11-16 09:27:04 +01:00
|
|
|
// matrix := obialign.NewFullLCSMatrix(nil,
|
|
|
|
// sequence.Length(),
|
|
|
|
// sequence.Length())
|
|
|
|
|
|
|
|
var matrix []uint64
|
2022-10-26 13:16:56 +02:00
|
|
|
|
|
|
|
score := make([]int, len(references))
|
2022-11-16 09:57:36 +01:00
|
|
|
// t := 0
|
|
|
|
// r := 0
|
|
|
|
// w := 0
|
2022-10-26 13:16:56 +02:00
|
|
|
for i, ref := range references {
|
2023-01-22 22:04:17 +01:00
|
|
|
lcs, alilength := obialign.FastLCSScore(sequence, ref, -1, &matrix)
|
2022-10-26 13:16:56 +02:00
|
|
|
score[i] = alilength - lcs
|
|
|
|
}
|
|
|
|
|
2022-11-16 09:57:36 +01:00
|
|
|
// log.Println("Redone : ",r,"/",t,"(",w,")")
|
|
|
|
|
2022-10-26 13:16:56 +02:00
|
|
|
o := goutils.IntOrder(score)
|
|
|
|
|
|
|
|
current_taxid, err := taxo.Taxon(references[o[0]].Taxid())
|
|
|
|
current_score := score[o[0]]
|
|
|
|
current_idx := o[0]
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
log.Panicln(err)
|
|
|
|
}
|
|
|
|
|
2022-10-27 11:18:44 +02:00
|
|
|
obitag_index := make(map[int]string)
|
2022-10-26 13:16:56 +02:00
|
|
|
|
|
|
|
for _, idx := range o {
|
|
|
|
new_taxid, err := taxo.Taxon(references[idx].Taxid())
|
|
|
|
if err != nil {
|
|
|
|
log.Panicln(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
new_taxid, err = current_taxid.LCA(new_taxid)
|
|
|
|
if err != nil {
|
|
|
|
log.Panicln(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
new_score := score[idx]
|
|
|
|
|
|
|
|
if current_taxid.Taxid() != new_taxid.Taxid() {
|
|
|
|
|
|
|
|
if new_score > current_score {
|
2022-10-27 11:18:44 +02:00
|
|
|
obitag_index[score[current_idx]] = fmt.Sprintf(
|
2022-10-26 13:16:56 +02:00
|
|
|
"%d@%s@%s",
|
|
|
|
current_taxid.Taxid(),
|
|
|
|
current_taxid.ScientificName(),
|
|
|
|
current_taxid.Rank())
|
|
|
|
current_score = new_score
|
|
|
|
}
|
|
|
|
|
|
|
|
current_taxid = new_taxid
|
|
|
|
current_idx = idx
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-27 11:18:44 +02:00
|
|
|
obitag_index[score[current_idx]] = fmt.Sprintf(
|
2022-10-26 13:16:56 +02:00
|
|
|
"%d@%s@%s",
|
|
|
|
current_taxid.Taxid(),
|
|
|
|
current_taxid.ScientificName(),
|
|
|
|
current_taxid.Rank())
|
|
|
|
|
2022-10-27 11:18:44 +02:00
|
|
|
return obitag_index
|
2022-10-26 13:16:56 +02:00
|
|
|
}
|
|
|
|
|
2023-01-22 22:04:17 +01:00
|
|
|
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
2022-10-26 13:16:56 +02:00
|
|
|
|
|
|
|
references := iterator.Load()
|
|
|
|
refcounts := make(
|
|
|
|
[]*obikmer.Table4mer,
|
|
|
|
len(references))
|
|
|
|
|
|
|
|
buffer := make([]byte, 0, 1000)
|
|
|
|
|
|
|
|
for i, seq := range references {
|
|
|
|
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
|
|
|
}
|
|
|
|
|
|
|
|
taxo, error := obifind.CLILoadSelectedTaxonomy()
|
|
|
|
if error != nil {
|
|
|
|
log.Panicln(error)
|
|
|
|
}
|
|
|
|
|
|
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
|
|
pbopt = append(pbopt,
|
|
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
|
|
progressbar.OptionSetWidth(15),
|
|
|
|
progressbar.OptionShowCount(),
|
|
|
|
progressbar.OptionShowIts(),
|
|
|
|
progressbar.OptionSetDescription("[Sequence Processing]"),
|
|
|
|
)
|
|
|
|
|
|
|
|
bar := progressbar.NewOptions(len(references), pbopt...)
|
|
|
|
|
|
|
|
limits := make(chan [2]int)
|
2023-01-22 22:04:17 +01:00
|
|
|
indexed := obiiter.MakeIBioSequence()
|
2022-10-26 13:16:56 +02:00
|
|
|
go func() {
|
|
|
|
for i := 0; i < len(references); i += 10 {
|
|
|
|
limits <- [2]int{i, goutils.MinInt(i+10, len(references))}
|
|
|
|
}
|
|
|
|
close(limits)
|
|
|
|
}()
|
|
|
|
|
|
|
|
f := func() {
|
|
|
|
for l := range limits {
|
|
|
|
sl := obiseq.MakeBioSequenceSlice()
|
|
|
|
for i := l[0]; i < l[1]; i++ {
|
2022-11-16 09:27:04 +01:00
|
|
|
idx := IndexSequence(i, references, taxo)
|
|
|
|
iref := references[i].Copy()
|
|
|
|
iref.SetAttribute("obitag_ref_index", idx)
|
|
|
|
sl = append(sl, iref)
|
2022-10-26 13:16:56 +02:00
|
|
|
}
|
|
|
|
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
|
|
|
|
bar.Add(len(sl))
|
|
|
|
}
|
|
|
|
|
|
|
|
indexed.Done()
|
|
|
|
}
|
|
|
|
|
|
|
|
nworkers := obioptions.CLIParallelWorkers()
|
|
|
|
indexed.Add(nworkers)
|
|
|
|
|
|
|
|
go func() {
|
|
|
|
indexed.WaitAndClose()
|
|
|
|
}()
|
|
|
|
|
|
|
|
for w := 0; w < nworkers; w++ {
|
|
|
|
go f()
|
|
|
|
}
|
|
|
|
|
|
|
|
return indexed.Rebatch(1000)
|
|
|
|
}
|