several correction in obitag and adds the --save-db option

Former-commit-id: b3a15b1a1b3971388a8749faaba519028f6e976c
This commit is contained in:
2023-06-07 17:45:11 +02:00
parent 8895381c92
commit 7f08f58336
3 changed files with 148 additions and 51 deletions

View File

@ -200,7 +200,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for i := l[0]; i < l[1]; i++ { for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, references, &refcounts, &taxa, taxo) idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
iref := references[i].Copy() iref := references[i].Copy()
iref.SetAttribute("obitag_ref_index", idx) iref.SetOBITagRefIndex(idx)
sl = append(sl, iref) sl = append(sl, iref)
bar.Add(1) bar.Add(1)
} }

View File

@ -12,7 +12,6 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
) )
@ -91,7 +90,7 @@ func FindClosests(sequence *obiseq.BioSequence,
} }
//log.Println("that's all falks", maxe, bestId, bestidxs) log.Debugln("Closest Match", sequence.Id(), maxe, bestId, bestidxs, len(bests))
return bests, maxe, bestId, bestmatch, bestidxs return bests, maxe, bestId, bestmatch, bestidxs
} }
@ -102,24 +101,54 @@ func Identify(sequence *obiseq.BioSequence,
taxo *obitax.Taxonomy, taxo *obitax.Taxonomy,
runExact bool) *obiseq.BioSequence { runExact bool) *obiseq.BioSequence {
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
taxon := (*obitax.TaxNode)(nil) taxon := (*obitax.TaxNode)(nil)
if identity >= 0.5 && differences > 0 {
newidx := 0
for i, best := range bests { for i, best := range bests {
idx := best.OBITagRefIndex() idx := best.OBITagRefIndex()
if idx == nil { if idx == nil {
// log.Fatalln("Need of indexing") // log.Debugln("Need of indexing")
newidx++
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo) idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
references[seqidxs[i]].SetOBITagRefIndex(idx)
log.Debugln(references[seqidxs[i]].Id(), idx)
} }
d := differences d := differences
identification, ok := idx[d] identification, ok := idx[d]
found := false
var parts []string
/*
Here is an horrible hack for xprize challence.
With Euka01 the part[0] was equal to "" for at
least a sequence consensus. Which is not normal.
TO BE CHECKED AND CORRECTED
The problem seems related to idx that doesn't have
a 0 distance
*/
for !found && d >= 0 {
for !ok && d >= 0 { for !ok && d >= 0 {
identification, ok = idx[d] identification, ok = idx[d]
d-- d--
} }
parts := strings.Split(identification, "@") parts = strings.Split(identification, "@")
found = parts[0] != ""
if !found {
log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
for !ok && d <= 1000 {
identification, ok = idx[d]
d++
}
}
}
match_taxid, err := strconv.Atoi(parts[0]) match_taxid, err := strconv.Atoi(parts[0])
if err != nil { if err != nil {
@ -139,6 +168,7 @@ func Identify(sequence *obiseq.BioSequence,
} }
} }
log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx)
sequence.SetTaxid(taxon.Taxid()) sequence.SetTaxid(taxon.Taxid())
sequence.SetAttribute("scientific_name", taxon.ScientificName()) sequence.SetAttribute("scientific_name", taxon.ScientificName())
@ -148,6 +178,17 @@ func Identify(sequence *obiseq.BioSequence,
sequence.SetAttribute("obitag_bestmatch", bestmatch) sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests)) sequence.SetAttribute("obitag_match_count", len(bests))
} else {
taxon, _ = taxo.Taxon(1)
sequence.SetTaxid(1)
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_difference", differences)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests))
}
return sequence return sequence
} }
@ -161,15 +202,11 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
} }
} }
func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence { func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
references obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy,
) obiiter.IBioSequence {
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
references := CLIRefDB()
refcounts := make( refcounts := make(
[]*obikmer.Table4mer, []*obikmer.Table4mer,
len(references)) len(references))
@ -186,5 +223,5 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact()) worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0)
} }

View File

@ -4,6 +4,8 @@ import (
"log" "log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
@ -11,6 +13,7 @@ import (
) )
var _RefDB = "" var _RefDB = ""
var _SaveRefDB = ""
var _RunExact = false var _RunExact = false
func TagOptionSet(options *getoptions.GetOpt) { func TagOptionSet(options *getoptions.GetOpt) {
@ -20,6 +23,10 @@ func TagOptionSet(options *getoptions.GetOpt) {
options.ArgName("FILENAME"), options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB")) options.Description("The name of the file containing the reference DB"))
options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB,
options.ArgName("FILENAME"),
options.Description("The name of a file where to save the reference DB with its indices"))
// options.BoolVar(&_RunExact, "exact", _RunExact, // options.BoolVar(&_RunExact, "exact", _RunExact,
// options.Alias("E"), // options.Alias("E"),
// options.Description("Unactivate the heuristic limatitating the sequence comparisons")) // options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
@ -48,6 +55,59 @@ func CLIRefDB() obiseq.BioSequenceSlice {
return refdb.Load() return refdb.Load()
} }
func CLIShouldISaveRefDB() bool {
return _SaveRefDB != ""
}
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000)
var newIter obiiter.IBioSequence
opts := make([]obiformats.WithOption, 0, 10)
switch obiconvert.CLIOutputFastHeaderFormat() {
case "json":
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
case "obi":
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
default:
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
}
nworkers := obioptions.CLIParallelWorkers() / 4
if nworkers < 2 {
nworkers = 2
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()))
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
var err error
switch obiconvert.CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...)
}
if err != nil {
log.Fatalf("Write file error: %v", err)
}
newIter.Recycle()
obiiter.WaitForLastPipe()
}
}
func CLIRunExact() bool { func CLIRunExact() bool {
return _RunExact return _RunExact
} }