From 7f08f5833639212b614b04a73bc3e7d8dbac8518 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 7 Jun 2023 17:45:11 +0200 Subject: [PATCH] several correction in obitag and adds the --save-db option Former-commit-id: b3a15b1a1b3971388a8749faaba519028f6e976c --- pkg/obitools/obirefidx/obirefidx.go | 2 +- pkg/obitools/obitag/obitag.go | 137 ++++++++++++++++++---------- pkg/obitools/obitag/options.go | 60 ++++++++++++ 3 files changed, 148 insertions(+), 51 deletions(-) diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index d3dfc05..757bbd5 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -200,7 +200,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { for i := l[0]; i < l[1]; i++ { idx := IndexSequence(i, references, &refcounts, &taxa, taxo) iref := references[i].Copy() - iref.SetAttribute("obitag_ref_index", idx) + iref.SetOBITagRefIndex(idx) sl = append(sl, iref) bar.Add(1) } diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 8af304c..dae4e55 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -12,7 +12,6 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" - "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" ) @@ -91,7 +90,7 @@ func FindClosests(sequence *obiseq.BioSequence, } - //log.Println("that's all falks", maxe, bestId, bestidxs) + log.Debugln("Closest Match", sequence.Id(), maxe, bestId, bestidxs, len(bests)) return bests, maxe, bestId, bestmatch, bestidxs } @@ -102,52 +101,94 @@ func Identify(sequence *obiseq.BioSequence, taxo *obitax.Taxonomy, runExact bool) *obiseq.BioSequence { bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact) - taxon := (*obitax.TaxNode)(nil) - for i, best := range bests { - idx := best.OBITagRefIndex() - if idx == nil { - // log.Fatalln("Need of indexing") - idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo) + if identity >= 0.5 && differences > 0 { + newidx := 0 + for i, best := range bests { + idx := best.OBITagRefIndex() + if idx == nil { + // log.Debugln("Need of indexing") + newidx++ + idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo) + references[seqidxs[i]].SetOBITagRefIndex(idx) + log.Debugln(references[seqidxs[i]].Id(), idx) + } + + d := differences + identification, ok := idx[d] + found := false + var parts []string + + /* + Here is an horrible hack for xprize challence. + With Euka01 the part[0] was equal to "" for at + least a sequence consensus. Which is not normal. + + TO BE CHECKED AND CORRECTED + + The problem seems related to idx that doesn't have + a 0 distance + */ + for !found && d >= 0 { + for !ok && d >= 0 { + identification, ok = idx[d] + d-- + } + + parts = strings.Split(identification, "@") + + found = parts[0] != "" + if !found { + log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) + for !ok && d <= 1000 { + identification, ok = idx[d] + d++ + } + + } + } + + match_taxid, err := strconv.Atoi(parts[0]) + + if err != nil { + log.Panicln("Cannot extract taxid from :", identification) + } + + match_taxon, err := taxo.Taxon(match_taxid) + + if err != nil { + log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid) + } + + if taxon != nil { + taxon, _ = taxon.LCA(match_taxon) + } else { + taxon = match_taxon + } + } + log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx) - d := differences - identification, ok := idx[d] - for !ok && d >= 0 { - identification, ok = idx[d] - d-- - } - - parts := strings.Split(identification, "@") - match_taxid, err := strconv.Atoi(parts[0]) - - if err != nil { - log.Panicln("Cannot extract taxid from :", identification) - } - - match_taxon, err := taxo.Taxon(match_taxid) - - if err != nil { - log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid) - } - - if taxon != nil { - taxon, _ = taxon.LCA(match_taxon) - } else { - taxon = match_taxon - } + sequence.SetTaxid(taxon.Taxid()) + sequence.SetAttribute("scientific_name", taxon.ScientificName()) + sequence.SetAttribute("obitag_rank", taxon.Rank()) + sequence.SetAttribute("obitag_bestid", identity) + sequence.SetAttribute("obitag_difference", differences) + sequence.SetAttribute("obitag_bestmatch", bestmatch) + sequence.SetAttribute("obitag_match_count", len(bests)) + } else { + taxon, _ = taxo.Taxon(1) + sequence.SetTaxid(1) + sequence.SetAttribute("scientific_name", taxon.ScientificName()) + sequence.SetAttribute("obitag_rank", taxon.Rank()) + sequence.SetAttribute("obitag_bestid", identity) + sequence.SetAttribute("obitag_difference", differences) + sequence.SetAttribute("obitag_bestmatch", bestmatch) + sequence.SetAttribute("obitag_match_count", len(bests)) } - sequence.SetTaxid(taxon.Taxid()) - sequence.SetAttribute("scientific_name", taxon.ScientificName()) - sequence.SetAttribute("obitag_rank", taxon.Rank()) - sequence.SetAttribute("obitag_bestid", identity) - sequence.SetAttribute("obitag_difference", differences) - sequence.SetAttribute("obitag_bestmatch", bestmatch) - sequence.SetAttribute("obitag_match_count", len(bests)) - return sequence } @@ -161,15 +202,11 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice, } } -func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence { +func CLIAssignTaxonomy(iterator obiiter.IBioSequence, + references obiseq.BioSequenceSlice, + taxo *obitax.Taxonomy, +) obiiter.IBioSequence { - taxo, error := obifind.CLILoadSelectedTaxonomy() - - if error != nil { - log.Panicln(error) - } - - references := CLIRefDB() refcounts := make( []*obikmer.Table4mer, len(references)) @@ -186,5 +223,5 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence { worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact()) - return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) + return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0) } diff --git a/pkg/obitools/obitag/options.go b/pkg/obitools/obitag/options.go index f95803b..15116b7 100644 --- a/pkg/obitools/obitag/options.go +++ b/pkg/obitools/obitag/options.go @@ -4,6 +4,8 @@ import ( "log" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" @@ -11,6 +13,7 @@ import ( ) var _RefDB = "" +var _SaveRefDB = "" var _RunExact = false func TagOptionSet(options *getoptions.GetOpt) { @@ -20,6 +23,10 @@ func TagOptionSet(options *getoptions.GetOpt) { options.ArgName("FILENAME"), options.Description("The name of the file containing the reference DB")) + options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB, + options.ArgName("FILENAME"), + options.Description("The name of a file where to save the reference DB with its indices")) + // options.BoolVar(&_RunExact, "exact", _RunExact, // options.Alias("E"), // options.Description("Unactivate the heuristic limatitating the sequence comparisons")) @@ -48,6 +55,59 @@ func CLIRefDB() obiseq.BioSequenceSlice { return refdb.Load() } +func CLIShouldISaveRefDB() bool { + return _SaveRefDB != "" +} + +func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) { + if CLIShouldISaveRefDB() { + idb := obiiter.IBatchOver(db, 1000) + + var newIter obiiter.IBioSequence + + opts := make([]obiformats.WithOption, 0, 10) + + switch obiconvert.CLIOutputFastHeaderFormat() { + case "json": + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader)) + case "obi": + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader)) + default: + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader)) + } + + nworkers := obioptions.CLIParallelWorkers() / 4 + if nworkers < 2 { + nworkers = 2 + } + + opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) + opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) + + opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift())) + + opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed())) + + var err error + + switch obiconvert.CLIOutputFormat() { + case "fastq": + newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...) + case "fasta": + newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...) + default: + newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...) + } + + if err != nil { + log.Fatalf("Write file error: %v", err) + } + + newIter.Recycle() + obiiter.WaitForLastPipe() + } +} + func CLIRunExact() bool { return _RunExact }