mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Xprize update
Former-commit-id: d38919a897961e4d40da3b844057c3fb94fdb6d7
This commit is contained in:
@ -2,7 +2,6 @@ package obichunk
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
@ -14,7 +13,7 @@ import (
|
||||
)
|
||||
|
||||
func tempDir() (string, error) {
|
||||
dir, err := ioutil.TempDir(os.TempDir(), "obiseq_chunks_")
|
||||
dir, err := os.MkdirTemp(os.TempDir(), "obiseq_chunks_")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
@ -249,6 +249,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Invalid value for @spacer parameter")
|
||||
}
|
||||
|
||||
log.Infof("Set global spacer to %d bp", spacer)
|
||||
library.SetTagSpacer(spacer)
|
||||
case 2:
|
||||
primer := values[0]
|
||||
@ -258,6 +259,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Invalid value for @spacer parameter")
|
||||
}
|
||||
|
||||
log.Infof("Set spacer for primer %s to %d bp", primer, spacer)
|
||||
library.SetTagSpacerFor(primer, spacer)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @spacer parameter")
|
||||
@ -274,6 +276,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Invalid value for @forward_spacer parameter")
|
||||
}
|
||||
|
||||
log.Infof("Set spacer for forward primer to %d bp", spacer)
|
||||
library.SetForwardTagSpacer(spacer)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @forward_spacer parameter")
|
||||
@ -290,6 +293,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Invalid value for @reverse_spacer parameter")
|
||||
}
|
||||
|
||||
log.Infof("Set spacer for reverse primer to %d bp", spacer)
|
||||
library.SetReverseTagSpacer(spacer)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @reverse_spacer parameter")
|
||||
@ -301,9 +305,13 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Missing value for @tag_delimiter parameter")
|
||||
case 1:
|
||||
value := []byte(values[0])[0]
|
||||
|
||||
log.Infof("Set global tag delimiter to %c", value)
|
||||
library.SetTagDelimiter(value)
|
||||
case 2:
|
||||
value := []byte(values[1])[0]
|
||||
|
||||
log.Infof("Set tag delimiter for primer %s to %c", values[0], value)
|
||||
library.SetTagDelimiterFor(values[0], value)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @tag_delimiter parameter")
|
||||
@ -315,6 +323,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Missing value for @forward_tag_delimiter parameter")
|
||||
case 1:
|
||||
value := []byte(values[0])[0]
|
||||
|
||||
log.Infof("Set tag delimiter for forward primer to %c", value)
|
||||
library.SetForwardTagDelimiter(value)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @forward_tag_delimiter parameter")
|
||||
@ -326,6 +336,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalln("Missing value for @reverse_tag_delimiter parameter")
|
||||
case 1:
|
||||
value := []byte(values[0])[0]
|
||||
|
||||
log.Infof("Set tag delimiter for reverse primer to %c", value)
|
||||
library.SetReverseTagDelimiter(value)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @reverse_tag_delimiter parameter")
|
||||
@ -339,6 +351,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
if err := library.SetMatching(values[0]); err != nil {
|
||||
log.Fatalf("Invalid value %s for @matching parameter", values[0])
|
||||
}
|
||||
log.Infof("Set tag matching mode to %s", values[0])
|
||||
default:
|
||||
log.Fatalln("Invalid value for @matching parameter")
|
||||
}
|
||||
@ -354,6 +367,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalf("Invalid value %s for @primer_error parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set global allowed primer mismatches to %d", dist)
|
||||
library.SetAllowedMismatches(dist)
|
||||
case 2:
|
||||
primer := values[0]
|
||||
@ -363,6 +377,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalf("Invalid value %s for @primer_error parameter", values[1])
|
||||
}
|
||||
|
||||
log.Infof("Set allowed primer mismatches for primer %s to %d", primer, dist)
|
||||
library.SetAllowedMismatchesFor(primer, dist)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @primer_error parameter")
|
||||
@ -379,6 +394,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalf("Invalid value %s for @forward_primer_error parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set allowed mismatches for forward primer to %d", dist)
|
||||
library.SetForwardAllowedMismatches(dist)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @forward_primer_error parameter")
|
||||
@ -395,6 +411,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalf("Invalid value %s for @reverse_primer_error parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set allowed mismatches for reverse primer to %d", dist)
|
||||
library.SetReverseAllowedMismatches(dist)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @reverse_primer_error parameter")
|
||||
@ -410,6 +427,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid value %s for @tag_indels parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set global maximum tag indels to %d", indels)
|
||||
library.SetTagIndels(indels)
|
||||
case 2:
|
||||
indels, err := strconv.Atoi(values[1])
|
||||
@ -418,6 +437,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
log.Fatalf("Invalid value %s for @tag_indels parameter", values[1])
|
||||
}
|
||||
|
||||
log.Infof("Set maximum tag indels for primer %s to %d", values[0], indels)
|
||||
library.SetTagIndelsFor(values[0], indels)
|
||||
}
|
||||
},
|
||||
@ -432,6 +452,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid value %s for @forward_tag_indels parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set maximum tag indels for forward primer to %d", indels)
|
||||
library.SetForwardTagIndels(indels)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @forward_tag_indels parameter")
|
||||
@ -447,6 +469,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid value %s for @reverse_tag_indels parameter", values[0])
|
||||
}
|
||||
|
||||
log.Infof("Set maximum tag indels for reverse primer to %d", indels)
|
||||
library.SetReverseTagIndels(indels)
|
||||
default:
|
||||
log.Fatalln("Invalid value for @reverse_tag_indels parameter")
|
||||
@ -457,8 +481,22 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
case 0:
|
||||
log.Fatalln("Missing value for @indels parameter")
|
||||
case 1:
|
||||
|
||||
if values[0] == "true" {
|
||||
log.Info("Allows indels for primer matching")
|
||||
} else {
|
||||
log.Info("Disallows indels for primer matching")
|
||||
}
|
||||
|
||||
library.SetAllowsIndels(values[0] == "true")
|
||||
case 2:
|
||||
|
||||
if values[1] == "true" {
|
||||
log.Infof("Allows indels for primer matching %s", values[0])
|
||||
} else {
|
||||
log.Infof("Disallows indels for primer matching %s", values[0])
|
||||
}
|
||||
|
||||
library.SetAllowsIndelsFor(values[0], values[1] == "true")
|
||||
default:
|
||||
log.Fatalln("Invalid value for @indels parameter")
|
||||
@ -470,6 +508,12 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
case 0:
|
||||
log.Fatalln("Missing value for @forward_indels parameter")
|
||||
case 1:
|
||||
if values[0] == "true" {
|
||||
log.Info("Allows indels for forward primer matching")
|
||||
} else {
|
||||
log.Info("Disallows indels for forward primer matching")
|
||||
}
|
||||
|
||||
library.SetForwardAllowsIndels(values[0] == "true")
|
||||
default:
|
||||
log.Fatalln("Invalid value for @forward_indels parameter")
|
||||
@ -480,6 +524,11 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
|
||||
case 0:
|
||||
log.Fatalln("Missing value for @reverse_indels parameter")
|
||||
case 1:
|
||||
if values[0] == "true" {
|
||||
log.Info("Allows indels for reverse primer matching")
|
||||
} else {
|
||||
log.Info("Disallows indels for reverse primer matching")
|
||||
}
|
||||
library.SetReverseAllowsIndels(values[0] == "true")
|
||||
default:
|
||||
log.Fatalln("Invalid value for @reverse_indels parameter")
|
||||
|
@ -607,7 +607,7 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
}
|
||||
}
|
||||
|
||||
log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
|
||||
log.Debugf("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
|
||||
// Reconstruct the path from the start node to the heaviest node found
|
||||
heaviestPath := make([]uint64, 0)
|
||||
currentNode := heaviestNode
|
||||
|
@ -7,7 +7,7 @@ import (
|
||||
// TODO: The version number is extracted from git. This induces that the version
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
var _Commit = "f265a39"
|
||||
var _Commit = "4d86483"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
@ -415,11 +415,16 @@ func (s *BioSequence) SetTaxid(taxid int) {
|
||||
s.SetAttribute("taxid", taxid)
|
||||
}
|
||||
|
||||
func (s *BioSequence) OBITagRefIndex() map[int]string {
|
||||
func (s *BioSequence) OBITagRefIndex(slot ...string) map[int]string {
|
||||
key := "obitag_ref_index"
|
||||
|
||||
if len(slot) > 0 {
|
||||
key = slot[0]
|
||||
}
|
||||
|
||||
var val map[int]string
|
||||
|
||||
i, ok := s.GetAttribute("obitag_ref_index")
|
||||
i, ok := s.GetAttribute(key)
|
||||
|
||||
if !ok {
|
||||
return nil
|
||||
|
@ -202,4 +202,8 @@ var OBILang = gval.NewLanguage(
|
||||
scomp[string(k)] = float64(v)
|
||||
}
|
||||
return scomp, nil
|
||||
}))
|
||||
}),
|
||||
gval.Function("replace", func(args ...interface{}) (interface{}, error) {
|
||||
return strings.ReplaceAll(args[0].(string), args[1].(string), args[2].(string)), nil
|
||||
}),
|
||||
)
|
||||
|
@ -66,3 +66,27 @@ func (taxonomy *Taxonomy) SetPath(sequence *obiseq.BioSequence) string {
|
||||
|
||||
return tpath
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) SetScientificName(sequence *obiseq.BioSequence) string {
|
||||
taxid, err := taxonomy.Taxon(sequence.Taxid())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid())
|
||||
}
|
||||
|
||||
sequence.SetAttribute("scienctific_name", taxid.ScientificName())
|
||||
|
||||
return taxid.ScientificName()
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) SetTaxonomicRank(sequence *obiseq.BioSequence) string {
|
||||
taxid, err := taxonomy.Taxon(sequence.Taxid())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid())
|
||||
}
|
||||
|
||||
sequence.SetAttribute("taxonomic_rank", taxid.Rank())
|
||||
|
||||
return taxid.Rank()
|
||||
}
|
||||
|
@ -57,6 +57,23 @@ func (taxonomy *Taxonomy) IsSubCladeOf(taxid int) obiseq.SequencePredicate {
|
||||
return f
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) IsSubCladeOfSlot(key string) obiseq.SequencePredicate {
|
||||
|
||||
f := func(sequence *obiseq.BioSequence) bool {
|
||||
val, ok := sequence.GetStringAttribute(key)
|
||||
|
||||
if ok {
|
||||
parent, err1 := taxonomy.Taxon(val)
|
||||
taxon, err2 := taxonomy.Taxon(sequence.Taxid())
|
||||
return err1 == nil && err2 == nil && taxon.IsSubCladeOf(parent)
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) HasRequiredRank(rank string) obiseq.SequencePredicate {
|
||||
|
||||
if !obiutils.Contains(taxonomy.RankList(), rank) {
|
||||
|
@ -2,6 +2,8 @@ package obitax
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type TaxName struct {
|
||||
@ -54,11 +56,43 @@ func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace boo
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
|
||||
t, ok := (*taxonomy.nodes)[taxid]
|
||||
// func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
|
||||
// t, ok := (*taxonomy.nodes)[taxid]
|
||||
|
||||
// if !ok {
|
||||
// a, aok := taxonomy.alias[taxid]
|
||||
// if !aok {
|
||||
// return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid)
|
||||
// }
|
||||
// t = a
|
||||
// }
|
||||
// return t, nil
|
||||
// }
|
||||
|
||||
func (taxonomy *Taxonomy) Taxon(taxid interface{}) (*TaxNode, error) {
|
||||
var itaxid int
|
||||
var err error
|
||||
|
||||
switch v := taxid.(type) {
|
||||
case int:
|
||||
itaxid = v
|
||||
case string:
|
||||
itaxid, err = strconv.Atoi(v)
|
||||
|
||||
if err != nil {
|
||||
re := regexp.MustCompile(`TX:(\d+)`)
|
||||
parts := re.FindStringSubmatch(v)
|
||||
if len(parts) != 2 {
|
||||
return nil, fmt.Errorf("I cannot parse taxid from %s", v)
|
||||
}
|
||||
itaxid, _ = strconv.Atoi(parts[1])
|
||||
}
|
||||
}
|
||||
|
||||
t, ok := (*taxonomy.nodes)[itaxid]
|
||||
|
||||
if !ok {
|
||||
a, aok := taxonomy.alias[taxid]
|
||||
a, aok := taxonomy.alias[itaxid]
|
||||
if !aok {
|
||||
return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid)
|
||||
}
|
||||
@ -66,7 +100,6 @@ func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
|
||||
}
|
||||
return t, nil
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) AddNewName(taxid int, name, nameclass *string) error {
|
||||
node, node_err := taxonomy.Taxon(taxid)
|
||||
if node_err != nil {
|
||||
|
@ -215,6 +215,24 @@ func AddTaxonAtRankWorker(taxonomy *obitax.Taxonomy, ranks ...string) obiseq.Seq
|
||||
return f
|
||||
}
|
||||
|
||||
func AddTaxonRankWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker {
|
||||
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
taxonomy.SetTaxonomicRank(s)
|
||||
return obiseq.BioSequenceSlice{s}, nil
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func AddScientificNameWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker {
|
||||
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
taxonomy.SetScientificName(s)
|
||||
return obiseq.BioSequenceSlice{s}, nil
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func AddSeqLengthWorker() obiseq.SeqWorker {
|
||||
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
s.SetAttribute("seq_length", s.Len())
|
||||
@ -266,6 +284,18 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
if CLISetTaxonomicRank() {
|
||||
taxo := obigrep.CLILoadSelectedTaxonomy()
|
||||
w := AddTaxonRankWorker(taxo)
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
if CLISetScientificName() {
|
||||
taxo := obigrep.CLILoadSelectedTaxonomy()
|
||||
w := AddScientificNameWorker(taxo)
|
||||
annotator = annotator.ChainWorkers(w)
|
||||
}
|
||||
|
||||
if CLIHasAddLCA() {
|
||||
taxo := obigrep.CLILoadSelectedTaxonomy()
|
||||
w := obitax.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold())
|
||||
|
@ -32,6 +32,8 @@ var _lcaError = 0.0
|
||||
var _setId = ""
|
||||
var _cut = ""
|
||||
var _taxonomicPath = false
|
||||
var _withRank = false
|
||||
var _withScientificName = false
|
||||
|
||||
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
// options.BoolVar(&_addRank, "seq-rank", _addRank,
|
||||
@ -117,6 +119,12 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_taxonomicPath, "taxonomic-path", _taxonomicPath,
|
||||
options.Description("Annotate the sequence with its taxonomic path"))
|
||||
|
||||
options.BoolVar(&_withRank, "taxonomic-rank", _withRank,
|
||||
options.Description("Annotate the sequence with its taxonomic rank"))
|
||||
|
||||
options.BoolVar(&_withScientificName, "scientific-name", _withScientificName,
|
||||
options.Description("Annotate the sequence with its scientific name"))
|
||||
|
||||
// options.StringVar(&_tagList, "tag-list", _tagList,
|
||||
// options.ArgName("FILENAME"),
|
||||
// options.Description("<FILENAME> points to a file containing attribute names"+
|
||||
@ -307,3 +315,11 @@ func CLIPatternInDels() bool {
|
||||
func CLISetTaxonomicPath() bool {
|
||||
return _taxonomicPath
|
||||
}
|
||||
|
||||
func CLISetTaxonomicRank() bool {
|
||||
return _withRank
|
||||
}
|
||||
|
||||
func CLISetScientificName() bool {
|
||||
return _withScientificName
|
||||
}
|
||||
|
@ -52,7 +52,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
|
||||
}
|
||||
|
||||
log.Printf("Number of reads : %d\n", len(seqs))
|
||||
log.Debugf("Number of reads : %d\n", len(seqs))
|
||||
|
||||
if kmer_size < 0 {
|
||||
longest := make([]int, len(seqs))
|
||||
@ -64,7 +64,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
}
|
||||
|
||||
kmer_size = slices.Max(longest) + 1
|
||||
log.Printf("estimated kmer size : %d", kmer_size)
|
||||
log.Debugf("estimated kmer size : %d", kmer_size)
|
||||
}
|
||||
|
||||
var graph *obikmer.DeBruijnGraph
|
||||
@ -80,7 +80,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
}
|
||||
|
||||
kmer_size++
|
||||
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
|
||||
log.Debugf("Cycle detected, increasing kmer size to %d\n", kmer_size)
|
||||
}
|
||||
|
||||
if save_graph {
|
||||
@ -96,7 +96,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
log.Debugf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
|
||||
seq, err := graph.LongestConsensus(consensus_id)
|
||||
@ -260,6 +260,21 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
||||
sample_key string, kmer_size int) obiseq.BioSequenceSlice {
|
||||
denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
|
||||
|
||||
bar := (*progressbar.ProgressBar)(nil)
|
||||
if obiconvert.CLIProgressBar() {
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetPredictTime(true),
|
||||
progressbar.OptionSetDescription(fmt.Sprintf("[Build consensus] on %s", graph.Name)),
|
||||
)
|
||||
|
||||
bar = progressbar.NewOptions(len(*graph.Vertices), pbopt...)
|
||||
}
|
||||
|
||||
for i, v := range *graph.Vertices {
|
||||
var err error
|
||||
var clean *obiseq.BioSequence
|
||||
@ -283,6 +298,7 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
||||
clean.SetAttribute("obiconsensus_consensus", true)
|
||||
}
|
||||
pack.Recycle(false)
|
||||
|
||||
} else {
|
||||
clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
|
||||
clean.SetAttribute("obiconsensus_consensus", false)
|
||||
@ -295,7 +311,20 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
|
||||
clean.SetAttribute("obiconsensus_weight", int(1))
|
||||
}
|
||||
|
||||
annotations := v.Annotations()
|
||||
|
||||
for k, v := range annotations {
|
||||
if !clean.HasAttribute(k) {
|
||||
clean.SetAttribute(k, v)
|
||||
}
|
||||
}
|
||||
|
||||
denoised[i] = clean
|
||||
|
||||
if bar != nil {
|
||||
bar.Add(1)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return denoised
|
||||
|
@ -1,6 +1,7 @@
|
||||
package obigrep
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
@ -13,7 +14,7 @@ import (
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _BelongTaxa = make([]int, 0)
|
||||
var _BelongTaxa = make([]string, 0)
|
||||
var _NotBelongTaxa = make([]int, 0)
|
||||
var _RequiredRanks = make([]string, 0)
|
||||
|
||||
@ -48,7 +49,7 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.Alias("t"),
|
||||
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
|
||||
|
||||
options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
|
||||
options.StringSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
|
||||
options.Alias("r"),
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
|
||||
@ -231,13 +232,27 @@ func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
|
||||
}
|
||||
|
||||
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
var p obiseq.SequencePredicate
|
||||
var p2 obiseq.SequencePredicate
|
||||
|
||||
if len(_BelongTaxa) > 0 {
|
||||
taxonomy := CLILoadSelectedTaxonomy()
|
||||
p := taxonomy.IsSubCladeOf(_BelongTaxa[0])
|
||||
|
||||
for _, taxid := range _BelongTaxa[1:] {
|
||||
p = p.Or(taxonomy.IsSubCladeOf(taxid))
|
||||
taxid, err := strconv.Atoi(_BelongTaxa[0])
|
||||
if err != nil {
|
||||
p = taxonomy.IsSubCladeOfSlot(_BelongTaxa[0])
|
||||
} else {
|
||||
p = taxonomy.IsSubCladeOf(taxid)
|
||||
}
|
||||
for _, staxid := range _BelongTaxa[1:] {
|
||||
taxid, err := strconv.Atoi(staxid)
|
||||
if err != nil {
|
||||
p2 = taxonomy.IsSubCladeOfSlot(staxid)
|
||||
} else {
|
||||
p2 = taxonomy.IsSubCladeOf(taxid)
|
||||
}
|
||||
|
||||
p = p.Or(p2)
|
||||
}
|
||||
|
||||
return p
|
||||
|
@ -12,21 +12,32 @@ import (
|
||||
|
||||
type IndexedSequenceSlice struct {
|
||||
Sequences obiseq.BioSequenceSlice
|
||||
Indices []map[interface{}]*obiutils.Set[int]
|
||||
Indices []map[string]*obiutils.Set[int]
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Len() int {
|
||||
return len(s.Sequences)
|
||||
}
|
||||
|
||||
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
|
||||
func (s IndexedSequenceSlice) Get(keys ...string) *obiseq.BioSequenceSlice {
|
||||
var keeps obiutils.Set[int]
|
||||
|
||||
for i, v := range s.Indices {
|
||||
|
||||
if i == 0 {
|
||||
keeps = *v[keys[0]]
|
||||
p, ok := v[keys[0]]
|
||||
if !ok {
|
||||
keeps = obiutils.MakeSet[int]()
|
||||
break
|
||||
}
|
||||
keeps = *p
|
||||
} else {
|
||||
keeps = keeps.Intersection(*v[keys[i]])
|
||||
p, ok := v[keys[i]]
|
||||
if !ok {
|
||||
keeps = obiutils.MakeSet[int]()
|
||||
break
|
||||
}
|
||||
keeps = keeps.Intersection(*p)
|
||||
}
|
||||
}
|
||||
|
||||
@ -39,14 +50,14 @@ func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice
|
||||
}
|
||||
|
||||
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
|
||||
indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
|
||||
indices := make([]map[string]*obiutils.Set[int], len(keys))
|
||||
|
||||
for i, k := range keys {
|
||||
idx := make(map[interface{}]*obiutils.Set[int])
|
||||
idx := make(map[string]*obiutils.Set[int])
|
||||
|
||||
for j, seq := range seqs {
|
||||
|
||||
if value, ok := seq.GetAttribute(k); ok {
|
||||
if value, ok := seq.GetStringAttribute(k); ok {
|
||||
goods, ok := idx[value]
|
||||
if !ok {
|
||||
goods = obiutils.NewSet[int]()
|
||||
@ -67,10 +78,10 @@ func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSeq
|
||||
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
var ok bool
|
||||
|
||||
keys := make([]interface{}, len(by))
|
||||
keys := make([]string, len(by))
|
||||
|
||||
for i, v := range by {
|
||||
keys[i], ok = sequence.GetAttribute(v)
|
||||
keys[i], ok = sequence.GetStringAttribute(v)
|
||||
if !ok {
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
460
pkg/obitools/obitag2/obitag.go
Normal file
460
pkg/obitools/obitag2/obitag.go
Normal file
@ -0,0 +1,460 @@
|
||||
package obitag2
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"golang.org/x/exp/maps"
|
||||
"golang.org/x/exp/slices"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
type Obitag2Match struct {
|
||||
Taxon *obitax.TaxNode
|
||||
Id string
|
||||
Weight int
|
||||
}
|
||||
type Obitag2RefDB struct {
|
||||
Taxonomy *obitax.Taxonomy
|
||||
|
||||
Full *obiseq.BioSequenceSlice
|
||||
FullCounts *[]*obikmer.Table4mer
|
||||
FullTaxa *obitax.TaxonSet
|
||||
|
||||
Clusters *obiseq.BioSequenceSlice
|
||||
ClusterCounts *[]*obikmer.Table4mer
|
||||
ClusterTaxa *obitax.TaxonSet
|
||||
|
||||
Families *map[int]*obiseq.BioSequenceSlice
|
||||
FamilyCounts *map[int]*[]*obikmer.Table4mer
|
||||
FamilyTaxa *map[int]obitax.TaxonSet
|
||||
|
||||
ExactTaxid *map[string]*Obitag2Match
|
||||
}
|
||||
|
||||
// MatchDistanceIndex returns the taxid, rank, and scientificName based on the given distance and distanceIdx.
|
||||
//
|
||||
// Parameters:
|
||||
// - distance: The distance to match against the keys in distanceIdx.
|
||||
// - distanceIdx: A map containing distances as keys and corresponding values in the format "taxid@rank@scientificName".
|
||||
//
|
||||
// Returns:
|
||||
// - taxid: The taxid associated with the matched distance.
|
||||
// - rank: The rank associated with the matched distance.
|
||||
// - scientificName: The scientific name associated with the matched distance.
|
||||
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
|
||||
keys := maps.Keys(distanceIdx)
|
||||
slices.Sort(keys)
|
||||
|
||||
i := sort.Search(len(keys), func(i int) bool {
|
||||
return distance <= keys[i]
|
||||
})
|
||||
|
||||
var taxid int
|
||||
var rank string
|
||||
var scientificName string
|
||||
|
||||
if i == len(keys) || distance > keys[len(keys)-1] {
|
||||
taxid = 1
|
||||
rank = "no rank"
|
||||
scientificName = "root"
|
||||
} else {
|
||||
parts := strings.Split(distanceIdx[keys[i]], "@")
|
||||
taxid, _ = strconv.Atoi(parts[0])
|
||||
rank = parts[1]
|
||||
scientificName = parts[2]
|
||||
}
|
||||
|
||||
// log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName)
|
||||
|
||||
return taxid, rank, scientificName
|
||||
}
|
||||
|
||||
// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: the bio sequence to find the closest matches for.
|
||||
// - references: a slice of reference sequences to compare against.
|
||||
// - refcounts: a slice of reference sequence counts.
|
||||
// - runExact: a boolean flag indicating whether to run an exact match.
|
||||
//
|
||||
// Returns:
|
||||
// - bests: a slice of the closest bio sequences.
|
||||
// - maxe: the maximum score.
|
||||
// - bestId: the best ID.
|
||||
// - bestmatch: the best match.
|
||||
// - bestidxs: a slice of the best indexes.
|
||||
func FindClosests(sequence *obiseq.BioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
|
||||
|
||||
var matrix []uint64
|
||||
|
||||
seqwords := obikmer.Count4Mer(sequence, nil, nil)
|
||||
cw := make([]int, len(refcounts))
|
||||
|
||||
for i, ref := range refcounts {
|
||||
cw[i] = obikmer.Common4Mer(seqwords, ref)
|
||||
}
|
||||
|
||||
o := obiutils.Reverse(obiutils.IntOrder(cw), true)
|
||||
|
||||
bests := obiseq.MakeBioSequenceSlice()
|
||||
// bests = append(bests, references[o[0]])
|
||||
bestidxs := make([]int, 0)
|
||||
// bestidxs = append(bestidxs, o[0])
|
||||
bestId := 0.0
|
||||
bestmatch := references[o[0]].Id()
|
||||
|
||||
maxe := -1
|
||||
wordmin := 0
|
||||
|
||||
// sss := make([]int, 0, 10000)
|
||||
// log.Warnf("%v\n%v\n%d", seqwords, refcounts[o[0]], obikmer.Common4Mer(seqwords, refcounts[o[0]]))
|
||||
|
||||
for i, order := range o {
|
||||
ref := references[order]
|
||||
score := int(1e9)
|
||||
|
||||
if cw[order] < wordmin || i > 1000 {
|
||||
break
|
||||
}
|
||||
lcs, alilength := -1, -1
|
||||
switch maxe {
|
||||
case 0:
|
||||
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
|
||||
score = 0
|
||||
alilength = sequence.Len()
|
||||
lcs = alilength
|
||||
}
|
||||
case 1:
|
||||
d, _, _, _ := obialign.D1Or0(sequence, references[order])
|
||||
if d >= 0 {
|
||||
score = d
|
||||
alilength = max(sequence.Len(), ref.Len())
|
||||
lcs = alilength - score
|
||||
}
|
||||
default:
|
||||
lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix)
|
||||
if lcs >= 0 {
|
||||
score = alilength - lcs
|
||||
}
|
||||
}
|
||||
|
||||
// log.Warnf("LCS : %d ALILENGTH : %d score : %d cw : %d", lcs, alilength, score, cw[order])
|
||||
if lcs >= 0 {
|
||||
|
||||
// sss = append(sss, cw[order], alilength-lcs)
|
||||
|
||||
//
|
||||
// We have found a better candidate than never
|
||||
//
|
||||
if maxe == -1 || score < maxe {
|
||||
bests = bests[:0] // Empty the best lists
|
||||
bestidxs = bestidxs[:0] //
|
||||
|
||||
maxe = score // Memorize the best scores
|
||||
wordmin = max(0, max(sequence.Len(), ref.Len())-3-4*maxe)
|
||||
bestId = float64(lcs) / float64(alilength)
|
||||
bestmatch = ref.Id()
|
||||
// log.Warnln(sequence.Id(), ref.Id(), cw[order], maxe, bestId, wordmin)
|
||||
}
|
||||
|
||||
if score == maxe {
|
||||
bests = append(bests, ref)
|
||||
bestidxs = append(bestidxs, order)
|
||||
id := float64(lcs) / float64(alilength)
|
||||
if id > bestId {
|
||||
bestId = id
|
||||
bestmatch = ref.Id()
|
||||
}
|
||||
// log.Println(ref.Id(), maxe, bestId,bestidxs)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
// sequence.SetAttribute("maxe", sss)
|
||||
log.Debugln("Closest Match", sequence.Id(), maxe, bestId, references[bestidxs[0]].Id(), bestidxs, len(bests))
|
||||
return bests, maxe, bestId, bestmatch, bestidxs
|
||||
}
|
||||
|
||||
func (db *Obitag2RefDB) BestConsensus(bests obiseq.BioSequenceSlice, differences int, slot string) *obitax.TaxNode {
|
||||
taxon := (*obitax.TaxNode)(nil)
|
||||
|
||||
for _, best := range bests {
|
||||
idx := best.OBITagRefIndex(slot)
|
||||
if idx == nil {
|
||||
log.Fatalf("Reference database must be indexed first")
|
||||
}
|
||||
|
||||
d := differences
|
||||
identification, ok := idx[d]
|
||||
found := false
|
||||
var parts []string
|
||||
|
||||
/*
|
||||
Here is an horrible hack for xprize challence.
|
||||
With Euka01 the part[0] was equal to "" for at
|
||||
least a sequence consensus. Which is not normal.
|
||||
|
||||
TO BE CHECKED AND CORRECTED
|
||||
|
||||
The problem seems related to idx that doesn't have
|
||||
a 0 distance
|
||||
*/
|
||||
for !found && d >= 0 {
|
||||
for !ok && d >= 0 {
|
||||
identification, ok = idx[d]
|
||||
d--
|
||||
}
|
||||
|
||||
parts = strings.Split(identification, "@")
|
||||
|
||||
found = parts[0] != ""
|
||||
if !found {
|
||||
log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
|
||||
for !ok && d <= 1000 {
|
||||
identification, ok = idx[d]
|
||||
d++
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
match_taxid, err := strconv.Atoi(parts[0])
|
||||
|
||||
if err != nil {
|
||||
log.Panicln("Cannot extract taxid from :", identification)
|
||||
}
|
||||
|
||||
match_taxon, err := db.Taxonomy.Taxon(match_taxid)
|
||||
|
||||
if err != nil {
|
||||
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
|
||||
}
|
||||
|
||||
if taxon != nil {
|
||||
taxon, _ = taxon.LCA(match_taxon)
|
||||
} else {
|
||||
taxon = match_taxon
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return taxon
|
||||
}
|
||||
|
||||
// Identify makes the taxonomic identification of a BioSequence.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: A pointer to a BioSequence to identify.
|
||||
// - references: A BioSequenceSlice.
|
||||
// - refcounts: A slice of pointers to Table4mer.
|
||||
// - taxa: A TaxonSet.
|
||||
// - taxo: A pointer to a Taxonomy.
|
||||
// - runExact: A boolean value indicating whether to run exact matching.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to a BioSequence.
|
||||
func Identify(sequence *obiseq.BioSequence,
|
||||
db *Obitag2RefDB,
|
||||
runExact bool) *obiseq.BioSequence {
|
||||
|
||||
identity := 1.0
|
||||
differences := 0
|
||||
method := "exact match"
|
||||
weight := 0
|
||||
|
||||
var bestmatch string
|
||||
var taxon *obitax.TaxNode
|
||||
|
||||
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
|
||||
if ok {
|
||||
taxon = exacttaxon.Taxon
|
||||
bestmatch = exacttaxon.Id
|
||||
weight = exacttaxon.Weight
|
||||
} else {
|
||||
var bests obiseq.BioSequenceSlice
|
||||
bests, differences, identity, bestmatch, _ = FindClosests(sequence, *db.Clusters, *db.ClusterCounts, runExact)
|
||||
weight = bests.Len() /* Should be the some of the count */
|
||||
ftaxon := (*obitax.TaxNode)(nil)
|
||||
|
||||
if identity >= 0.5 && differences >= 0 {
|
||||
ftaxon = db.BestConsensus(bests, differences, "obitag_ref_index")
|
||||
|
||||
fam := ftaxon.TaxonAtRank("family")
|
||||
|
||||
if fam != nil {
|
||||
ftaxid := fam.Taxid()
|
||||
bests, differences, identity, bestmatch, _ = FindClosests(
|
||||
sequence,
|
||||
*(*db.Families)[ftaxid],
|
||||
*(*db.FamilyCounts)[ftaxid],
|
||||
runExact,
|
||||
)
|
||||
sequence.SetAttribute("obitag_proposed_family", fam.ScientificName())
|
||||
ftaxon = db.BestConsensus(bests, differences, "reffamidx_in")
|
||||
|
||||
}
|
||||
|
||||
} else {
|
||||
// Cannot match any taxon
|
||||
ftaxon, _ = db.Taxonomy.Taxon(1)
|
||||
sequence.SetTaxid(1)
|
||||
}
|
||||
|
||||
taxon = ftaxon
|
||||
method = "lcsfamlily"
|
||||
}
|
||||
|
||||
sequence.SetTaxid(taxon.Taxid())
|
||||
|
||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||
sequence.SetAttribute("obitag_bestid", identity)
|
||||
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||
sequence.SetAttribute("obitag_match_count", weight)
|
||||
sequence.SetAttribute("obitag_similarity_method", method)
|
||||
|
||||
return sequence
|
||||
}
|
||||
|
||||
func IdentifySeqWorker(db *Obitag2RefDB, runExact bool) obiseq.SeqWorker {
|
||||
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
return obiseq.BioSequenceSlice{Identify(sequence, db, runExact)}, nil
|
||||
}
|
||||
}
|
||||
|
||||
func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
taxo *obitax.Taxonomy,
|
||||
) obiiter.IBioSequence {
|
||||
|
||||
refcounts := make(
|
||||
[]*obikmer.Table4mer,
|
||||
len(references))
|
||||
|
||||
taxa := make(obitax.TaxonSet,
|
||||
len(references))
|
||||
|
||||
familydb := make(map[int]*obiseq.BioSequenceSlice)
|
||||
familycounts := make(map[int]*[]*obikmer.Table4mer)
|
||||
familytaxa := make(map[int]obitax.TaxonSet)
|
||||
clusterdb := obiseq.MakeBioSequenceSlice()
|
||||
clustercounts := make([]*obikmer.Table4mer, 0)
|
||||
clustertaxa := make(obitax.TaxonSet, 0)
|
||||
exactmatch := make(map[string]*obiseq.BioSequenceSlice)
|
||||
|
||||
buffer := make([]byte, 0, 1000)
|
||||
|
||||
j := 0
|
||||
for i, seq := range references {
|
||||
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
|
||||
taxa[i], _ = taxo.Taxon(seq.Taxid())
|
||||
|
||||
if is_centrer, _ := seq.GetBoolAttribute("reffamidx_clusterhead"); is_centrer {
|
||||
clusterdb = append(clusterdb, seq)
|
||||
clustercounts = append(clustercounts, refcounts[i])
|
||||
clustertaxa[j] = taxa[i]
|
||||
j++
|
||||
}
|
||||
|
||||
family, _ := seq.GetIntAttribute("family_taxid")
|
||||
|
||||
fs, ok := familydb[family]
|
||||
if !ok {
|
||||
fs = obiseq.NewBioSequenceSlice(0)
|
||||
familydb[family] = fs
|
||||
}
|
||||
|
||||
*fs = append(*fs, seq)
|
||||
|
||||
fc, ok := familycounts[family]
|
||||
|
||||
if !ok {
|
||||
fci := make([]*obikmer.Table4mer, 0)
|
||||
fc = &fci
|
||||
familycounts[family] = fc
|
||||
}
|
||||
|
||||
*fc = append(*fc, refcounts[i])
|
||||
|
||||
ft, ok := familytaxa[family]
|
||||
|
||||
if !ok {
|
||||
ft = make(obitax.TaxonSet, 0)
|
||||
familytaxa[family] = ft
|
||||
}
|
||||
|
||||
ft[len(ft)] = taxa[i]
|
||||
|
||||
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
|
||||
em, ok := exactmatch[seqstr]
|
||||
|
||||
if !ok {
|
||||
em = obiseq.NewBioSequenceSlice()
|
||||
exactmatch[seqstr] = em
|
||||
}
|
||||
|
||||
*em = append(*em, seq)
|
||||
|
||||
}
|
||||
|
||||
exacttaxid := make(map[string]*Obitag2Match, len(exactmatch))
|
||||
for seqstr, seqs := range exactmatch {
|
||||
var err error
|
||||
t, _ := taxo.Taxon((*seqs)[0].Taxid())
|
||||
w := (*seqs)[0].Count()
|
||||
lseqs := seqs.Len()
|
||||
|
||||
for i := 1; i < lseqs; i++ {
|
||||
t2, _ := taxo.Taxon((*seqs)[i].Taxid())
|
||||
t, err = t.LCA(t2)
|
||||
|
||||
if err != nil {
|
||||
log.Panic(err)
|
||||
}
|
||||
|
||||
w += (*seqs)[i].Count()
|
||||
}
|
||||
|
||||
exacttaxid[seqstr] = &Obitag2Match{
|
||||
Taxon: t,
|
||||
Id: (*seqs)[0].Id(),
|
||||
Weight: w,
|
||||
}
|
||||
}
|
||||
|
||||
db := &Obitag2RefDB{
|
||||
Taxonomy: taxo,
|
||||
|
||||
Full: &references,
|
||||
FullCounts: &refcounts,
|
||||
FullTaxa: &taxa,
|
||||
|
||||
Clusters: &clusterdb,
|
||||
ClusterCounts: &clustercounts,
|
||||
ClusterTaxa: &clustertaxa,
|
||||
|
||||
Families: &familydb,
|
||||
FamilyCounts: &familycounts,
|
||||
FamilyTaxa: &familytaxa,
|
||||
|
||||
ExactTaxid: &exacttaxid,
|
||||
}
|
||||
|
||||
worker := IdentifySeqWorker(db, CLIRunExact())
|
||||
|
||||
return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
|
||||
}
|
120
pkg/obitools/obitag2/options.go
Normal file
120
pkg/obitools/obitag2/options.go
Normal file
@ -0,0 +1,120 @@
|
||||
package obitag2
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _RefDB = ""
|
||||
var _SaveRefDB = ""
|
||||
var _RunExact = false
|
||||
var _GeomSim = false
|
||||
|
||||
func TagOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_RefDB, "reference-db", _RefDB,
|
||||
options.Alias("R"),
|
||||
options.Required(),
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("The name of the file containing the reference DB"))
|
||||
|
||||
options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB,
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("The name of a file where to save the reference DB with its indices"))
|
||||
|
||||
options.BoolVar(&_GeomSim, "geometric", _GeomSim,
|
||||
options.Alias("G"),
|
||||
options.Description("Activate the experimental geometric similarity heuristic"))
|
||||
|
||||
// options.BoolVar(&_RunExact, "exact", _RunExact,
|
||||
// options.Alias("E"),
|
||||
// options.Description("Desactivate the heuristic limitating the sequence comparisons"))
|
||||
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every options declared for
|
||||
// the obiuniq command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
obifind.LoadTaxonomyOptionSet(options, true, false)
|
||||
TagOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIRefDBName() string {
|
||||
return _RefDB
|
||||
}
|
||||
|
||||
func CLIRefDB() obiseq.BioSequenceSlice {
|
||||
refdb, err := obiformats.ReadSequencesFromFile(_RefDB)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
|
||||
}
|
||||
|
||||
return refdb.Load()
|
||||
}
|
||||
|
||||
func CLIGeometricMode() bool {
|
||||
return _GeomSim
|
||||
}
|
||||
|
||||
func CLIShouldISaveRefDB() bool {
|
||||
return _SaveRefDB != ""
|
||||
}
|
||||
|
||||
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
|
||||
if CLIShouldISaveRefDB() {
|
||||
idb := obiiter.IBatchOver(db, 1000)
|
||||
|
||||
var newIter obiiter.IBioSequence
|
||||
|
||||
opts := make([]obiformats.WithOption, 0, 10)
|
||||
|
||||
switch obiconvert.CLIOutputFastHeaderFormat() {
|
||||
case "json":
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||
case "obi":
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
|
||||
default:
|
||||
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||
}
|
||||
|
||||
nworkers := obioptions.CLIParallelWorkers() / 4
|
||||
if nworkers < 2 {
|
||||
nworkers = 2
|
||||
}
|
||||
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
|
||||
|
||||
var err error
|
||||
|
||||
switch obiconvert.CLIOutputFormat() {
|
||||
case "fastq":
|
||||
newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...)
|
||||
case "fasta":
|
||||
newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...)
|
||||
default:
|
||||
newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Write file error: %v", err)
|
||||
}
|
||||
|
||||
newIter.Recycle()
|
||||
obiiter.WaitForLastPipe()
|
||||
}
|
||||
}
|
||||
|
||||
func CLIRunExact() bool {
|
||||
return _RunExact
|
||||
}
|
42
pkg/obiutils/counter.go
Normal file
42
pkg/obiutils/counter.go
Normal file
@ -0,0 +1,42 @@
|
||||
package obiutils
|
||||
|
||||
import "sync"
|
||||
|
||||
type Counter struct {
|
||||
Inc func() int
|
||||
Dec func() int
|
||||
Value func() int
|
||||
}
|
||||
|
||||
func NewCounter(initial ...int) *Counter {
|
||||
value := 0
|
||||
lock := sync.Mutex{}
|
||||
|
||||
if len(initial) > 0 {
|
||||
value = initial[0]
|
||||
}
|
||||
|
||||
c := Counter{
|
||||
Inc: func() int {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
value++
|
||||
return value
|
||||
},
|
||||
|
||||
Dec: func() int {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
value--
|
||||
return value
|
||||
},
|
||||
|
||||
Value: func() int {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
return value
|
||||
},
|
||||
}
|
||||
|
||||
return &c
|
||||
}
|
12
pkg/obiutils/strings.go
Normal file
12
pkg/obiutils/strings.go
Normal file
@ -0,0 +1,12 @@
|
||||
package obiutils
|
||||
|
||||
import "unsafe"
|
||||
|
||||
func UnsafeStringFreomBytes(data []byte) string {
|
||||
if len(data) > 0 {
|
||||
s := unsafe.String(unsafe.SliceData(data), len(data))
|
||||
return s
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
Reference in New Issue
Block a user