Xprize update

Former-commit-id: d38919a897961e4d40da3b844057c3fb94fdb6d7
This commit is contained in:
Eric Coissac
2024-07-25 18:09:03 -04:00
parent 4e4fac491f
commit 67665a6b40
18 changed files with 895 additions and 29 deletions

View File

@ -2,7 +2,6 @@ package obichunk
import (
"io/fs"
"io/ioutil"
"os"
"path/filepath"
@ -14,7 +13,7 @@ import (
)
func tempDir() (string, error) {
dir, err := ioutil.TempDir(os.TempDir(), "obiseq_chunks_")
dir, err := os.MkdirTemp(os.TempDir(), "obiseq_chunks_")
if err != nil {
return "", err
}

View File

@ -249,6 +249,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Invalid value for @spacer parameter")
}
log.Infof("Set global spacer to %d bp", spacer)
library.SetTagSpacer(spacer)
case 2:
primer := values[0]
@ -258,6 +259,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Invalid value for @spacer parameter")
}
log.Infof("Set spacer for primer %s to %d bp", primer, spacer)
library.SetTagSpacerFor(primer, spacer)
default:
log.Fatalln("Invalid value for @spacer parameter")
@ -274,6 +276,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Invalid value for @forward_spacer parameter")
}
log.Infof("Set spacer for forward primer to %d bp", spacer)
library.SetForwardTagSpacer(spacer)
default:
log.Fatalln("Invalid value for @forward_spacer parameter")
@ -290,6 +293,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Invalid value for @reverse_spacer parameter")
}
log.Infof("Set spacer for reverse primer to %d bp", spacer)
library.SetReverseTagSpacer(spacer)
default:
log.Fatalln("Invalid value for @reverse_spacer parameter")
@ -301,9 +305,13 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Missing value for @tag_delimiter parameter")
case 1:
value := []byte(values[0])[0]
log.Infof("Set global tag delimiter to %c", value)
library.SetTagDelimiter(value)
case 2:
value := []byte(values[1])[0]
log.Infof("Set tag delimiter for primer %s to %c", values[0], value)
library.SetTagDelimiterFor(values[0], value)
default:
log.Fatalln("Invalid value for @tag_delimiter parameter")
@ -315,6 +323,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Missing value for @forward_tag_delimiter parameter")
case 1:
value := []byte(values[0])[0]
log.Infof("Set tag delimiter for forward primer to %c", value)
library.SetForwardTagDelimiter(value)
default:
log.Fatalln("Invalid value for @forward_tag_delimiter parameter")
@ -326,6 +336,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalln("Missing value for @reverse_tag_delimiter parameter")
case 1:
value := []byte(values[0])[0]
log.Infof("Set tag delimiter for reverse primer to %c", value)
library.SetReverseTagDelimiter(value)
default:
log.Fatalln("Invalid value for @reverse_tag_delimiter parameter")
@ -339,6 +351,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
if err := library.SetMatching(values[0]); err != nil {
log.Fatalf("Invalid value %s for @matching parameter", values[0])
}
log.Infof("Set tag matching mode to %s", values[0])
default:
log.Fatalln("Invalid value for @matching parameter")
}
@ -354,6 +367,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalf("Invalid value %s for @primer_error parameter", values[0])
}
log.Infof("Set global allowed primer mismatches to %d", dist)
library.SetAllowedMismatches(dist)
case 2:
primer := values[0]
@ -363,6 +377,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalf("Invalid value %s for @primer_error parameter", values[1])
}
log.Infof("Set allowed primer mismatches for primer %s to %d", primer, dist)
library.SetAllowedMismatchesFor(primer, dist)
default:
log.Fatalln("Invalid value for @primer_error parameter")
@ -379,6 +394,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalf("Invalid value %s for @forward_primer_error parameter", values[0])
}
log.Infof("Set allowed mismatches for forward primer to %d", dist)
library.SetForwardAllowedMismatches(dist)
default:
log.Fatalln("Invalid value for @forward_primer_error parameter")
@ -395,6 +411,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalf("Invalid value %s for @reverse_primer_error parameter", values[0])
}
log.Infof("Set allowed mismatches for reverse primer to %d", dist)
library.SetReverseAllowedMismatches(dist)
default:
log.Fatalln("Invalid value for @reverse_primer_error parameter")
@ -410,6 +427,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
if err != nil {
log.Fatalf("Invalid value %s for @tag_indels parameter", values[0])
}
log.Infof("Set global maximum tag indels to %d", indels)
library.SetTagIndels(indels)
case 2:
indels, err := strconv.Atoi(values[1])
@ -418,6 +437,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
log.Fatalf("Invalid value %s for @tag_indels parameter", values[1])
}
log.Infof("Set maximum tag indels for primer %s to %d", values[0], indels)
library.SetTagIndelsFor(values[0], indels)
}
},
@ -432,6 +452,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
if err != nil {
log.Fatalf("Invalid value %s for @forward_tag_indels parameter", values[0])
}
log.Infof("Set maximum tag indels for forward primer to %d", indels)
library.SetForwardTagIndels(indels)
default:
log.Fatalln("Invalid value for @forward_tag_indels parameter")
@ -447,6 +469,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
if err != nil {
log.Fatalf("Invalid value %s for @reverse_tag_indels parameter", values[0])
}
log.Infof("Set maximum tag indels for reverse primer to %d", indels)
library.SetReverseTagIndels(indels)
default:
log.Fatalln("Invalid value for @reverse_tag_indels parameter")
@ -457,8 +481,22 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
case 0:
log.Fatalln("Missing value for @indels parameter")
case 1:
if values[0] == "true" {
log.Info("Allows indels for primer matching")
} else {
log.Info("Disallows indels for primer matching")
}
library.SetAllowsIndels(values[0] == "true")
case 2:
if values[1] == "true" {
log.Infof("Allows indels for primer matching %s", values[0])
} else {
log.Infof("Disallows indels for primer matching %s", values[0])
}
library.SetAllowsIndelsFor(values[0], values[1] == "true")
default:
log.Fatalln("Invalid value for @indels parameter")
@ -470,6 +508,12 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
case 0:
log.Fatalln("Missing value for @forward_indels parameter")
case 1:
if values[0] == "true" {
log.Info("Allows indels for forward primer matching")
} else {
log.Info("Disallows indels for forward primer matching")
}
library.SetForwardAllowsIndels(values[0] == "true")
default:
log.Fatalln("Invalid value for @forward_indels parameter")
@ -480,6 +524,11 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value
case 0:
log.Fatalln("Missing value for @reverse_indels parameter")
case 1:
if values[0] == "true" {
log.Info("Allows indels for reverse primer matching")
} else {
log.Info("Disallows indels for reverse primer matching")
}
library.SetReverseAllowsIndels(values[0] == "true")
default:
log.Fatalln("Invalid value for @reverse_indels parameter")

View File

@ -607,7 +607,7 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
}
}
log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
log.Debugf("Heaviest node: %d [%v]", heaviestNode, heaviestWeight)
// Reconstruct the path from the start node to the heaviest node found
heaviestPath := make([]uint64, 0)
currentNode := heaviestNode

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This induces that the version
// corresponds to the last commit, and not the one when the file will be
// commited
var _Commit = "f265a39"
var _Commit = "4d86483"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -415,11 +415,16 @@ func (s *BioSequence) SetTaxid(taxid int) {
s.SetAttribute("taxid", taxid)
}
func (s *BioSequence) OBITagRefIndex() map[int]string {
func (s *BioSequence) OBITagRefIndex(slot ...string) map[int]string {
key := "obitag_ref_index"
if len(slot) > 0 {
key = slot[0]
}
var val map[int]string
i, ok := s.GetAttribute("obitag_ref_index")
i, ok := s.GetAttribute(key)
if !ok {
return nil

View File

@ -202,4 +202,8 @@ var OBILang = gval.NewLanguage(
scomp[string(k)] = float64(v)
}
return scomp, nil
}))
}),
gval.Function("replace", func(args ...interface{}) (interface{}, error) {
return strings.ReplaceAll(args[0].(string), args[1].(string), args[2].(string)), nil
}),
)

View File

@ -66,3 +66,27 @@ func (taxonomy *Taxonomy) SetPath(sequence *obiseq.BioSequence) string {
return tpath
}
func (taxonomy *Taxonomy) SetScientificName(sequence *obiseq.BioSequence) string {
taxid, err := taxonomy.Taxon(sequence.Taxid())
if err != nil {
log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid())
}
sequence.SetAttribute("scienctific_name", taxid.ScientificName())
return taxid.ScientificName()
}
func (taxonomy *Taxonomy) SetTaxonomicRank(sequence *obiseq.BioSequence) string {
taxid, err := taxonomy.Taxon(sequence.Taxid())
if err != nil {
log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid())
}
sequence.SetAttribute("taxonomic_rank", taxid.Rank())
return taxid.Rank()
}

View File

@ -57,6 +57,23 @@ func (taxonomy *Taxonomy) IsSubCladeOf(taxid int) obiseq.SequencePredicate {
return f
}
func (taxonomy *Taxonomy) IsSubCladeOfSlot(key string) obiseq.SequencePredicate {
f := func(sequence *obiseq.BioSequence) bool {
val, ok := sequence.GetStringAttribute(key)
if ok {
parent, err1 := taxonomy.Taxon(val)
taxon, err2 := taxonomy.Taxon(sequence.Taxid())
return err1 == nil && err2 == nil && taxon.IsSubCladeOf(parent)
}
return false
}
return f
}
func (taxonomy *Taxonomy) HasRequiredRank(rank string) obiseq.SequencePredicate {
if !obiutils.Contains(taxonomy.RankList(), rank) {

View File

@ -2,6 +2,8 @@ package obitax
import (
"fmt"
"regexp"
"strconv"
)
type TaxName struct {
@ -54,11 +56,43 @@ func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace boo
return n, nil
}
func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
t, ok := (*taxonomy.nodes)[taxid]
// func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
// t, ok := (*taxonomy.nodes)[taxid]
// if !ok {
// a, aok := taxonomy.alias[taxid]
// if !aok {
// return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid)
// }
// t = a
// }
// return t, nil
// }
func (taxonomy *Taxonomy) Taxon(taxid interface{}) (*TaxNode, error) {
var itaxid int
var err error
switch v := taxid.(type) {
case int:
itaxid = v
case string:
itaxid, err = strconv.Atoi(v)
if err != nil {
re := regexp.MustCompile(`TX:(\d+)`)
parts := re.FindStringSubmatch(v)
if len(parts) != 2 {
return nil, fmt.Errorf("I cannot parse taxid from %s", v)
}
itaxid, _ = strconv.Atoi(parts[1])
}
}
t, ok := (*taxonomy.nodes)[itaxid]
if !ok {
a, aok := taxonomy.alias[taxid]
a, aok := taxonomy.alias[itaxid]
if !aok {
return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid)
}
@ -66,7 +100,6 @@ func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) {
}
return t, nil
}
func (taxonomy *Taxonomy) AddNewName(taxid int, name, nameclass *string) error {
node, node_err := taxonomy.Taxon(taxid)
if node_err != nil {

View File

@ -215,6 +215,24 @@ func AddTaxonAtRankWorker(taxonomy *obitax.Taxonomy, ranks ...string) obiseq.Seq
return f
}
func AddTaxonRankWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetTaxonomicRank(s)
return obiseq.BioSequenceSlice{s}, nil
}
return f
}
func AddScientificNameWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
taxonomy.SetScientificName(s)
return obiseq.BioSequenceSlice{s}, nil
}
return f
}
func AddSeqLengthWorker() obiseq.SeqWorker {
f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
s.SetAttribute("seq_length", s.Len())
@ -266,6 +284,18 @@ func CLIAnnotationWorker() obiseq.SeqWorker {
annotator = annotator.ChainWorkers(w)
}
if CLISetTaxonomicRank() {
taxo := obigrep.CLILoadSelectedTaxonomy()
w := AddTaxonRankWorker(taxo)
annotator = annotator.ChainWorkers(w)
}
if CLISetScientificName() {
taxo := obigrep.CLILoadSelectedTaxonomy()
w := AddScientificNameWorker(taxo)
annotator = annotator.ChainWorkers(w)
}
if CLIHasAddLCA() {
taxo := obigrep.CLILoadSelectedTaxonomy()
w := obitax.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold())

View File

@ -32,6 +32,8 @@ var _lcaError = 0.0
var _setId = ""
var _cut = ""
var _taxonomicPath = false
var _withRank = false
var _withScientificName = false
func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
// options.BoolVar(&_addRank, "seq-rank", _addRank,
@ -117,6 +119,12 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_taxonomicPath, "taxonomic-path", _taxonomicPath,
options.Description("Annotate the sequence with its taxonomic path"))
options.BoolVar(&_withRank, "taxonomic-rank", _withRank,
options.Description("Annotate the sequence with its taxonomic rank"))
options.BoolVar(&_withScientificName, "scientific-name", _withScientificName,
options.Description("Annotate the sequence with its scientific name"))
// options.StringVar(&_tagList, "tag-list", _tagList,
// options.ArgName("FILENAME"),
// options.Description("<FILENAME> points to a file containing attribute names"+
@ -307,3 +315,11 @@ func CLIPatternInDels() bool {
func CLISetTaxonomicPath() bool {
return _taxonomicPath
}
func CLISetTaxonomicRank() bool {
return _withRank
}
func CLISetScientificName() bool {
return _withScientificName
}

View File

@ -52,7 +52,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
log.Printf("Number of reads : %d\n", len(seqs))
log.Debugf("Number of reads : %d\n", len(seqs))
if kmer_size < 0 {
longest := make([]int, len(seqs))
@ -64,7 +64,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
kmer_size = slices.Max(longest) + 1
log.Printf("estimated kmer size : %d", kmer_size)
log.Debugf("estimated kmer size : %d", kmer_size)
}
var graph *obikmer.DeBruijnGraph
@ -80,7 +80,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
kmer_size++
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
log.Debugf("Cycle detected, increasing kmer size to %d\n", kmer_size)
}
if save_graph {
@ -96,7 +96,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
}
log.Printf("Graph size : %d\n", graph.Len())
log.Debugf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
seq, err := graph.LongestConsensus(consensus_id)
@ -260,6 +260,21 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
sample_key string, kmer_size int) obiseq.BioSequenceSlice {
denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices))
bar := (*progressbar.ProgressBar)(nil)
if obiconvert.CLIProgressBar() {
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription(fmt.Sprintf("[Build consensus] on %s", graph.Name)),
)
bar = progressbar.NewOptions(len(*graph.Vertices), pbopt...)
}
for i, v := range *graph.Vertices {
var err error
var clean *obiseq.BioSequence
@ -283,6 +298,7 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
clean.SetAttribute("obiconsensus_consensus", true)
}
pack.Recycle(false)
} else {
clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition())
clean.SetAttribute("obiconsensus_consensus", false)
@ -295,7 +311,20 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
clean.SetAttribute("obiconsensus_weight", int(1))
}
annotations := v.Annotations()
for k, v := range annotations {
if !clean.HasAttribute(k) {
clean.SetAttribute(k, v)
}
}
denoised[i] = clean
if bar != nil {
bar.Add(1)
}
}
return denoised

View File

@ -1,6 +1,7 @@
package obigrep
import (
"strconv"
"strings"
log "github.com/sirupsen/logrus"
@ -13,7 +14,7 @@ import (
"github.com/DavidGamba/go-getoptions"
)
var _BelongTaxa = make([]int, 0)
var _BelongTaxa = make([]string, 0)
var _NotBelongTaxa = make([]int, 0)
var _RequiredRanks = make([]string, 0)
@ -48,7 +49,7 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.Alias("t"),
options.Description("Points to the directory containing the NCBI Taxonomy database dump."))
options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
options.StringSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1,
options.Alias("r"),
options.ArgName("TAXID"),
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
@ -231,13 +232,27 @@ func CLILoadSelectedTaxonomy() *obitax.Taxonomy {
}
func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
var p obiseq.SequencePredicate
var p2 obiseq.SequencePredicate
if len(_BelongTaxa) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := taxonomy.IsSubCladeOf(_BelongTaxa[0])
for _, taxid := range _BelongTaxa[1:] {
p = p.Or(taxonomy.IsSubCladeOf(taxid))
taxid, err := strconv.Atoi(_BelongTaxa[0])
if err != nil {
p = taxonomy.IsSubCladeOfSlot(_BelongTaxa[0])
} else {
p = taxonomy.IsSubCladeOf(taxid)
}
for _, staxid := range _BelongTaxa[1:] {
taxid, err := strconv.Atoi(staxid)
if err != nil {
p2 = taxonomy.IsSubCladeOfSlot(staxid)
} else {
p2 = taxonomy.IsSubCladeOf(taxid)
}
p = p.Or(p2)
}
return p

View File

@ -12,21 +12,32 @@ import (
type IndexedSequenceSlice struct {
Sequences obiseq.BioSequenceSlice
Indices []map[interface{}]*obiutils.Set[int]
Indices []map[string]*obiutils.Set[int]
}
func (s IndexedSequenceSlice) Len() int {
return len(s.Sequences)
}
func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice {
func (s IndexedSequenceSlice) Get(keys ...string) *obiseq.BioSequenceSlice {
var keeps obiutils.Set[int]
for i, v := range s.Indices {
if i == 0 {
keeps = *v[keys[0]]
p, ok := v[keys[0]]
if !ok {
keeps = obiutils.MakeSet[int]()
break
}
keeps = *p
} else {
keeps = keeps.Intersection(*v[keys[i]])
p, ok := v[keys[i]]
if !ok {
keeps = obiutils.MakeSet[int]()
break
}
keeps = keeps.Intersection(*p)
}
}
@ -39,14 +50,14 @@ func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice
}
func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice {
indices := make([]map[interface{}]*obiutils.Set[int], len(keys))
indices := make([]map[string]*obiutils.Set[int], len(keys))
for i, k := range keys {
idx := make(map[interface{}]*obiutils.Set[int])
idx := make(map[string]*obiutils.Set[int])
for j, seq := range seqs {
if value, ok := seq.GetAttribute(k); ok {
if value, ok := seq.GetStringAttribute(k); ok {
goods, ok := idx[value]
if !ok {
goods = obiutils.NewSet[int]()
@ -67,10 +78,10 @@ func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSeq
f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
var ok bool
keys := make([]interface{}, len(by))
keys := make([]string, len(by))
for i, v := range by {
keys[i], ok = sequence.GetAttribute(v)
keys[i], ok = sequence.GetStringAttribute(v)
if !ok {
return obiseq.BioSequenceSlice{sequence}, nil
}

View File

@ -0,0 +1,460 @@
package obitag2
import (
"sort"
"strconv"
"strings"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
type Obitag2Match struct {
Taxon *obitax.TaxNode
Id string
Weight int
}
type Obitag2RefDB struct {
Taxonomy *obitax.Taxonomy
Full *obiseq.BioSequenceSlice
FullCounts *[]*obikmer.Table4mer
FullTaxa *obitax.TaxonSet
Clusters *obiseq.BioSequenceSlice
ClusterCounts *[]*obikmer.Table4mer
ClusterTaxa *obitax.TaxonSet
Families *map[int]*obiseq.BioSequenceSlice
FamilyCounts *map[int]*[]*obikmer.Table4mer
FamilyTaxa *map[int]obitax.TaxonSet
ExactTaxid *map[string]*Obitag2Match
}
// MatchDistanceIndex returns the taxid, rank, and scientificName based on the given distance and distanceIdx.
//
// Parameters:
// - distance: The distance to match against the keys in distanceIdx.
// - distanceIdx: A map containing distances as keys and corresponding values in the format "taxid@rank@scientificName".
//
// Returns:
// - taxid: The taxid associated with the matched distance.
// - rank: The rank associated with the matched distance.
// - scientificName: The scientific name associated with the matched distance.
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
keys := maps.Keys(distanceIdx)
slices.Sort(keys)
i := sort.Search(len(keys), func(i int) bool {
return distance <= keys[i]
})
var taxid int
var rank string
var scientificName string
if i == len(keys) || distance > keys[len(keys)-1] {
taxid = 1
rank = "no rank"
scientificName = "root"
} else {
parts := strings.Split(distanceIdx[keys[i]], "@")
taxid, _ = strconv.Atoi(parts[0])
rank = parts[1]
scientificName = parts[2]
}
// log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName)
return taxid, rank, scientificName
}
// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences.
//
// Parameters:
// - sequence: the bio sequence to find the closest matches for.
// - references: a slice of reference sequences to compare against.
// - refcounts: a slice of reference sequence counts.
// - runExact: a boolean flag indicating whether to run an exact match.
//
// Returns:
// - bests: a slice of the closest bio sequences.
// - maxe: the maximum score.
// - bestId: the best ID.
// - bestmatch: the best match.
// - bestidxs: a slice of the best indexes.
func FindClosests(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
var matrix []uint64
seqwords := obikmer.Count4Mer(sequence, nil, nil)
cw := make([]int, len(refcounts))
for i, ref := range refcounts {
cw[i] = obikmer.Common4Mer(seqwords, ref)
}
o := obiutils.Reverse(obiutils.IntOrder(cw), true)
bests := obiseq.MakeBioSequenceSlice()
// bests = append(bests, references[o[0]])
bestidxs := make([]int, 0)
// bestidxs = append(bestidxs, o[0])
bestId := 0.0
bestmatch := references[o[0]].Id()
maxe := -1
wordmin := 0
// sss := make([]int, 0, 10000)
// log.Warnf("%v\n%v\n%d", seqwords, refcounts[o[0]], obikmer.Common4Mer(seqwords, refcounts[o[0]]))
for i, order := range o {
ref := references[order]
score := int(1e9)
if cw[order] < wordmin || i > 1000 {
break
}
lcs, alilength := -1, -1
switch maxe {
case 0:
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
score = 0
alilength = sequence.Len()
lcs = alilength
}
case 1:
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
score = d
alilength = max(sequence.Len(), ref.Len())
lcs = alilength - score
}
default:
lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix)
if lcs >= 0 {
score = alilength - lcs
}
}
// log.Warnf("LCS : %d ALILENGTH : %d score : %d cw : %d", lcs, alilength, score, cw[order])
if lcs >= 0 {
// sss = append(sss, cw[order], alilength-lcs)
//
// We have found a better candidate than never
//
if maxe == -1 || score < maxe {
bests = bests[:0] // Empty the best lists
bestidxs = bestidxs[:0] //
maxe = score // Memorize the best scores
wordmin = max(0, max(sequence.Len(), ref.Len())-3-4*maxe)
bestId = float64(lcs) / float64(alilength)
bestmatch = ref.Id()
// log.Warnln(sequence.Id(), ref.Id(), cw[order], maxe, bestId, wordmin)
}
if score == maxe {
bests = append(bests, ref)
bestidxs = append(bestidxs, order)
id := float64(lcs) / float64(alilength)
if id > bestId {
bestId = id
bestmatch = ref.Id()
}
// log.Println(ref.Id(), maxe, bestId,bestidxs)
}
}
}
// sequence.SetAttribute("maxe", sss)
log.Debugln("Closest Match", sequence.Id(), maxe, bestId, references[bestidxs[0]].Id(), bestidxs, len(bests))
return bests, maxe, bestId, bestmatch, bestidxs
}
func (db *Obitag2RefDB) BestConsensus(bests obiseq.BioSequenceSlice, differences int, slot string) *obitax.TaxNode {
taxon := (*obitax.TaxNode)(nil)
for _, best := range bests {
idx := best.OBITagRefIndex(slot)
if idx == nil {
log.Fatalf("Reference database must be indexed first")
}
d := differences
identification, ok := idx[d]
found := false
var parts []string
/*
Here is an horrible hack for xprize challence.
With Euka01 the part[0] was equal to "" for at
least a sequence consensus. Which is not normal.
TO BE CHECKED AND CORRECTED
The problem seems related to idx that doesn't have
a 0 distance
*/
for !found && d >= 0 {
for !ok && d >= 0 {
identification, ok = idx[d]
d--
}
parts = strings.Split(identification, "@")
found = parts[0] != ""
if !found {
log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
for !ok && d <= 1000 {
identification, ok = idx[d]
d++
}
}
}
match_taxid, err := strconv.Atoi(parts[0])
if err != nil {
log.Panicln("Cannot extract taxid from :", identification)
}
match_taxon, err := db.Taxonomy.Taxon(match_taxid)
if err != nil {
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
}
if taxon != nil {
taxon, _ = taxon.LCA(match_taxon)
} else {
taxon = match_taxon
}
}
return taxon
}
// Identify makes the taxonomic identification of a BioSequence.
//
// Parameters:
// - sequence: A pointer to a BioSequence to identify.
// - references: A BioSequenceSlice.
// - refcounts: A slice of pointers to Table4mer.
// - taxa: A TaxonSet.
// - taxo: A pointer to a Taxonomy.
// - runExact: A boolean value indicating whether to run exact matching.
//
// Returns:
// - A pointer to a BioSequence.
func Identify(sequence *obiseq.BioSequence,
db *Obitag2RefDB,
runExact bool) *obiseq.BioSequence {
identity := 1.0
differences := 0
method := "exact match"
weight := 0
var bestmatch string
var taxon *obitax.TaxNode
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
if ok {
taxon = exacttaxon.Taxon
bestmatch = exacttaxon.Id
weight = exacttaxon.Weight
} else {
var bests obiseq.BioSequenceSlice
bests, differences, identity, bestmatch, _ = FindClosests(sequence, *db.Clusters, *db.ClusterCounts, runExact)
weight = bests.Len() /* Should be the some of the count */
ftaxon := (*obitax.TaxNode)(nil)
if identity >= 0.5 && differences >= 0 {
ftaxon = db.BestConsensus(bests, differences, "obitag_ref_index")
fam := ftaxon.TaxonAtRank("family")
if fam != nil {
ftaxid := fam.Taxid()
bests, differences, identity, bestmatch, _ = FindClosests(
sequence,
*(*db.Families)[ftaxid],
*(*db.FamilyCounts)[ftaxid],
runExact,
)
sequence.SetAttribute("obitag_proposed_family", fam.ScientificName())
ftaxon = db.BestConsensus(bests, differences, "reffamidx_in")
}
} else {
// Cannot match any taxon
ftaxon, _ = db.Taxonomy.Taxon(1)
sequence.SetTaxid(1)
}
taxon = ftaxon
method = "lcsfamlily"
}
sequence.SetTaxid(taxon.Taxid())
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", weight)
sequence.SetAttribute("obitag_similarity_method", method)
return sequence
}
func IdentifySeqWorker(db *Obitag2RefDB, runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
return obiseq.BioSequenceSlice{Identify(sequence, db, runExact)}, nil
}
}
func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
references obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy,
) obiiter.IBioSequence {
refcounts := make(
[]*obikmer.Table4mer,
len(references))
taxa := make(obitax.TaxonSet,
len(references))
familydb := make(map[int]*obiseq.BioSequenceSlice)
familycounts := make(map[int]*[]*obikmer.Table4mer)
familytaxa := make(map[int]obitax.TaxonSet)
clusterdb := obiseq.MakeBioSequenceSlice()
clustercounts := make([]*obikmer.Table4mer, 0)
clustertaxa := make(obitax.TaxonSet, 0)
exactmatch := make(map[string]*obiseq.BioSequenceSlice)
buffer := make([]byte, 0, 1000)
j := 0
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[i], _ = taxo.Taxon(seq.Taxid())
if is_centrer, _ := seq.GetBoolAttribute("reffamidx_clusterhead"); is_centrer {
clusterdb = append(clusterdb, seq)
clustercounts = append(clustercounts, refcounts[i])
clustertaxa[j] = taxa[i]
j++
}
family, _ := seq.GetIntAttribute("family_taxid")
fs, ok := familydb[family]
if !ok {
fs = obiseq.NewBioSequenceSlice(0)
familydb[family] = fs
}
*fs = append(*fs, seq)
fc, ok := familycounts[family]
if !ok {
fci := make([]*obikmer.Table4mer, 0)
fc = &fci
familycounts[family] = fc
}
*fc = append(*fc, refcounts[i])
ft, ok := familytaxa[family]
if !ok {
ft = make(obitax.TaxonSet, 0)
familytaxa[family] = ft
}
ft[len(ft)] = taxa[i]
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
em, ok := exactmatch[seqstr]
if !ok {
em = obiseq.NewBioSequenceSlice()
exactmatch[seqstr] = em
}
*em = append(*em, seq)
}
exacttaxid := make(map[string]*Obitag2Match, len(exactmatch))
for seqstr, seqs := range exactmatch {
var err error
t, _ := taxo.Taxon((*seqs)[0].Taxid())
w := (*seqs)[0].Count()
lseqs := seqs.Len()
for i := 1; i < lseqs; i++ {
t2, _ := taxo.Taxon((*seqs)[i].Taxid())
t, err = t.LCA(t2)
if err != nil {
log.Panic(err)
}
w += (*seqs)[i].Count()
}
exacttaxid[seqstr] = &Obitag2Match{
Taxon: t,
Id: (*seqs)[0].Id(),
Weight: w,
}
}
db := &Obitag2RefDB{
Taxonomy: taxo,
Full: &references,
FullCounts: &refcounts,
FullTaxa: &taxa,
Clusters: &clusterdb,
ClusterCounts: &clustercounts,
ClusterTaxa: &clustertaxa,
Families: &familydb,
FamilyCounts: &familycounts,
FamilyTaxa: &familytaxa,
ExactTaxid: &exacttaxid,
}
worker := IdentifySeqWorker(db, CLIRunExact())
return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0)
}

View File

@ -0,0 +1,120 @@
package obitag2
import (
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
var _RefDB = ""
var _SaveRefDB = ""
var _RunExact = false
var _GeomSim = false
func TagOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_RefDB, "reference-db", _RefDB,
options.Alias("R"),
options.Required(),
options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB"))
options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB,
options.ArgName("FILENAME"),
options.Description("The name of a file where to save the reference DB with its indices"))
options.BoolVar(&_GeomSim, "geometric", _GeomSim,
options.Alias("G"),
options.Description("Activate the experimental geometric similarity heuristic"))
// options.BoolVar(&_RunExact, "exact", _RunExact,
// options.Alias("E"),
// options.Description("Desactivate the heuristic limitating the sequence comparisons"))
}
// OptionSet adds to the basic option set every options declared for
// the obiuniq command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options, true, false)
TagOptionSet(options)
}
func CLIRefDBName() string {
return _RefDB
}
func CLIRefDB() obiseq.BioSequenceSlice {
refdb, err := obiformats.ReadSequencesFromFile(_RefDB)
if err != nil {
log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
}
return refdb.Load()
}
func CLIGeometricMode() bool {
return _GeomSim
}
func CLIShouldISaveRefDB() bool {
return _SaveRefDB != ""
}
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000)
var newIter obiiter.IBioSequence
opts := make([]obiformats.WithOption, 0, 10)
switch obiconvert.CLIOutputFastHeaderFormat() {
case "json":
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
case "obi":
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
default:
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
}
nworkers := obioptions.CLIParallelWorkers() / 4
if nworkers < 2 {
nworkers = 2
}
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
var err error
switch obiconvert.CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...)
}
if err != nil {
log.Fatalf("Write file error: %v", err)
}
newIter.Recycle()
obiiter.WaitForLastPipe()
}
}
func CLIRunExact() bool {
return _RunExact
}

42
pkg/obiutils/counter.go Normal file
View File

@ -0,0 +1,42 @@
package obiutils
import "sync"
type Counter struct {
Inc func() int
Dec func() int
Value func() int
}
func NewCounter(initial ...int) *Counter {
value := 0
lock := sync.Mutex{}
if len(initial) > 0 {
value = initial[0]
}
c := Counter{
Inc: func() int {
lock.Lock()
defer lock.Unlock()
value++
return value
},
Dec: func() int {
lock.Lock()
defer lock.Unlock()
value--
return value
},
Value: func() int {
lock.Lock()
defer lock.Unlock()
return value
},
}
return &c
}

12
pkg/obiutils/strings.go Normal file
View File

@ -0,0 +1,12 @@
package obiutils
import "unsafe"
func UnsafeStringFreomBytes(data []byte) string {
if len(data) > 0 {
s := unsafe.String(unsafe.SliceData(data), len(data))
return s
}
return ""
}