diff --git a/pkg/obichunk/chunk_on_disk.go b/pkg/obichunk/chunk_on_disk.go index 4b7fe61..c378c00 100644 --- a/pkg/obichunk/chunk_on_disk.go +++ b/pkg/obichunk/chunk_on_disk.go @@ -2,7 +2,6 @@ package obichunk import ( "io/fs" - "io/ioutil" "os" "path/filepath" @@ -14,7 +13,7 @@ import ( ) func tempDir() (string, error) { - dir, err := ioutil.TempDir(os.TempDir(), "obiseq_chunks_") + dir, err := os.MkdirTemp(os.TempDir(), "obiseq_chunks_") if err != nil { return "", err } diff --git a/pkg/obiformats/ngsfilter_read.go b/pkg/obiformats/ngsfilter_read.go index 7ed1e7a..ef51e37 100644 --- a/pkg/obiformats/ngsfilter_read.go +++ b/pkg/obiformats/ngsfilter_read.go @@ -249,6 +249,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Invalid value for @spacer parameter") } + log.Infof("Set global spacer to %d bp", spacer) library.SetTagSpacer(spacer) case 2: primer := values[0] @@ -258,6 +259,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Invalid value for @spacer parameter") } + log.Infof("Set spacer for primer %s to %d bp", primer, spacer) library.SetTagSpacerFor(primer, spacer) default: log.Fatalln("Invalid value for @spacer parameter") @@ -274,6 +276,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Invalid value for @forward_spacer parameter") } + log.Infof("Set spacer for forward primer to %d bp", spacer) library.SetForwardTagSpacer(spacer) default: log.Fatalln("Invalid value for @forward_spacer parameter") @@ -290,6 +293,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Invalid value for @reverse_spacer parameter") } + log.Infof("Set spacer for reverse primer to %d bp", spacer) library.SetReverseTagSpacer(spacer) default: log.Fatalln("Invalid value for @reverse_spacer parameter") @@ -301,9 +305,13 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Missing value for @tag_delimiter parameter") case 1: value := []byte(values[0])[0] + + log.Infof("Set global tag delimiter to %c", value) library.SetTagDelimiter(value) case 2: value := []byte(values[1])[0] + + log.Infof("Set tag delimiter for primer %s to %c", values[0], value) library.SetTagDelimiterFor(values[0], value) default: log.Fatalln("Invalid value for @tag_delimiter parameter") @@ -315,6 +323,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Missing value for @forward_tag_delimiter parameter") case 1: value := []byte(values[0])[0] + + log.Infof("Set tag delimiter for forward primer to %c", value) library.SetForwardTagDelimiter(value) default: log.Fatalln("Invalid value for @forward_tag_delimiter parameter") @@ -326,6 +336,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalln("Missing value for @reverse_tag_delimiter parameter") case 1: value := []byte(values[0])[0] + + log.Infof("Set tag delimiter for reverse primer to %c", value) library.SetReverseTagDelimiter(value) default: log.Fatalln("Invalid value for @reverse_tag_delimiter parameter") @@ -339,6 +351,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value if err := library.SetMatching(values[0]); err != nil { log.Fatalf("Invalid value %s for @matching parameter", values[0]) } + log.Infof("Set tag matching mode to %s", values[0]) default: log.Fatalln("Invalid value for @matching parameter") } @@ -354,6 +367,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @primer_error parameter", values[0]) } + log.Infof("Set global allowed primer mismatches to %d", dist) library.SetAllowedMismatches(dist) case 2: primer := values[0] @@ -363,6 +377,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @primer_error parameter", values[1]) } + log.Infof("Set allowed primer mismatches for primer %s to %d", primer, dist) library.SetAllowedMismatchesFor(primer, dist) default: log.Fatalln("Invalid value for @primer_error parameter") @@ -379,6 +394,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @forward_primer_error parameter", values[0]) } + log.Infof("Set allowed mismatches for forward primer to %d", dist) library.SetForwardAllowedMismatches(dist) default: log.Fatalln("Invalid value for @forward_primer_error parameter") @@ -395,6 +411,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @reverse_primer_error parameter", values[0]) } + log.Infof("Set allowed mismatches for reverse primer to %d", dist) library.SetReverseAllowedMismatches(dist) default: log.Fatalln("Invalid value for @reverse_primer_error parameter") @@ -410,6 +427,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value if err != nil { log.Fatalf("Invalid value %s for @tag_indels parameter", values[0]) } + + log.Infof("Set global maximum tag indels to %d", indels) library.SetTagIndels(indels) case 2: indels, err := strconv.Atoi(values[1]) @@ -418,6 +437,7 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @tag_indels parameter", values[1]) } + log.Infof("Set maximum tag indels for primer %s to %d", values[0], indels) library.SetTagIndelsFor(values[0], indels) } }, @@ -432,6 +452,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value if err != nil { log.Fatalf("Invalid value %s for @forward_tag_indels parameter", values[0]) } + + log.Infof("Set maximum tag indels for forward primer to %d", indels) library.SetForwardTagIndels(indels) default: log.Fatalln("Invalid value for @forward_tag_indels parameter") @@ -447,6 +469,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value if err != nil { log.Fatalf("Invalid value %s for @reverse_tag_indels parameter", values[0]) } + + log.Infof("Set maximum tag indels for reverse primer to %d", indels) library.SetReverseTagIndels(indels) default: log.Fatalln("Invalid value for @reverse_tag_indels parameter") @@ -457,8 +481,22 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @indels parameter") case 1: + + if values[0] == "true" { + log.Info("Allows indels for primer matching") + } else { + log.Info("Disallows indels for primer matching") + } + library.SetAllowsIndels(values[0] == "true") case 2: + + if values[1] == "true" { + log.Infof("Allows indels for primer matching %s", values[0]) + } else { + log.Infof("Disallows indels for primer matching %s", values[0]) + } + library.SetAllowsIndelsFor(values[0], values[1] == "true") default: log.Fatalln("Invalid value for @indels parameter") @@ -470,6 +508,12 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @forward_indels parameter") case 1: + if values[0] == "true" { + log.Info("Allows indels for forward primer matching") + } else { + log.Info("Disallows indels for forward primer matching") + } + library.SetForwardAllowsIndels(values[0] == "true") default: log.Fatalln("Invalid value for @forward_indels parameter") @@ -480,6 +524,11 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @reverse_indels parameter") case 1: + if values[0] == "true" { + log.Info("Allows indels for reverse primer matching") + } else { + log.Info("Disallows indels for reverse primer matching") + } library.SetReverseAllowsIndels(values[0] == "true") default: log.Fatalln("Invalid value for @reverse_indels parameter") diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 3bafddd..133304c 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -607,7 +607,7 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 { } } - log.Infof("Heaviest node: %d [%v]", heaviestNode, heaviestWeight) + log.Debugf("Heaviest node: %d [%v]", heaviestNode, heaviestWeight) // Reconstruct the path from the start node to the heaviest node found heaviestPath := make([]uint64, 0) currentNode := heaviestNode diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 14ad09d..9eae625 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "f265a39" +var _Commit = "4d86483" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index afdc142..10ee503 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -415,11 +415,16 @@ func (s *BioSequence) SetTaxid(taxid int) { s.SetAttribute("taxid", taxid) } -func (s *BioSequence) OBITagRefIndex() map[int]string { +func (s *BioSequence) OBITagRefIndex(slot ...string) map[int]string { + key := "obitag_ref_index" + + if len(slot) > 0 { + key = slot[0] + } var val map[int]string - i, ok := s.GetAttribute("obitag_ref_index") + i, ok := s.GetAttribute(key) if !ok { return nil diff --git a/pkg/obiseq/language.go b/pkg/obiseq/language.go index 3200d48..a79c66a 100644 --- a/pkg/obiseq/language.go +++ b/pkg/obiseq/language.go @@ -202,4 +202,8 @@ var OBILang = gval.NewLanguage( scomp[string(k)] = float64(v) } return scomp, nil - })) + }), + gval.Function("replace", func(args ...interface{}) (interface{}, error) { + return strings.ReplaceAll(args[0].(string), args[1].(string), args[2].(string)), nil + }), +) diff --git a/pkg/obitax/sequence_methods.go b/pkg/obitax/sequence_methods.go index 8623eab..384a14f 100644 --- a/pkg/obitax/sequence_methods.go +++ b/pkg/obitax/sequence_methods.go @@ -66,3 +66,27 @@ func (taxonomy *Taxonomy) SetPath(sequence *obiseq.BioSequence) string { return tpath } + +func (taxonomy *Taxonomy) SetScientificName(sequence *obiseq.BioSequence) string { + taxid, err := taxonomy.Taxon(sequence.Taxid()) + + if err != nil { + log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid()) + } + + sequence.SetAttribute("scienctific_name", taxid.ScientificName()) + + return taxid.ScientificName() +} + +func (taxonomy *Taxonomy) SetTaxonomicRank(sequence *obiseq.BioSequence) string { + taxid, err := taxonomy.Taxon(sequence.Taxid()) + + if err != nil { + log.Fatalf("Taxid %d not defined in the current taxonomy", sequence.Taxid()) + } + + sequence.SetAttribute("taxonomic_rank", taxid.Rank()) + + return taxid.Rank() +} diff --git a/pkg/obitax/sequence_predicate.go b/pkg/obitax/sequence_predicate.go index b7381b2..551a5a8 100644 --- a/pkg/obitax/sequence_predicate.go +++ b/pkg/obitax/sequence_predicate.go @@ -57,6 +57,23 @@ func (taxonomy *Taxonomy) IsSubCladeOf(taxid int) obiseq.SequencePredicate { return f } +func (taxonomy *Taxonomy) IsSubCladeOfSlot(key string) obiseq.SequencePredicate { + + f := func(sequence *obiseq.BioSequence) bool { + val, ok := sequence.GetStringAttribute(key) + + if ok { + parent, err1 := taxonomy.Taxon(val) + taxon, err2 := taxonomy.Taxon(sequence.Taxid()) + return err1 == nil && err2 == nil && taxon.IsSubCladeOf(parent) + } + + return false + } + + return f +} + func (taxonomy *Taxonomy) HasRequiredRank(rank string) obiseq.SequencePredicate { if !obiutils.Contains(taxonomy.RankList(), rank) { diff --git a/pkg/obitax/taxonomy.go b/pkg/obitax/taxonomy.go index 4d4ee65..787f513 100644 --- a/pkg/obitax/taxonomy.go +++ b/pkg/obitax/taxonomy.go @@ -2,6 +2,8 @@ package obitax import ( "fmt" + "regexp" + "strconv" ) type TaxName struct { @@ -54,11 +56,43 @@ func (taxonomy *Taxonomy) AddNewTaxa(taxid, parent int, rank string, replace boo return n, nil } -func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) { - t, ok := (*taxonomy.nodes)[taxid] +// func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) { +// t, ok := (*taxonomy.nodes)[taxid] + +// if !ok { +// a, aok := taxonomy.alias[taxid] +// if !aok { +// return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid) +// } +// t = a +// } +// return t, nil +// } + +func (taxonomy *Taxonomy) Taxon(taxid interface{}) (*TaxNode, error) { + var itaxid int + var err error + + switch v := taxid.(type) { + case int: + itaxid = v + case string: + itaxid, err = strconv.Atoi(v) + + if err != nil { + re := regexp.MustCompile(`TX:(\d+)`) + parts := re.FindStringSubmatch(v) + if len(parts) != 2 { + return nil, fmt.Errorf("I cannot parse taxid from %s", v) + } + itaxid, _ = strconv.Atoi(parts[1]) + } + } + + t, ok := (*taxonomy.nodes)[itaxid] if !ok { - a, aok := taxonomy.alias[taxid] + a, aok := taxonomy.alias[itaxid] if !aok { return nil, fmt.Errorf("Taxid %d is not part of the taxonomy", taxid) } @@ -66,7 +100,6 @@ func (taxonomy *Taxonomy) Taxon(taxid int) (*TaxNode, error) { } return t, nil } - func (taxonomy *Taxonomy) AddNewName(taxid int, name, nameclass *string) error { node, node_err := taxonomy.Taxon(taxid) if node_err != nil { diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index a3b48c8..585812c 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -215,6 +215,24 @@ func AddTaxonAtRankWorker(taxonomy *obitax.Taxonomy, ranks ...string) obiseq.Seq return f } +func AddTaxonRankWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker { + f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + taxonomy.SetTaxonomicRank(s) + return obiseq.BioSequenceSlice{s}, nil + } + + return f +} + +func AddScientificNameWorker(taxonomy *obitax.Taxonomy) obiseq.SeqWorker { + f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + taxonomy.SetScientificName(s) + return obiseq.BioSequenceSlice{s}, nil + } + + return f +} + func AddSeqLengthWorker() obiseq.SeqWorker { f := func(s *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { s.SetAttribute("seq_length", s.Len()) @@ -266,6 +284,18 @@ func CLIAnnotationWorker() obiseq.SeqWorker { annotator = annotator.ChainWorkers(w) } + if CLISetTaxonomicRank() { + taxo := obigrep.CLILoadSelectedTaxonomy() + w := AddTaxonRankWorker(taxo) + annotator = annotator.ChainWorkers(w) + } + + if CLISetScientificName() { + taxo := obigrep.CLILoadSelectedTaxonomy() + w := AddScientificNameWorker(taxo) + annotator = annotator.ChainWorkers(w) + } + if CLIHasAddLCA() { taxo := obigrep.CLILoadSelectedTaxonomy() w := obitax.AddLCAWorker(taxo, CLILCASlotName(), CLILCAThreshold()) diff --git a/pkg/obitools/obiannotate/options.go b/pkg/obitools/obiannotate/options.go index 291f74f..f7e722e 100644 --- a/pkg/obitools/obiannotate/options.go +++ b/pkg/obitools/obiannotate/options.go @@ -32,6 +32,8 @@ var _lcaError = 0.0 var _setId = "" var _cut = "" var _taxonomicPath = false +var _withRank = false +var _withScientificName = false func SequenceAnnotationOptionSet(options *getoptions.GetOpt) { // options.BoolVar(&_addRank, "seq-rank", _addRank, @@ -117,6 +119,12 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) { options.BoolVar(&_taxonomicPath, "taxonomic-path", _taxonomicPath, options.Description("Annotate the sequence with its taxonomic path")) + options.BoolVar(&_withRank, "taxonomic-rank", _withRank, + options.Description("Annotate the sequence with its taxonomic rank")) + + options.BoolVar(&_withScientificName, "scientific-name", _withScientificName, + options.Description("Annotate the sequence with its scientific name")) + // options.StringVar(&_tagList, "tag-list", _tagList, // options.ArgName("FILENAME"), // options.Description(" points to a file containing attribute names"+ @@ -307,3 +315,11 @@ func CLIPatternInDels() bool { func CLISetTaxonomicPath() bool { return _taxonomicPath } + +func CLISetTaxonomicRank() bool { + return _withRank +} + +func CLISetScientificName() bool { + return _withScientificName +} diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index cc1fd9c..544b3bd 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -52,7 +52,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } - log.Printf("Number of reads : %d\n", len(seqs)) + log.Debugf("Number of reads : %d\n", len(seqs)) if kmer_size < 0 { longest := make([]int, len(seqs)) @@ -64,7 +64,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } kmer_size = slices.Max(longest) + 1 - log.Printf("estimated kmer size : %d", kmer_size) + log.Debugf("estimated kmer size : %d", kmer_size) } var graph *obikmer.DeBruijnGraph @@ -80,7 +80,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } kmer_size++ - log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size) + log.Debugf("Cycle detected, increasing kmer size to %d\n", kmer_size) } if save_graph { @@ -96,7 +96,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } } - log.Printf("Graph size : %d\n", graph.Len()) + log.Debugf("Graph size : %d\n", graph.Len()) total_kmer := graph.Len() seq, err := graph.LongestConsensus(consensus_id) @@ -260,6 +260,21 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], sample_key string, kmer_size int) obiseq.BioSequenceSlice { denoised := obiseq.MakeBioSequenceSlice(len(*graph.Vertices)) + bar := (*progressbar.ProgressBar)(nil) + if obiconvert.CLIProgressBar() { + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription(fmt.Sprintf("[Build consensus] on %s", graph.Name)), + ) + + bar = progressbar.NewOptions(len(*graph.Vertices), pbopt...) + } + for i, v := range *graph.Vertices { var err error var clean *obiseq.BioSequence @@ -283,6 +298,7 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], clean.SetAttribute("obiconsensus_consensus", true) } pack.Recycle(false) + } else { clean = obiseq.NewBioSequence(v.Id(), v.Sequence(), v.Definition()) clean.SetAttribute("obiconsensus_consensus", false) @@ -295,7 +311,20 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation], clean.SetAttribute("obiconsensus_weight", int(1)) } + annotations := v.Annotations() + + for k, v := range annotations { + if !clean.HasAttribute(k) { + clean.SetAttribute(k, v) + } + } + denoised[i] = clean + + if bar != nil { + bar.Add(1) + } + } return denoised diff --git a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go index 5f393f4..7b5cf6c 100644 --- a/pkg/obitools/obigrep/options.go +++ b/pkg/obitools/obigrep/options.go @@ -1,6 +1,7 @@ package obigrep import ( + "strconv" "strings" log "github.com/sirupsen/logrus" @@ -13,7 +14,7 @@ import ( "github.com/DavidGamba/go-getoptions" ) -var _BelongTaxa = make([]int, 0) +var _BelongTaxa = make([]string, 0) var _NotBelongTaxa = make([]int, 0) var _RequiredRanks = make([]string, 0) @@ -48,7 +49,7 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { options.Alias("t"), options.Description("Points to the directory containing the NCBI Taxonomy database dump.")) - options.IntSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1, + options.StringSliceVar(&_BelongTaxa, "restrict-to-taxon", 1, 1, options.Alias("r"), options.ArgName("TAXID"), options.Description("Require that the actual taxon of the sequence belongs the provided taxid.")) @@ -231,13 +232,27 @@ func CLILoadSelectedTaxonomy() *obitax.Taxonomy { } func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate { + var p obiseq.SequencePredicate + var p2 obiseq.SequencePredicate if len(_BelongTaxa) > 0 { taxonomy := CLILoadSelectedTaxonomy() - p := taxonomy.IsSubCladeOf(_BelongTaxa[0]) - for _, taxid := range _BelongTaxa[1:] { - p = p.Or(taxonomy.IsSubCladeOf(taxid)) + taxid, err := strconv.Atoi(_BelongTaxa[0]) + if err != nil { + p = taxonomy.IsSubCladeOfSlot(_BelongTaxa[0]) + } else { + p = taxonomy.IsSubCladeOf(taxid) + } + for _, staxid := range _BelongTaxa[1:] { + taxid, err := strconv.Atoi(staxid) + if err != nil { + p2 = taxonomy.IsSubCladeOfSlot(staxid) + } else { + p2 = taxonomy.IsSubCladeOf(taxid) + } + + p = p.Or(p2) } return p diff --git a/pkg/obitools/obijoin/join.go b/pkg/obitools/obijoin/join.go index a8f5520..1f087ab 100644 --- a/pkg/obitools/obijoin/join.go +++ b/pkg/obitools/obijoin/join.go @@ -12,21 +12,32 @@ import ( type IndexedSequenceSlice struct { Sequences obiseq.BioSequenceSlice - Indices []map[interface{}]*obiutils.Set[int] + Indices []map[string]*obiutils.Set[int] } func (s IndexedSequenceSlice) Len() int { return len(s.Sequences) } -func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice { +func (s IndexedSequenceSlice) Get(keys ...string) *obiseq.BioSequenceSlice { var keeps obiutils.Set[int] for i, v := range s.Indices { + if i == 0 { - keeps = *v[keys[0]] + p, ok := v[keys[0]] + if !ok { + keeps = obiutils.MakeSet[int]() + break + } + keeps = *p } else { - keeps = keeps.Intersection(*v[keys[i]]) + p, ok := v[keys[i]] + if !ok { + keeps = obiutils.MakeSet[int]() + break + } + keeps = keeps.Intersection(*p) } } @@ -39,14 +50,14 @@ func (s IndexedSequenceSlice) Get(keys ...interface{}) *obiseq.BioSequenceSlice } func BuildIndexedSequenceSlice(seqs obiseq.BioSequenceSlice, keys []string) IndexedSequenceSlice { - indices := make([]map[interface{}]*obiutils.Set[int], len(keys)) + indices := make([]map[string]*obiutils.Set[int], len(keys)) for i, k := range keys { - idx := make(map[interface{}]*obiutils.Set[int]) + idx := make(map[string]*obiutils.Set[int]) for j, seq := range seqs { - if value, ok := seq.GetAttribute(k); ok { + if value, ok := seq.GetStringAttribute(k); ok { goods, ok := idx[value] if !ok { goods = obiutils.NewSet[int]() @@ -67,10 +78,10 @@ func MakeJoinWorker(by []string, index IndexedSequenceSlice, updateId, updateSeq f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { var ok bool - keys := make([]interface{}, len(by)) + keys := make([]string, len(by)) for i, v := range by { - keys[i], ok = sequence.GetAttribute(v) + keys[i], ok = sequence.GetStringAttribute(v) if !ok { return obiseq.BioSequenceSlice{sequence}, nil } diff --git a/pkg/obitools/obitag2/obitag.go b/pkg/obitools/obitag2/obitag.go new file mode 100644 index 0000000..00af2e3 --- /dev/null +++ b/pkg/obitools/obitag2/obitag.go @@ -0,0 +1,460 @@ +package obitag2 + +import ( + "sort" + "strconv" + "strings" + + log "github.com/sirupsen/logrus" + "golang.org/x/exp/maps" + "golang.org/x/exp/slices" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" +) + +type Obitag2Match struct { + Taxon *obitax.TaxNode + Id string + Weight int +} +type Obitag2RefDB struct { + Taxonomy *obitax.Taxonomy + + Full *obiseq.BioSequenceSlice + FullCounts *[]*obikmer.Table4mer + FullTaxa *obitax.TaxonSet + + Clusters *obiseq.BioSequenceSlice + ClusterCounts *[]*obikmer.Table4mer + ClusterTaxa *obitax.TaxonSet + + Families *map[int]*obiseq.BioSequenceSlice + FamilyCounts *map[int]*[]*obikmer.Table4mer + FamilyTaxa *map[int]obitax.TaxonSet + + ExactTaxid *map[string]*Obitag2Match +} + +// MatchDistanceIndex returns the taxid, rank, and scientificName based on the given distance and distanceIdx. +// +// Parameters: +// - distance: The distance to match against the keys in distanceIdx. +// - distanceIdx: A map containing distances as keys and corresponding values in the format "taxid@rank@scientificName". +// +// Returns: +// - taxid: The taxid associated with the matched distance. +// - rank: The rank associated with the matched distance. +// - scientificName: The scientific name associated with the matched distance. +func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) { + keys := maps.Keys(distanceIdx) + slices.Sort(keys) + + i := sort.Search(len(keys), func(i int) bool { + return distance <= keys[i] + }) + + var taxid int + var rank string + var scientificName string + + if i == len(keys) || distance > keys[len(keys)-1] { + taxid = 1 + rank = "no rank" + scientificName = "root" + } else { + parts := strings.Split(distanceIdx[keys[i]], "@") + taxid, _ = strconv.Atoi(parts[0]) + rank = parts[1] + scientificName = parts[2] + } + + // log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName) + + return taxid, rank, scientificName +} + +// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences. +// +// Parameters: +// - sequence: the bio sequence to find the closest matches for. +// - references: a slice of reference sequences to compare against. +// - refcounts: a slice of reference sequence counts. +// - runExact: a boolean flag indicating whether to run an exact match. +// +// Returns: +// - bests: a slice of the closest bio sequences. +// - maxe: the maximum score. +// - bestId: the best ID. +// - bestmatch: the best match. +// - bestidxs: a slice of the best indexes. +func FindClosests(sequence *obiseq.BioSequence, + references obiseq.BioSequenceSlice, + refcounts []*obikmer.Table4mer, + runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) { + + var matrix []uint64 + + seqwords := obikmer.Count4Mer(sequence, nil, nil) + cw := make([]int, len(refcounts)) + + for i, ref := range refcounts { + cw[i] = obikmer.Common4Mer(seqwords, ref) + } + + o := obiutils.Reverse(obiutils.IntOrder(cw), true) + + bests := obiseq.MakeBioSequenceSlice() + // bests = append(bests, references[o[0]]) + bestidxs := make([]int, 0) + // bestidxs = append(bestidxs, o[0]) + bestId := 0.0 + bestmatch := references[o[0]].Id() + + maxe := -1 + wordmin := 0 + + // sss := make([]int, 0, 10000) + // log.Warnf("%v\n%v\n%d", seqwords, refcounts[o[0]], obikmer.Common4Mer(seqwords, refcounts[o[0]])) + + for i, order := range o { + ref := references[order] + score := int(1e9) + + if cw[order] < wordmin || i > 1000 { + break + } + lcs, alilength := -1, -1 + switch maxe { + case 0: + if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) { + score = 0 + alilength = sequence.Len() + lcs = alilength + } + case 1: + d, _, _, _ := obialign.D1Or0(sequence, references[order]) + if d >= 0 { + score = d + alilength = max(sequence.Len(), ref.Len()) + lcs = alilength - score + } + default: + lcs, alilength = obialign.FastLCSScore(sequence, references[order], maxe, &matrix) + if lcs >= 0 { + score = alilength - lcs + } + } + + // log.Warnf("LCS : %d ALILENGTH : %d score : %d cw : %d", lcs, alilength, score, cw[order]) + if lcs >= 0 { + + // sss = append(sss, cw[order], alilength-lcs) + + // + // We have found a better candidate than never + // + if maxe == -1 || score < maxe { + bests = bests[:0] // Empty the best lists + bestidxs = bestidxs[:0] // + + maxe = score // Memorize the best scores + wordmin = max(0, max(sequence.Len(), ref.Len())-3-4*maxe) + bestId = float64(lcs) / float64(alilength) + bestmatch = ref.Id() + // log.Warnln(sequence.Id(), ref.Id(), cw[order], maxe, bestId, wordmin) + } + + if score == maxe { + bests = append(bests, ref) + bestidxs = append(bestidxs, order) + id := float64(lcs) / float64(alilength) + if id > bestId { + bestId = id + bestmatch = ref.Id() + } + // log.Println(ref.Id(), maxe, bestId,bestidxs) + } + + } + + } + // sequence.SetAttribute("maxe", sss) + log.Debugln("Closest Match", sequence.Id(), maxe, bestId, references[bestidxs[0]].Id(), bestidxs, len(bests)) + return bests, maxe, bestId, bestmatch, bestidxs +} + +func (db *Obitag2RefDB) BestConsensus(bests obiseq.BioSequenceSlice, differences int, slot string) *obitax.TaxNode { + taxon := (*obitax.TaxNode)(nil) + + for _, best := range bests { + idx := best.OBITagRefIndex(slot) + if idx == nil { + log.Fatalf("Reference database must be indexed first") + } + + d := differences + identification, ok := idx[d] + found := false + var parts []string + + /* + Here is an horrible hack for xprize challence. + With Euka01 the part[0] was equal to "" for at + least a sequence consensus. Which is not normal. + + TO BE CHECKED AND CORRECTED + + The problem seems related to idx that doesn't have + a 0 distance + */ + for !found && d >= 0 { + for !ok && d >= 0 { + identification, ok = idx[d] + d-- + } + + parts = strings.Split(identification, "@") + + found = parts[0] != "" + if !found { + log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) + for !ok && d <= 1000 { + identification, ok = idx[d] + d++ + } + + } + } + + match_taxid, err := strconv.Atoi(parts[0]) + + if err != nil { + log.Panicln("Cannot extract taxid from :", identification) + } + + match_taxon, err := db.Taxonomy.Taxon(match_taxid) + + if err != nil { + log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid) + } + + if taxon != nil { + taxon, _ = taxon.LCA(match_taxon) + } else { + taxon = match_taxon + } + + } + + return taxon +} + +// Identify makes the taxonomic identification of a BioSequence. +// +// Parameters: +// - sequence: A pointer to a BioSequence to identify. +// - references: A BioSequenceSlice. +// - refcounts: A slice of pointers to Table4mer. +// - taxa: A TaxonSet. +// - taxo: A pointer to a Taxonomy. +// - runExact: A boolean value indicating whether to run exact matching. +// +// Returns: +// - A pointer to a BioSequence. +func Identify(sequence *obiseq.BioSequence, + db *Obitag2RefDB, + runExact bool) *obiseq.BioSequence { + + identity := 1.0 + differences := 0 + method := "exact match" + weight := 0 + + var bestmatch string + var taxon *obitax.TaxNode + + exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())] + if ok { + taxon = exacttaxon.Taxon + bestmatch = exacttaxon.Id + weight = exacttaxon.Weight + } else { + var bests obiseq.BioSequenceSlice + bests, differences, identity, bestmatch, _ = FindClosests(sequence, *db.Clusters, *db.ClusterCounts, runExact) + weight = bests.Len() /* Should be the some of the count */ + ftaxon := (*obitax.TaxNode)(nil) + + if identity >= 0.5 && differences >= 0 { + ftaxon = db.BestConsensus(bests, differences, "obitag_ref_index") + + fam := ftaxon.TaxonAtRank("family") + + if fam != nil { + ftaxid := fam.Taxid() + bests, differences, identity, bestmatch, _ = FindClosests( + sequence, + *(*db.Families)[ftaxid], + *(*db.FamilyCounts)[ftaxid], + runExact, + ) + sequence.SetAttribute("obitag_proposed_family", fam.ScientificName()) + ftaxon = db.BestConsensus(bests, differences, "reffamidx_in") + + } + + } else { + // Cannot match any taxon + ftaxon, _ = db.Taxonomy.Taxon(1) + sequence.SetTaxid(1) + } + + taxon = ftaxon + method = "lcsfamlily" + } + + sequence.SetTaxid(taxon.Taxid()) + + sequence.SetAttribute("scientific_name", taxon.ScientificName()) + sequence.SetAttribute("obitag_rank", taxon.Rank()) + sequence.SetAttribute("obitag_bestid", identity) + sequence.SetAttribute("obitag_bestmatch", bestmatch) + sequence.SetAttribute("obitag_match_count", weight) + sequence.SetAttribute("obitag_similarity_method", method) + + return sequence +} + +func IdentifySeqWorker(db *Obitag2RefDB, runExact bool) obiseq.SeqWorker { + return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + return obiseq.BioSequenceSlice{Identify(sequence, db, runExact)}, nil + } +} + +func CLIAssignTaxonomy(iterator obiiter.IBioSequence, + references obiseq.BioSequenceSlice, + taxo *obitax.Taxonomy, +) obiiter.IBioSequence { + + refcounts := make( + []*obikmer.Table4mer, + len(references)) + + taxa := make(obitax.TaxonSet, + len(references)) + + familydb := make(map[int]*obiseq.BioSequenceSlice) + familycounts := make(map[int]*[]*obikmer.Table4mer) + familytaxa := make(map[int]obitax.TaxonSet) + clusterdb := obiseq.MakeBioSequenceSlice() + clustercounts := make([]*obikmer.Table4mer, 0) + clustertaxa := make(obitax.TaxonSet, 0) + exactmatch := make(map[string]*obiseq.BioSequenceSlice) + + buffer := make([]byte, 0, 1000) + + j := 0 + for i, seq := range references { + refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) + taxa[i], _ = taxo.Taxon(seq.Taxid()) + + if is_centrer, _ := seq.GetBoolAttribute("reffamidx_clusterhead"); is_centrer { + clusterdb = append(clusterdb, seq) + clustercounts = append(clustercounts, refcounts[i]) + clustertaxa[j] = taxa[i] + j++ + } + + family, _ := seq.GetIntAttribute("family_taxid") + + fs, ok := familydb[family] + if !ok { + fs = obiseq.NewBioSequenceSlice(0) + familydb[family] = fs + } + + *fs = append(*fs, seq) + + fc, ok := familycounts[family] + + if !ok { + fci := make([]*obikmer.Table4mer, 0) + fc = &fci + familycounts[family] = fc + } + + *fc = append(*fc, refcounts[i]) + + ft, ok := familytaxa[family] + + if !ok { + ft = make(obitax.TaxonSet, 0) + familytaxa[family] = ft + } + + ft[len(ft)] = taxa[i] + + seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence()) + em, ok := exactmatch[seqstr] + + if !ok { + em = obiseq.NewBioSequenceSlice() + exactmatch[seqstr] = em + } + + *em = append(*em, seq) + + } + + exacttaxid := make(map[string]*Obitag2Match, len(exactmatch)) + for seqstr, seqs := range exactmatch { + var err error + t, _ := taxo.Taxon((*seqs)[0].Taxid()) + w := (*seqs)[0].Count() + lseqs := seqs.Len() + + for i := 1; i < lseqs; i++ { + t2, _ := taxo.Taxon((*seqs)[i].Taxid()) + t, err = t.LCA(t2) + + if err != nil { + log.Panic(err) + } + + w += (*seqs)[i].Count() + } + + exacttaxid[seqstr] = &Obitag2Match{ + Taxon: t, + Id: (*seqs)[0].Id(), + Weight: w, + } + } + + db := &Obitag2RefDB{ + Taxonomy: taxo, + + Full: &references, + FullCounts: &refcounts, + FullTaxa: &taxa, + + Clusters: &clusterdb, + ClusterCounts: &clustercounts, + ClusterTaxa: &clustertaxa, + + Families: &familydb, + FamilyCounts: &familycounts, + FamilyTaxa: &familytaxa, + + ExactTaxid: &exacttaxid, + } + + worker := IdentifySeqWorker(db, CLIRunExact()) + + return iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers(), 0) +} diff --git a/pkg/obitools/obitag2/options.go b/pkg/obitools/obitag2/options.go new file mode 100644 index 0000000..74bdc58 --- /dev/null +++ b/pkg/obitools/obitag2/options.go @@ -0,0 +1,120 @@ +package obitag2 + +import ( + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind" + "github.com/DavidGamba/go-getoptions" +) + +var _RefDB = "" +var _SaveRefDB = "" +var _RunExact = false +var _GeomSim = false + +func TagOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_RefDB, "reference-db", _RefDB, + options.Alias("R"), + options.Required(), + options.ArgName("FILENAME"), + options.Description("The name of the file containing the reference DB")) + + options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB, + options.ArgName("FILENAME"), + options.Description("The name of a file where to save the reference DB with its indices")) + + options.BoolVar(&_GeomSim, "geometric", _GeomSim, + options.Alias("G"), + options.Description("Activate the experimental geometric similarity heuristic")) + + // options.BoolVar(&_RunExact, "exact", _RunExact, + // options.Alias("E"), + // options.Description("Desactivate the heuristic limitating the sequence comparisons")) + +} + +// OptionSet adds to the basic option set every options declared for +// the obiuniq command +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + obifind.LoadTaxonomyOptionSet(options, true, false) + TagOptionSet(options) +} + +func CLIRefDBName() string { + return _RefDB +} + +func CLIRefDB() obiseq.BioSequenceSlice { + refdb, err := obiformats.ReadSequencesFromFile(_RefDB) + + if err != nil { + log.Panicf("Cannot open the reference library file : %s\n", _RefDB) + } + + return refdb.Load() +} + +func CLIGeometricMode() bool { + return _GeomSim +} + +func CLIShouldISaveRefDB() bool { + return _SaveRefDB != "" +} + +func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) { + if CLIShouldISaveRefDB() { + idb := obiiter.IBatchOver(db, 1000) + + var newIter obiiter.IBioSequence + + opts := make([]obiformats.WithOption, 0, 10) + + switch obiconvert.CLIOutputFastHeaderFormat() { + case "json": + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader)) + case "obi": + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader)) + default: + opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader)) + } + + nworkers := obioptions.CLIParallelWorkers() / 4 + if nworkers < 2 { + nworkers = 2 + } + + opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) + opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize())) + + opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed())) + + var err error + + switch obiconvert.CLIOutputFormat() { + case "fastq": + newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...) + case "fasta": + newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...) + default: + newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...) + } + + if err != nil { + log.Fatalf("Write file error: %v", err) + } + + newIter.Recycle() + obiiter.WaitForLastPipe() + } +} + +func CLIRunExact() bool { + return _RunExact +} diff --git a/pkg/obiutils/counter.go b/pkg/obiutils/counter.go new file mode 100644 index 0000000..b7e4a45 --- /dev/null +++ b/pkg/obiutils/counter.go @@ -0,0 +1,42 @@ +package obiutils + +import "sync" + +type Counter struct { + Inc func() int + Dec func() int + Value func() int +} + +func NewCounter(initial ...int) *Counter { + value := 0 + lock := sync.Mutex{} + + if len(initial) > 0 { + value = initial[0] + } + + c := Counter{ + Inc: func() int { + lock.Lock() + defer lock.Unlock() + value++ + return value + }, + + Dec: func() int { + lock.Lock() + defer lock.Unlock() + value-- + return value + }, + + Value: func() int { + lock.Lock() + defer lock.Unlock() + return value + }, + } + + return &c +} diff --git a/pkg/obiutils/strings.go b/pkg/obiutils/strings.go new file mode 100644 index 0000000..fc3d576 --- /dev/null +++ b/pkg/obiutils/strings.go @@ -0,0 +1,12 @@ +package obiutils + +import "unsafe" + +func UnsafeStringFreomBytes(data []byte) string { + if len(data) > 0 { + s := unsafe.String(unsafe.SliceData(data), len(data)) + return s + } + + return "" +}