Changes to be committed:

modified:   cmd/obitools/obitag/main.go
	modified:   cmd/obitools/obitag2/main.go
	modified:   go.mod
	modified:   go.sum
	modified:   pkg/obiformats/ncbitaxdump/read.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obiseq/attributes.go
	modified:   pkg/obiseq/taxonomy_lca.go
	modified:   pkg/obiseq/taxonomy_methods.go
	modified:   pkg/obiseq/taxonomy_predicate.go
	modified:   pkg/obitax/inner.go
	modified:   pkg/obitax/lca.go
	new file:   pkg/obitax/taxid.go
	modified:   pkg/obitax/taxon.go
	modified:   pkg/obitax/taxonomy.go
	modified:   pkg/obitax/taxonslice.go
	modified:   pkg/obitools/obicleandb/obicleandb.go
	modified:   pkg/obitools/obigrep/options.go
	modified:   pkg/obitools/obilandmark/obilandmark.go
	modified:   pkg/obitools/obilandmark/options.go
	modified:   pkg/obitools/obirefidx/famlilyindexing.go
	modified:   pkg/obitools/obirefidx/geomindexing.go
	modified:   pkg/obitools/obirefidx/obirefidx.go
	modified:   pkg/obitools/obirefidx/options.go
	modified:   pkg/obitools/obitag/obigeomtag.go
	modified:   pkg/obitools/obitag/obitag.go
	modified:   pkg/obitools/obitag/options.go
	modified:   pkg/obiutils/strings.go
This commit is contained in:
Eric Coissac
2024-12-19 13:36:59 +01:00
parent f41a6fbb60
commit 795df34d1a
28 changed files with 590 additions and 280 deletions

View File

@@ -250,22 +250,26 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
if len(obigrep.CLIRequiredRanks()) > 0 {
rankPredicate = obigrep.CLIHasRankDefinedPredicate()
} else {
rankPredicate = taxonomy.HasRequiredRank("species").And(taxonomy.HasRequiredRank("genus")).And(taxonomy.HasRequiredRank("family"))
rankPredicate = obiseq.HasRequiredRank(
taxonomy, "species",
).And(obiseq.HasRequiredRank(
taxonomy, "genus",
)).And(obiseq.HasRequiredRank(taxonomy, "family"))
}
goodTaxa := taxonomy.IsAValidTaxon(CLIUpdateTaxids()).And(rankPredicate)
goodTaxa := obiseq.IsAValidTaxon(taxonomy, CLIUpdateTaxids()).And(rankPredicate)
usable := unique.FilterOn(goodTaxa,
obioptions.CLIBatchSize(),
obioptions.CLIParallelWorkers())
annotated := usable.MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
annotated := usable.MakeIWorker(obiseq.MakeSetSpeciesWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetGenusWorker(),
).MakeIWorker(obiseq.MakeSetGenusWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
).MakeIWorker(obiseq.MakeSetFamilyWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
)

View File

@@ -1,7 +1,6 @@
package obigrep
import (
"strconv"
"strings"
log "github.com/sirupsen/logrus"
@@ -16,7 +15,7 @@ import (
)
var _BelongTaxa = make([]string, 0)
var _NotBelongTaxa = make([]int, 0)
var _NotBelongTaxa = make([]string, 0)
var _RequiredRanks = make([]string, 0)
var _MinimumLength = 1
@@ -60,7 +59,7 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.ArgName("TAXID"),
options.Description("Require that the actual taxon of the sequence belongs the provided taxid."))
options.IntSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1,
options.StringSliceVar(&_NotBelongTaxa, "ignore-taxon", 1, 1,
options.Alias("i"),
options.ArgName("TAXID"),
options.Description("Require that the actual taxon of the sequence doesn't belong the provided taxid."))
@@ -273,18 +272,18 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
if len(_BelongTaxa) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
taxid, err := strconv.Atoi(_BelongTaxa[0])
if err != nil {
p = taxonomy.IsSubCladeOfSlot(_BelongTaxa[0])
taxon := taxonomy.Taxon(_BelongTaxa[0])
if taxon == nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0])
} else {
p = taxonomy.IsSubCladeOf(taxid)
p = obiseq.IsSubCladeOf(taxonomy, taxon)
}
for _, staxid := range _BelongTaxa[1:] {
taxid, err := strconv.Atoi(staxid)
if err != nil {
p2 = taxonomy.IsSubCladeOfSlot(staxid)
taxon := taxonomy.Taxon(staxid)
if taxon == nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid)
} else {
p2 = taxonomy.IsSubCladeOf(taxid)
p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
}
p = p.Or(p2)
@@ -297,13 +296,28 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
}
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
var p obiseq.SequencePredicate
var p2 obiseq.SequencePredicate
if len(_NotBelongTaxa) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := taxonomy.IsSubCladeOf(_NotBelongTaxa[0])
taxon := taxonomy.Taxon(_NotBelongTaxa[0])
if taxon == nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0])
} else {
p = obiseq.IsSubCladeOf(taxonomy, taxon)
}
for _, taxid := range _NotBelongTaxa[1:] {
p = p.Or(taxonomy.IsSubCladeOf(taxid))
taxon := taxonomy.Taxon(taxid)
if taxon == nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid)
} else {
p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
}
p = p.Or(p2)
}
return p.Not()
@@ -316,10 +330,10 @@ func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
if len(_RequiredRanks) > 0 {
taxonomy := CLILoadSelectedTaxonomy()
p := taxonomy.HasRequiredRank(_RequiredRanks[0])
p := obiseq.HasRequiredRank(taxonomy, _RequiredRanks[0])
for _, rank := range _RequiredRanks[1:] {
p = p.And(taxonomy.HasRequiredRank(rank))
p = p.And(obiseq.HasRequiredRank(taxonomy, rank))
}
return p

View File

@@ -11,7 +11,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3"
@@ -155,19 +154,20 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
}
}
if obifind.CLIHasSelectedTaxonomy() {
taxo, err := obifind.CLILoadSelectedTaxonomy()
if err != nil {
log.Fatal(err)
if obioptions.CLIHasSelectedTaxonomy() {
taxo := obitax.DefaultTaxonomy()
if taxo == nil {
log.Fatal("No taxonomy available")
}
taxa := make(obitax.TaxonSet, len(library))
taxa := obitax.DefaultTaxonomy().NewTaxonSlice(len(library), len(library))
for i, seq := range library {
taxa[i], err = taxo.Taxon(seq.Taxid())
if err != nil {
log.Fatal(err)
taxon := seq.Taxon(taxo)
if taxon == nil {
log.Fatal("%s: Cannot identify taxid %s in %s", seq.Id(), seq.Taxid(), taxo.Name())
}
taxa.Set(i, taxon)
}
pbopt := make([]progressbar.Option, 0, 5)
@@ -182,7 +182,7 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
bar := progressbar.NewOptions(len(library), pbopt...)
for i, seq := range library {
idx := obirefidx.GeomIndexSesquence(i, library, &taxa, taxo)
idx := obirefidx.GeomIndexSesquence(i, library, taxa, taxo)
seq.SetOBITagGeomRefIndex(idx)
if i%10 == 0 {

View File

@@ -1,8 +1,8 @@
package obilandmark
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
@@ -25,7 +25,7 @@ func LandmarkOptionSet(options *getoptions.GetOpt) {
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
obifind.LoadTaxonomyOptionSet(options, false, false)
obioptions.LoadTaxonomyOptionSet(options, false, false)
LandmarkOptionSet(options)
}

View File

@@ -13,7 +13,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
)
@@ -73,9 +72,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string,
[]*obikmer.Table4mer,
len(sequences))
taxa := make(
obitax.TaxonSet,
len(sequences))
taxa := taxonomy.NewTaxonSlice(sequences.Len(), sequences.Len())
for i, seq := range sequences {
j, ok := seq.GetIntAttribute(idslot)
@@ -85,8 +82,11 @@ func MakeIndexingSliceWorker(indexslot, idslot string,
}
kmercounts[i] = (*kmers)[j]
taxa[i], _ = taxonomy.Taxon(seq.Taxid())
taxon := seq.Taxon(taxonomy)
if taxon == nil {
log.Panicf("%s: Cannot get the taxon %s in %s", seq.Id(), seq.Taxid(), taxonomy.Name())
}
taxa.Set(i, taxon)
}
@@ -103,7 +103,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string,
f := func() {
for l := range limits {
for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, sequences, &kmercounts, &taxa, taxonomy)
idx := IndexSequence(i, sequences, &kmercounts, taxa, taxonomy)
sequences[i].SetAttribute(indexslot, idx)
}
}
@@ -134,7 +134,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
nref := len(references)
log.Infof("Done. Database contains %d sequences", nref)
taxonomy, error := obifind.CLILoadSelectedTaxonomy()
taxonomy, error := obioptions.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
@@ -155,13 +155,13 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Info("done")
partof := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()).MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
obioptions.CLIBatchSize()).MakeIWorker(obiseq.MakeSetSpeciesWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetGenusWorker(),
).MakeIWorker(obiseq.MakeSetGenusWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
).MakeIWorker(taxonomy.MakeSetFamilyWorker(),
).MakeIWorker(obiseq.MakeSetFamilyWorker(taxonomy),
false,
obioptions.CLIParallelWorkers(),
)
@@ -187,14 +187,20 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
clusters := obiseq.MakeBioSequenceSlice(0)
kcluster := make([]*obikmer.Table4mer, 0)
taxa := make(obitax.TaxonSet, 0)
taxa := taxonomy.NewTaxonSlice(references.Len(), references.Len())
j := 0
for i, seq := range references {
if is_centrer, _ := seq.GetBoolAttribute("reffamidx_clusterhead"); is_centrer {
clusters = append(clusters, seq)
kcluster = append(kcluster, refcounts[i])
taxa[j], _ = taxonomy.Taxon(seq.Taxid())
taxon := seq.Taxon(taxonomy)
if taxon == nil {
log.Panicf("%s: Cannot get the taxon %s in %s", seq.Id(), seq.Taxid(), taxonomy.Name())
}
taxa.Set(j, taxon)
j++
}
}
@@ -225,7 +231,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
f := func() {
for l := range limits {
for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, clusters, &kcluster, &taxa, taxonomy)
idx := IndexSequence(i, clusters, &kcluster, taxa, taxonomy)
clusters[i].SetOBITagRefIndex(idx)
bar.Add(1)
}

View File

@@ -2,10 +2,11 @@ package obirefidx
import (
"fmt"
log "github.com/sirupsen/logrus"
"sort"
"sync"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
@@ -15,7 +16,7 @@ import (
func GeomIndexSesquence(seqidx int,
references obiseq.BioSequenceSlice,
taxa *obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
@@ -56,28 +57,28 @@ func GeomIndexSesquence(seqidx int,
order := obiutils.Order(sort.IntSlice(seq_dist))
lca := (*taxa)[seqidx]
lca := taxa.Taxon(seqidx)
index := make(map[int]string)
index[0] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),
lca.ScientificName(),
lca.Rank())
"%s@%s",
lca.String(),
lca.Rank(),
)
for _, o := range order {
new_lca, _ := lca.LCA((*taxa)[o])
if new_lca.Taxid() != lca.Taxid() {
new_lca, _ := lca.LCA(taxa.Taxon(o))
if new_lca.SameAs(lca) {
lca = new_lca
index[int(seq_dist[o])] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),
lca.ScientificName(),
lca.Rank())
}
"%s@%s",
lca.String(),
lca.Rank(),
)
if lca.Taxid() == 1 {
break
if lca.IsRoot() {
break
}
}
}

View File

@@ -12,102 +12,171 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3"
)
// IndexSequence processes a biological sequence and indexes it based on taxonomic information.
// It computes the least common ancestors (LCA) for the sequence and its references,
// evaluates common k-mers, and calculates differences in alignment scores.
//
// Parameters:
// - seqidx: The index of the sequence to process within the references slice.
// - references: A slice of biological sequences to compare against.
// - kmers: A pointer to a slice of k-mer tables used for common k-mer calculations.
// - taxa: A slice of taxonomic information corresponding to the sequences.
// - taxo: A taxonomy object used for LCA calculations.
//
// Returns:
//
// A map where the keys are integers representing alignment differences,
// and the values are strings formatted as "Taxon@Rank" indicating the taxonomic
// classification of the sequence based on the computed differences.
func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice,
kmers *[]*obikmer.Table4mer,
taxa *obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy) map[int]string {
// Retrieve the biological sequence at the specified index from the references slice.
sequence := references[seqidx]
seq_len := sequence.Len()
// Get the taxon corresponding to the current sequence index.
tseq := taxa.Taxon(seqidx)
// Get the taxonomic path for the current sequence.
pseq := tseq.Path()
path_len := pseq.Len()
// For each taxonomic ancestor in the path, a biosequence slice is created to store
// the reference sequences having that ancestor as their LCA with the current sequence.
refs := make(map[*obitax.TaxNode]*[]int, path_len)
for i := 0; i < path_len; i++ {
temp := make([]int, 0, 100)
refs[pseq.Taxon(i).Node] = &temp
}
// log.Infof("%s length of path: %d", sequence.Id(), len(refs))
n := taxa.Len()
lcaCache := make(map[*obitax.TaxNode]*obitax.TaxNode, n)
for i := 0; i < n; i++ {
taxon := taxa.Taxon(i) // Get the taxon at index i.
// Compute the LCA between the current taxon and the taxon of the sequence.
node, ok := lcaCache[taxon.Node]
if !ok {
lca, err := tseq.LCA(taxon)
if err != nil {
// Log a fatal error if the LCA computation fails, including the taxon details.
log.Fatalf("(%s,%s): %+v", tseq.String(), taxon.String(), err)
}
node = lca.Node
lcaCache[taxon.Node] = node
}
// log.Infof("%s Processing taxon: %s x %s -> %s", sequence.Id(), tseq.String(), taxon.String(), node.String(taxo.Code()))
// Append the current sequence to the LCA's reference sequence slice.
*refs[node] = append(*refs[node], i)
}
closest := make([]int, path_len)
closest[0] = 0
// Initialize a matrix to store alignment scores
var matrix []uint64
lca := make(obitax.TaxonSet, len(references))
tseq := (*taxa)[seqidx]
// log.Warnf("%s : %s", sequence.Id(), pseq.String())
for idx_path := 1; idx_path < path_len; idx_path++ {
mini := -1
seqidcs := refs[pseq.Taxon(idx_path).Node]
for i, taxon := range *taxa {
lca[i], _ = tseq.LCA(taxon)
}
ns := len(*seqidcs)
cw := make([]int, len(references))
sw := (*kmers)[seqidx]
for i, ref := range *kmers {
cw[i] = obikmer.Common4Mer(sw, ref)
}
if ns > 0 {
ow := obiutils.Reverse(obiutils.IntOrder(cw), true)
pseq, _ := tseq.Path()
obiutils.Reverse(*pseq, true)
// score := make([]int, len(references))
mindiff := make([]int, len(*pseq))
/*
nseq := make([]int, len(*pseq))
nali := make([]int, len(*pseq))
nok := make([]int, len(*pseq))
nfast := make([]int, len(*pseq))
nfastok := make([]int, len(*pseq))
*/
shared := make([]int, ns)
lseq := sequence.Len()
for j, is := range *seqidcs {
shared[j] = obikmer.Common4Mer((*kmers)[seqidx], (*kmers)[is])
}
mini := -1
wordmin := 0
ow := obiutils.Reverse(obiutils.IntOrder(shared), true)
for _, order := range ow {
is := (*seqidcs)[order]
suject := references[is]
for i, ancestor := range *pseq {
for _, order := range ow {
if lca[order] == ancestor {
// nseq[i]++
if mini != -1 {
wordmin = max(sequence.Len(), references[order].Len()) - 3 - 4*mini
}
if cw[order] < wordmin {
break
wordmin := max(seq_len, suject.Len()) - 3 - 4*mini
// If the common k-mer count for the current order is less than the
// minimum word length, break the loop.
if shared[order] < wordmin {
break
}
}
// Initialize variables for Longest Common Subsequence (LCS) score and alignment length.
lcs, alilength := -1, -1
errs := int(1e9)
if mini != -1 && mini <= 1 {
// nfast[i]++
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
errs = d
// nfastok[i]++
errs := int(1e9) // Initialize errors to a large number.
// If mini is set and less than or equal to 1, perform a specific alignment.
if mini == 0 || mini == 1 {
// Perform a specific alignment and get the distance.
d, _, _, _ := obialign.D1Or0(sequence, suject)
if d >= 0 { // If the distance is valid (non-negative).
errs = d // Update errors with the distance.
}
} else {
// nali[i]++
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
// nok[i]++
errs = alilength - lcs
// Perform a Fast LCS score calculation for the sequence and reference.
lcs, alilength = obialign.FastLCSScore(sequence, suject, mini, &matrix)
if lcs >= 0 { // If LCS score is valid (non-negative).
errs = alilength - lcs // Calculate errors based on alignment length.
}
}
// Update mini with the minimum errors found.
if mini == -1 || errs < mini {
mini = errs
}
if mini == 0 {
// log.Warnf("%s: %s", sequence.Id(), sequence.String())
// log.Warnf("%s: %s", suject.Id(), suject.String())
break
}
}
if mini == -1 {
log.Fatalf("(%s,%s): No alignment found.", sequence.Id(), pseq.Taxon(idx_path).String())
}
closest[idx_path] = mini
// insure than closest is strictly increasing
for k := idx_path - 1; k >= 0 && mini < closest[k]; k-- {
closest[k] = mini
// log.Warnf("(%s,%s) Smaller alignment found than previous (%d,%d). Resetting closest.", sequence.Id(), pseq.Taxon(idx_path).String(), mini, closest[k])
}
} else {
closest[idx_path] = seq_len
}
mindiff[i] = mini
}
obitag_index := make(map[int]string, len(*pseq))
obitag_index := make(map[int]string, pseq.Len())
old := lseq
for i, d := range mindiff {
if d != -1 && d < old {
current_taxid := (*pseq)[i]
// log.Warnf("(%s,%s): %v", sequence.Id(), pseq.Taxon(0).String(), closest)
for i, d := range closest {
if i < (len(closest)-1) && d < closest[i+1] {
current_taxon := pseq.Taxon(i)
obitag_index[d] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
old = d
"%s@%s",
current_taxon.String(),
current_taxon.Rank(),
)
}
}
@@ -128,22 +197,21 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
source, references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy()
taxo, error := obioptions.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
log.Infoln("Indexing sequence taxids...")
taxa := make(
obitax.TaxonSet,
len(references))
n := len(references)
taxa := taxo.NewTaxonSlice(n, n)
j := 0
for i, seq := range references {
taxon, err := taxo.Taxon(seq.Taxid())
if err == nil {
taxa[j] = taxon
taxon := seq.Taxon(taxo)
if taxon != nil {
taxa.Set(j, taxon)
references[j] = references[i]
j++
}
@@ -198,7 +266,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for l := range limits {
sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
idx := IndexSequence(i, references, &refcounts, taxa, taxo)
iref := references[i].Copy()
iref.SetOBITagRefIndex(idx)
sl = append(sl, iref)

View File

@@ -1,8 +1,8 @@
package obirefidx
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
@@ -10,5 +10,5 @@ import (
// the obiuniq command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options, true, false)
obioptions.LoadTaxonomyOptionSet(options, true, false)
}

View File

@@ -48,19 +48,18 @@ func ExtractLandmarkSeqs(references *obiseq.BioSequenceSlice) *obiseq.BioSequenc
// - taxonomy: a pointer to a Taxonomy object.
//
// The function returns a pointer to a TaxonSet, which is a set of taxa.
func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSet {
var err error
taxa := make(obitax.TaxonSet, len(*references))
func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSlice {
taxa := taxonomy.NewTaxonSlice(references.Len(), references.Len())
for i, ref := range *references {
taxid := ref.Taxid()
taxa[i], err = taxonomy.Taxon(taxid)
if err != nil {
log.Panicf("Taxid %d, for sequence %s not found in taxonomy", taxid, ref.Id())
taxon := ref.Taxon(taxonomy)
if taxon == nil {
log.Panicf("%s: Cannot get the taxon %s in %s", ref.Id(), ref.Taxid(), taxonomy.Name())
}
taxa.Set(i, taxon)
}
return &taxa
return taxa
}
// MapOnLandmarkSequences calculates the coordinates of landmarks on a given sequence.
@@ -149,29 +148,26 @@ func FindGeomClosest(sequence *obiseq.BioSequence,
func GeomIdentify(sequence *obiseq.BioSequence,
landmarks *obiseq.BioSequenceSlice,
references *obiseq.BioSequenceSlice,
taxa *obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy,
buffer *[]uint64) *obiseq.BioSequence {
best_seq, min_dist, best_id, query_location, matches := FindGeomClosest(sequence, landmarks, references, buffer)
taxon := (*obitax.TaxNode)(nil)
taxon := (*obitax.Taxon)(nil)
var err error
if best_id > 0.5 {
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[0].OBITagGeomRefIndex())
taxon, _ = taxo.Taxon(taxid)
taxon = MatchDistanceIndex(taxo, min_dist, (*matches)[0].OBITagGeomRefIndex())
for i := 1; i < len(*matches); i++ {
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[i].OBITagGeomRefIndex())
newTaxon, _ := taxo.Taxon(taxid)
newTaxon := MatchDistanceIndex(taxo, min_dist, (*matches)[i].OBITagGeomRefIndex())
taxon, err = newTaxon.LCA(taxon)
if err != nil {
log.Panicf("LCA error: %v", err)
}
}
sequence.SetTaxid(taxon.Taxid())
sequence.SetTaxon(taxon)
} else {
taxon, _ = taxo.Taxon(1)
sequence.SetTaxid(1)
sequence.SetTaxon(taxo.Root())
}
sequence.SetAttribute("scientific_name", taxon.ScientificName())

View File

@@ -2,8 +2,6 @@ package obitag
import (
"sort"
"strconv"
"strings"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/maps"
@@ -29,7 +27,9 @@ import (
// - taxid: The taxid associated with the matched distance.
// - rank: The rank associated with the matched distance.
// - scientificName: The scientific name associated with the matched distance.
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map[int]string) *obitax.Taxon {
taxonomy = taxonomy.OrDefault(true)
keys := maps.Keys(distanceIdx)
slices.Sort(keys)
@@ -37,24 +37,20 @@ func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string,
return distance <= keys[i]
})
var taxid int
var rank string
var scientificName string
var taxon *obitax.Taxon
if i == len(keys) || distance > keys[len(keys)-1] {
taxid = 1
rank = "no rank"
scientificName = "root"
taxon = taxonomy.Root()
} else {
parts := strings.Split(distanceIdx[keys[i]], "@")
taxid, _ = strconv.Atoi(parts[0])
rank = parts[1]
scientificName = parts[2]
taxon = taxonomy.Taxon(distanceIdx[keys[i]])
if taxon == nil {
log.Panicf("Cannot identify taxon %s in %s", distanceIdx[keys[i]], taxonomy.Name())
}
}
// log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName)
return taxid, rank, scientificName
return taxon
}
// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences.
@@ -169,12 +165,12 @@ func FindClosests(sequence *obiseq.BioSequence,
func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy,
runExact bool) *obiseq.BioSequence {
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
taxon := (*obitax.TaxNode)(nil)
taxon := (*obitax.Taxon)(nil)
if identity >= 0.5 && differences >= 0 {
newidx := 0
@@ -183,56 +179,24 @@ func Identify(sequence *obiseq.BioSequence,
if idx == nil {
// log.Debugln("Need of indexing")
newidx++
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, taxa, taxo)
references[seqidxs[i]].SetOBITagRefIndex(idx)
log.Debugln(references[seqidxs[i]].Id(), idx)
}
d := differences
identification, ok := idx[d]
found := false
var parts []string
/*
Here is an horrible hack for xprize challence.
With Euka01 the part[0] was equal to "" for at
least a sequence consensus. Which is not normal.
TO BE CHECKED AND CORRECTED
The problem seems related to idx that doesn't have
a 0 distance
*/
for !found && d >= 0 {
for !ok && d >= 0 {
identification, ok = idx[d]
d--
}
parts = strings.Split(identification, "@")
found = parts[0] != ""
if !found {
log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
for !ok && d <= 1000 {
identification, ok = idx[d]
d++
}
}
for !ok && d >= 0 {
d--
identification, ok = idx[d]
}
match_taxid, err := strconv.Atoi(parts[0])
if err != nil {
log.Panicln("Cannot extract taxid from :", identification)
if !ok {
log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
}
match_taxon, err := taxo.Taxon(match_taxid)
if err != nil {
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
}
match_taxon := taxo.Taxon(identification)
if taxon != nil {
taxon, _ = taxon.LCA(match_taxon)
@@ -241,16 +205,16 @@ func Identify(sequence *obiseq.BioSequence,
}
}
log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx)
sequence.SetTaxid(taxon.Taxid())
sequence.SetTaxon(taxon)
} else {
taxon, _ = taxo.Taxon(1)
sequence.SetTaxid(1)
taxon = taxo.Root()
sequence.SetTaxon(taxon)
}
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
@@ -262,7 +226,7 @@ func Identify(sequence *obiseq.BioSequence,
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxa obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy,
runExact bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
@@ -279,23 +243,21 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
[]*obikmer.Table4mer,
len(references))
taxa := make(obitax.TaxonSet,
len(references))
taxa := taxo.NewTaxonSlice(references.Len(), references.Len())
buffer := make([]byte, 0, 1000)
var err error
j := 0
for _, seq := range references {
references[j] = seq
refcounts[j] = obikmer.Count4Mer(seq, &buffer, nil)
taxa[j], err = taxo.Taxon(seq.Taxid())
if err == nil {
taxon := seq.Taxon(taxo)
taxa.Set(j, taxon)
if taxon != nil {
j++
} else {
log.Warnf("Taxid %d is not described in the taxonomy."+
log.Warnf("Taxid %d is not described in the taxonomy %s."+
" Sequence %s is discared from the reference database",
seq.Taxid(), seq.Id())
seq.Taxid(), taxo.Name(), seq.Id())
}
}

View File

@@ -8,7 +8,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
@@ -42,7 +41,7 @@ func TagOptionSet(options *getoptions.GetOpt) {
// the obiuniq command
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options, true, false)
obioptions.LoadTaxonomyOptionSet(options, true, false)
TagOptionSet(options)
}