mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-13 18:50:26 +00:00
A first prototype for the space of sequences
Former-commit-id: 07dc6ef044b5b6a6fb45dc2acb01dffe71a96195
This commit is contained in:
@@ -48,11 +48,14 @@ func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
|
||||
options.Description("Restrict output to some subclades."))
|
||||
}
|
||||
|
||||
|
||||
func CLISelectedNCBITaxDump() string {
|
||||
return __taxdump__
|
||||
}
|
||||
|
||||
func CLIHasSelectedTaxonomy() bool {
|
||||
return __taxdump__ != ""
|
||||
}
|
||||
|
||||
func CLIAreAlternativeNamesSelected() bool {
|
||||
return __alternative_name__
|
||||
}
|
||||
|
||||
@@ -10,6 +10,9 @@ import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obistats"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
log "github.com/sirupsen/logrus"
|
||||
@@ -103,14 +106,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
library := iterator.Load()
|
||||
|
||||
library_size := len(library)
|
||||
n_landmark := NCenter()
|
||||
n_landmark := CLINCenter()
|
||||
|
||||
landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
|
||||
log.Infof("Library contains %d sequence", len(library))
|
||||
|
||||
var seqworld obiutils.Matrix[float64]
|
||||
|
||||
for loop := 0; loop < 5; loop++ {
|
||||
for loop := 0; loop < 2; loop++ {
|
||||
sort.IntSlice(landmark_idx).Sort()
|
||||
log.Debugf("Selected indices : %v", landmark_idx)
|
||||
|
||||
@@ -154,14 +157,52 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
|
||||
}
|
||||
|
||||
classes := obistats.AssignToClass(&seqworld, &initialCenters)
|
||||
|
||||
for i, seq := range library {
|
||||
seq.SetAttribute("landmark_coord", seqworld[i])
|
||||
ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
|
||||
seq.SetCoordinate(ic)
|
||||
seq.SetAttribute("landmark_class", classes[i])
|
||||
if i, ok := seq_landmark[i]; ok {
|
||||
seq.SetAttribute("landmark_id", i)
|
||||
}
|
||||
}
|
||||
|
||||
if obifind.CLIHasSelectedTaxonomy() {
|
||||
taxo, err := obifind.CLILoadSelectedTaxonomy()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
taxa := make(obitax.TaxonSet, len(library))
|
||||
|
||||
for i, seq := range library {
|
||||
taxa[i], err = taxo.Taxon(seq.Taxid())
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("[Sequence Indexing]"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(len(library), pbopt...)
|
||||
|
||||
for i, seq := range library {
|
||||
idx := obirefidx.GeomIndexSesquence(i, library, &taxa, taxo)
|
||||
seq.SetOBITagGeomRefIndex(idx)
|
||||
|
||||
if i%10 == 0 {
|
||||
bar.Add(10)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return obiiter.IBatchOver(library, obioptions.CLIBatchSize())
|
||||
|
||||
}
|
||||
|
||||
@@ -2,28 +2,37 @@ package obilandmark
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _nCenter = 200
|
||||
|
||||
// ObilandmarkOptionSet sets the options for Obilandmark.
|
||||
// LandmarkOptionSet sets the options for Obilandmark.
|
||||
//
|
||||
// options: a pointer to the getoptions.GetOpt struct.
|
||||
// Return type: none.
|
||||
func ObilandmarkOptionSet(options *getoptions.GetOpt) {
|
||||
func LandmarkOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
options.IntVar(&_nCenter, "center", _nCenter,
|
||||
options.Alias("n"),
|
||||
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||
options.Description("Number of landmark sequences to be selected."))
|
||||
}
|
||||
|
||||
// OptionSet is a function that sets the options for the GetOpt struct.
|
||||
//
|
||||
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputOptionSet(options)
|
||||
ObilandmarkOptionSet(options)
|
||||
obifind.LoadTaxonomyOptionSet(options, false, false)
|
||||
LandmarkOptionSet(options)
|
||||
}
|
||||
|
||||
func NCenter() int {
|
||||
// CLINCenter returns desired number of centers as specified by user.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer value.
|
||||
func CLINCenter() int {
|
||||
return _nCenter
|
||||
}
|
||||
|
||||
79
pkg/obitools/obirefidx/geomindexing.go
Normal file
79
pkg/obitools/obirefidx/geomindexing.go
Normal file
@@ -0,0 +1,79 @@
|
||||
package obirefidx
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
)
|
||||
|
||||
func GeomIndexSesquence(seqidx int,
|
||||
references obiseq.BioSequenceSlice,
|
||||
taxa *obitax.TaxonSet,
|
||||
taxo *obitax.Taxonomy) map[int]string {
|
||||
|
||||
sequence := references[seqidx]
|
||||
location := sequence.GetCoordinate()
|
||||
|
||||
if location == nil {
|
||||
log.Fatalf("Sequence %s does not have a coordinate", sequence.Id())
|
||||
}
|
||||
|
||||
seq_dist := make([]float64, len(references))
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for i, ref := range references {
|
||||
wg.Add(1)
|
||||
go func(i int, ref *obiseq.BioSequence) {
|
||||
defer wg.Done()
|
||||
reflocation := ref.GetCoordinate()
|
||||
if reflocation == nil {
|
||||
log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
|
||||
}
|
||||
d := 0.0
|
||||
for i, x := range location {
|
||||
diff := float64(x - reflocation[i])
|
||||
d += diff * diff
|
||||
}
|
||||
seq_dist[i] = d
|
||||
}(i, ref)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
order := obiutils.Order(sort.Float64Slice(seq_dist))
|
||||
|
||||
lca := (*taxa)[seqidx]
|
||||
|
||||
index := make(map[int]string)
|
||||
index[0.0] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
lca.ScientificName(),
|
||||
lca.Rank())
|
||||
|
||||
old_dist := 0.0
|
||||
for _, o := range order {
|
||||
new_lca, _ := lca.LCA((*taxa)[o])
|
||||
if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
|
||||
lca = new_lca
|
||||
old_dist = seq_dist[o]
|
||||
index[int(seq_dist[o])] = fmt.Sprintf(
|
||||
"%d@%s@%s",
|
||||
lca.Taxid(),
|
||||
lca.ScientificName(),
|
||||
lca.Rank())
|
||||
}
|
||||
|
||||
if lca.Taxid() == 1 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return index
|
||||
}
|
||||
@@ -24,9 +24,6 @@ func IndexSequence(seqidx int,
|
||||
taxo *obitax.Taxonomy) map[int]string {
|
||||
|
||||
sequence := references[seqidx]
|
||||
// matrix := obialign.NewFullLCSMatrix(nil,
|
||||
// sequence.Length(),
|
||||
// sequence.Length())
|
||||
|
||||
var matrix []uint64
|
||||
|
||||
@@ -54,7 +51,9 @@ func IndexSequence(seqidx int,
|
||||
nok := make([]int, len(*pseq))
|
||||
nfast := make([]int, len(*pseq))
|
||||
nfastok := make([]int, len(*pseq))
|
||||
*/lseq := sequence.Len()
|
||||
*/
|
||||
|
||||
lseq := sequence.Len()
|
||||
|
||||
mini := -1
|
||||
wordmin := 0
|
||||
|
||||
209
pkg/obitools/obitag/obigeomtag.go
Normal file
209
pkg/obitools/obitag/obigeomtag.go
Normal file
@@ -0,0 +1,209 @@
|
||||
package obitag
|
||||
|
||||
import (
|
||||
"log"
|
||||
"math"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||
)
|
||||
|
||||
// ExtractLandmarkSeqs extracts landmark sequences from the given BioSequenceSlice.
|
||||
//
|
||||
// The landmark sequences are extracted from the given BioSequenceSlice and put in a new BioSequenceSlice
|
||||
// in the order corresponding to their landmark IDs.
|
||||
//
|
||||
// references: A pointer to a BioSequenceSlice containing the references.
|
||||
// Returns: A pointer to a BioSequenceSlice containing the extracted landmark sequences.
|
||||
func ExtractLandmarkSeqs(references *obiseq.BioSequenceSlice) *obiseq.BioSequenceSlice {
|
||||
landmarks := make(map[int]*obiseq.BioSequence, 100)
|
||||
|
||||
for _, ref := range *references {
|
||||
if id := ref.GetLandmarkID(); id != -1 {
|
||||
landmarks[id] = ref
|
||||
}
|
||||
}
|
||||
|
||||
ls := obiseq.NewBioSequenceSlice(len(landmarks))
|
||||
*ls = (*ls)[0:len(landmarks)]
|
||||
|
||||
for k, l := range landmarks {
|
||||
(*ls)[k] = l
|
||||
}
|
||||
|
||||
return ls
|
||||
}
|
||||
|
||||
// ExtractTaxonSet extracts a set of taxa from the given references and taxonomy.
|
||||
//
|
||||
// If a reference sequence has a taxid absent from the taxonomy, the function will panic.
|
||||
//
|
||||
// The function takes two parameters:
|
||||
// - references: a pointer to a BioSequenceSlice, which is a slice of BioSequence objects.
|
||||
// - taxonomy: a pointer to a Taxonomy object.
|
||||
//
|
||||
// The function returns a pointer to a TaxonSet, which is a set of taxa.
|
||||
func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSet {
|
||||
var err error
|
||||
taxa := make(obitax.TaxonSet, len(*references))
|
||||
|
||||
for i, ref := range *references {
|
||||
taxid := ref.Taxid()
|
||||
taxa[i], err = taxonomy.Taxon(taxid)
|
||||
if err != nil {
|
||||
log.Panicf("Taxid %d, for sequence %s not found in taxonomy", taxid, ref.Id())
|
||||
}
|
||||
}
|
||||
|
||||
return &taxa
|
||||
}
|
||||
|
||||
// MapOnLandmarkSequences calculates the coordinates of landmarks on a given sequence.
|
||||
//
|
||||
// It takes in three parameters:
|
||||
// - sequence: a pointer to a BioSequence object representing the sequence.
|
||||
// - landmarks: a pointer to a BioSequenceSlice object representing the landmarks.
|
||||
// - buffer: a pointer to a slice of uint64, used as a buffer for calculations.
|
||||
//
|
||||
// It returns a slice of integers representing the coordinates of the landmarks on the sequence.
|
||||
func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioSequenceSlice, buffer *[]uint64) []int {
|
||||
|
||||
coords := make([]int, len(*landmarks))
|
||||
|
||||
for i, l := range *landmarks {
|
||||
lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
|
||||
coords[i] = length - lcs
|
||||
}
|
||||
|
||||
return coords
|
||||
}
|
||||
|
||||
// FindGeomClosest finds the closest geometric sequence in a given set of reference sequences to a query sequence.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: A pointer to a BioSequence object representing the query sequence.
|
||||
// - landmarks: A pointer to a BioSequenceSlice object representing the landmarks.
|
||||
// - references: A pointer to a BioSequenceSlice object representing the reference sequences.
|
||||
// - buffer: A pointer to a slice of uint64 representing a buffer.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to a BioSequence object representing the closest sequence.
|
||||
// - An int representing the minimum distance.
|
||||
// - A float64 representing the best identity score.
|
||||
// - An array of int representing the indices of the closest sequences.
|
||||
// - A pointer to a BioSequenceSlice object representing the matched sequences.
|
||||
func FindGeomClosest(sequence *obiseq.BioSequence,
|
||||
landmarks *obiseq.BioSequenceSlice,
|
||||
references *obiseq.BioSequenceSlice,
|
||||
buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
|
||||
|
||||
min_dist := math.MaxInt64
|
||||
min_idx := make([]int, 0)
|
||||
|
||||
query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
|
||||
|
||||
for i, l := range *references {
|
||||
coord := l.GetCoordinate()
|
||||
if len(coord) == 0 {
|
||||
log.Panicf("Empty coordinate for reference sequence %s", l.Id())
|
||||
}
|
||||
dist := 0
|
||||
for j := 0; j < len(coord); j++ {
|
||||
diff := query_location[j] - coord[j]
|
||||
dist += diff * diff
|
||||
}
|
||||
|
||||
if dist == min_dist {
|
||||
min_idx = append(min_idx, i)
|
||||
}
|
||||
if dist < min_dist {
|
||||
min_dist = dist
|
||||
min_idx = make([]int, 0)
|
||||
min_idx = append(min_idx, i)
|
||||
}
|
||||
}
|
||||
|
||||
best_seq := (*references)[min_idx[0]]
|
||||
best_id := 0.0
|
||||
|
||||
for _, i := range min_idx {
|
||||
seq := (*references)[i]
|
||||
lcs, length := obialign.FastLCSEGFScore(sequence, seq, -1, buffer)
|
||||
ident := float64(lcs) / float64(length)
|
||||
if ident > best_id {
|
||||
best_id = ident
|
||||
best_seq = seq
|
||||
}
|
||||
}
|
||||
|
||||
matches := obiseq.MakeBioSequenceSlice(len(min_idx))
|
||||
matches = matches[0:len(min_idx)]
|
||||
for i, j := range min_idx {
|
||||
matches[i] = (*references)[j]
|
||||
}
|
||||
|
||||
return best_seq, min_dist, best_id, query_location, &matches
|
||||
}
|
||||
|
||||
func GeomIdentify(sequence *obiseq.BioSequence,
|
||||
landmarks *obiseq.BioSequenceSlice,
|
||||
references *obiseq.BioSequenceSlice,
|
||||
taxa *obitax.TaxonSet,
|
||||
taxo *obitax.Taxonomy,
|
||||
buffer *[]uint64) *obiseq.BioSequence {
|
||||
best_seq, min_dist, best_id, query_location, matches := FindGeomClosest(sequence, landmarks, references, buffer)
|
||||
|
||||
taxon := (*obitax.TaxNode)(nil)
|
||||
var err error
|
||||
|
||||
if best_id > 0.5 {
|
||||
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[0].OBITagGeomRefIndex())
|
||||
taxon, _ = taxo.Taxon(taxid)
|
||||
for i := 1; i < len(*matches); i++ {
|
||||
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[i].OBITagGeomRefIndex())
|
||||
newTaxon, _ := taxo.Taxon(taxid)
|
||||
taxon, err = newTaxon.LCA(taxon)
|
||||
if err != nil {
|
||||
log.Panicf("LCA error: %v", err)
|
||||
}
|
||||
}
|
||||
sequence.SetTaxid(taxon.Taxid())
|
||||
} else {
|
||||
taxon, _ = taxo.Taxon(1)
|
||||
sequence.SetTaxid(1)
|
||||
}
|
||||
|
||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||
sequence.SetAttribute("obitag_bestid", best_id)
|
||||
sequence.SetAttribute("obitag_bestmatch", best_seq.Id())
|
||||
sequence.SetAttribute("obitag_min_dist", min_dist)
|
||||
sequence.SetAttribute("obitag_coord", query_location)
|
||||
sequence.SetAttribute("obitag_match_count", len(*matches))
|
||||
sequence.SetAttribute("obitag_similarity_method", "geometric")
|
||||
|
||||
return sequence
|
||||
}
|
||||
|
||||
func GeomIdentifySeqWorker(references *obiseq.BioSequenceSlice,
|
||||
taxo *obitax.Taxonomy) obiseq.SeqWorker {
|
||||
|
||||
landmarks := ExtractLandmarkSeqs(references)
|
||||
taxa := ExtractTaxonSet(references, taxo)
|
||||
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||
buffer := make([]uint64, 100)
|
||||
return GeomIdentify(sequence, landmarks, references, taxa, taxo, &buffer)
|
||||
}
|
||||
}
|
||||
|
||||
func CLIGeomAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
taxo *obitax.Taxonomy,
|
||||
) obiiter.IBioSequence {
|
||||
|
||||
worker := GeomIdentifySeqWorker(&references, taxo)
|
||||
return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0)
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package obitag
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -16,6 +17,61 @@ import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||
)
|
||||
|
||||
// MatchDistanceIndex returns the taxid, rank, and scientificName based on the given distance and distanceIdx.
|
||||
//
|
||||
// Parameters:
|
||||
// - distance: The distance to match against the keys in distanceIdx.
|
||||
// - distanceIdx: A map containing distances as keys and corresponding values in the format "taxid@rank@scientificName".
|
||||
//
|
||||
// Returns:
|
||||
// - taxid: The taxid associated with the matched distance.
|
||||
// - rank: The rank associated with the matched distance.
|
||||
// - scientificName: The scientific name associated with the matched distance.
|
||||
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
|
||||
keys := make([]int, 0, len(distanceIdx))
|
||||
for k := range distanceIdx {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
sort.Ints(keys)
|
||||
|
||||
i := sort.Search(len(keys), func(i int) bool {
|
||||
return distance <= keys[i]
|
||||
})
|
||||
|
||||
var taxid int
|
||||
var rank string
|
||||
var scientificName string
|
||||
|
||||
if i == len(keys) || distance > keys[len(keys)-1] {
|
||||
taxid = 1
|
||||
rank = "no rank"
|
||||
scientificName = "root"
|
||||
} else {
|
||||
parts := strings.Split(distanceIdx[keys[i]], "@")
|
||||
taxid, _ = strconv.Atoi(parts[0])
|
||||
rank = parts[1]
|
||||
scientificName = parts[2]
|
||||
}
|
||||
|
||||
// log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName)
|
||||
|
||||
return taxid, rank, scientificName
|
||||
}
|
||||
|
||||
// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: the bio sequence to find the closest matches for.
|
||||
// - references: a slice of reference sequences to compare against.
|
||||
// - refcounts: a slice of reference sequence counts.
|
||||
// - runExact: a boolean flag indicating whether to run an exact match.
|
||||
//
|
||||
// Returns:
|
||||
// - bests: a slice of the closest bio sequences.
|
||||
// - maxe: the maximum score.
|
||||
// - bestId: the best ID.
|
||||
// - bestmatch: the best match.
|
||||
// - bestidxs: a slice of the best indexes.
|
||||
func FindClosests(sequence *obiseq.BioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
@@ -94,6 +150,18 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
return bests, maxe, bestId, bestmatch, bestidxs
|
||||
}
|
||||
|
||||
// Identify makes the taxonomic identification of a BioSequence.
|
||||
//
|
||||
// Parameters:
|
||||
// - sequence: A pointer to a BioSequence to identify.
|
||||
// - references: A BioSequenceSlice.
|
||||
// - refcounts: A slice of pointers to Table4mer.
|
||||
// - taxa: A TaxonSet.
|
||||
// - taxo: A pointer to a Taxonomy.
|
||||
// - runExact: A boolean value indicating whether to run exact matching.
|
||||
//
|
||||
// Returns:
|
||||
// - A pointer to a BioSequence.
|
||||
func Identify(sequence *obiseq.BioSequence,
|
||||
references obiseq.BioSequenceSlice,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
@@ -171,24 +239,19 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx)
|
||||
|
||||
sequence.SetTaxid(taxon.Taxid())
|
||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||
sequence.SetAttribute("obitag_bestid", identity)
|
||||
sequence.SetAttribute("obitag_difference", differences)
|
||||
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||
|
||||
} else {
|
||||
taxon, _ = taxo.Taxon(1)
|
||||
sequence.SetTaxid(1)
|
||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||
sequence.SetAttribute("obitag_bestid", identity)
|
||||
sequence.SetAttribute("obitag_difference", differences)
|
||||
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||
}
|
||||
|
||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||
sequence.SetAttribute("obitag_bestid", identity)
|
||||
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||
sequence.SetAttribute("obitag_similarity_method", "lcs")
|
||||
|
||||
return sequence
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
var _RefDB = ""
|
||||
var _SaveRefDB = ""
|
||||
var _RunExact = false
|
||||
var _GeomSim = false
|
||||
|
||||
func TagOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_RefDB, "reference-db", _RefDB,
|
||||
@@ -27,6 +28,10 @@ func TagOptionSet(options *getoptions.GetOpt) {
|
||||
options.ArgName("FILENAME"),
|
||||
options.Description("The name of a file where to save the reference DB with its indices"))
|
||||
|
||||
options.BoolVar(&_GeomSim, "geometric", _GeomSim,
|
||||
options.Alias("G"),
|
||||
options.Description("Activate the experimental geometric similarity heuristic"))
|
||||
|
||||
// options.BoolVar(&_RunExact, "exact", _RunExact,
|
||||
// options.Alias("E"),
|
||||
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
||||
@@ -55,6 +60,10 @@ func CLIRefDB() obiseq.BioSequenceSlice {
|
||||
return refdb.Load()
|
||||
}
|
||||
|
||||
func CLIGeometricMode() bool {
|
||||
return _GeomSim
|
||||
}
|
||||
|
||||
func CLIShouldISaveRefDB() bool {
|
||||
return _SaveRefDB != ""
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user