A first prototype for the space of sequences

Former-commit-id: 07dc6ef044b5b6a6fb45dc2acb01dffe71a96195
This commit is contained in:
2023-08-27 14:58:55 +02:00
parent cbd42d5b30
commit 9bf006af93
17 changed files with 969 additions and 117 deletions

View File

@@ -48,11 +48,14 @@ func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
options.Description("Restrict output to some subclades."))
}
func CLISelectedNCBITaxDump() string {
return __taxdump__
}
func CLIHasSelectedTaxonomy() bool {
return __taxdump__ != ""
}
func CLIAreAlternativeNamesSelected() bool {
return __alternative_name__
}

View File

@@ -10,6 +10,9 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obistats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
@@ -103,14 +106,14 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
library := iterator.Load()
library_size := len(library)
n_landmark := NCenter()
n_landmark := CLINCenter()
landmark_idx := obistats.SampleIntWithoutReplacement(n_landmark, library_size)
log.Infof("Library contains %d sequence", len(library))
var seqworld obiutils.Matrix[float64]
for loop := 0; loop < 5; loop++ {
for loop := 0; loop < 2; loop++ {
sort.IntSlice(landmark_idx).Sort()
log.Debugf("Selected indices : %v", landmark_idx)
@@ -154,14 +157,52 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
}
classes := obistats.AssignToClass(&seqworld, &initialCenters)
for i, seq := range library {
seq.SetAttribute("landmark_coord", seqworld[i])
ic, _ := obiutils.InterfaceToIntSlice(seqworld[i])
seq.SetCoordinate(ic)
seq.SetAttribute("landmark_class", classes[i])
if i, ok := seq_landmark[i]; ok {
seq.SetAttribute("landmark_id", i)
}
}
if obifind.CLIHasSelectedTaxonomy() {
taxo, err := obifind.CLILoadSelectedTaxonomy()
if err != nil {
log.Fatal(err)
}
taxa := make(obitax.TaxonSet, len(library))
for i, seq := range library {
taxa[i], err = taxo.Taxon(seq.Taxid())
if err != nil {
log.Fatal(err)
}
}
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowCount(),
progressbar.OptionShowIts(),
progressbar.OptionSetDescription("[Sequence Indexing]"),
)
bar := progressbar.NewOptions(len(library), pbopt...)
for i, seq := range library {
idx := obirefidx.GeomIndexSesquence(i, library, &taxa, taxo)
seq.SetOBITagGeomRefIndex(idx)
if i%10 == 0 {
bar.Add(10)
}
}
}
return obiiter.IBatchOver(library, obioptions.CLIBatchSize())
}

View File

@@ -2,28 +2,37 @@ package obilandmark
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
var _nCenter = 200
// ObilandmarkOptionSet sets the options for Obilandmark.
// LandmarkOptionSet sets the options for Obilandmark.
//
// options: a pointer to the getoptions.GetOpt struct.
// Return type: none.
func ObilandmarkOptionSet(options *getoptions.GetOpt) {
func LandmarkOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_nCenter, "center", _nCenter,
options.Alias("n"),
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
options.Description("Number of landmark sequences to be selected."))
}
// OptionSet is a function that sets the options for the GetOpt struct.
//
// It takes a pointer to a GetOpt struct as its parameter and does not return anything.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
ObilandmarkOptionSet(options)
obifind.LoadTaxonomyOptionSet(options, false, false)
LandmarkOptionSet(options)
}
func NCenter() int {
// CLINCenter returns desired number of centers as specified by user.
//
// No parameters.
// Returns an integer value.
func CLINCenter() int {
return _nCenter
}

View File

@@ -0,0 +1,79 @@
package obirefidx
import (
"fmt"
"log"
"sort"
"sync"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
func GeomIndexSesquence(seqidx int,
references obiseq.BioSequenceSlice,
taxa *obitax.TaxonSet,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
location := sequence.GetCoordinate()
if location == nil {
log.Fatalf("Sequence %s does not have a coordinate", sequence.Id())
}
seq_dist := make([]float64, len(references))
var wg sync.WaitGroup
for i, ref := range references {
wg.Add(1)
go func(i int, ref *obiseq.BioSequence) {
defer wg.Done()
reflocation := ref.GetCoordinate()
if reflocation == nil {
log.Fatalf("Sequence %s does not have a coordinate", ref.Id())
}
d := 0.0
for i, x := range location {
diff := float64(x - reflocation[i])
d += diff * diff
}
seq_dist[i] = d
}(i, ref)
}
wg.Wait()
order := obiutils.Order(sort.Float64Slice(seq_dist))
lca := (*taxa)[seqidx]
index := make(map[int]string)
index[0.0] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),
lca.ScientificName(),
lca.Rank())
old_dist := 0.0
for _, o := range order {
new_lca, _ := lca.LCA((*taxa)[o])
if new_lca.Taxid() != lca.Taxid() || seq_dist[o] != old_dist {
lca = new_lca
old_dist = seq_dist[o]
index[int(seq_dist[o])] = fmt.Sprintf(
"%d@%s@%s",
lca.Taxid(),
lca.ScientificName(),
lca.Rank())
}
if lca.Taxid() == 1 {
break
}
}
return index
}

View File

@@ -24,9 +24,6 @@ func IndexSequence(seqidx int,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
// matrix := obialign.NewFullLCSMatrix(nil,
// sequence.Length(),
// sequence.Length())
var matrix []uint64
@@ -54,7 +51,9 @@ func IndexSequence(seqidx int,
nok := make([]int, len(*pseq))
nfast := make([]int, len(*pseq))
nfastok := make([]int, len(*pseq))
*/lseq := sequence.Len()
*/
lseq := sequence.Len()
mini := -1
wordmin := 0

View File

@@ -0,0 +1,209 @@
package obitag
import (
"log"
"math"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
)
// ExtractLandmarkSeqs extracts landmark sequences from the given BioSequenceSlice.
//
// The landmark sequences are extracted from the given BioSequenceSlice and put in a new BioSequenceSlice
// in the order corresponding to their landmark IDs.
//
// references: A pointer to a BioSequenceSlice containing the references.
// Returns: A pointer to a BioSequenceSlice containing the extracted landmark sequences.
func ExtractLandmarkSeqs(references *obiseq.BioSequenceSlice) *obiseq.BioSequenceSlice {
landmarks := make(map[int]*obiseq.BioSequence, 100)
for _, ref := range *references {
if id := ref.GetLandmarkID(); id != -1 {
landmarks[id] = ref
}
}
ls := obiseq.NewBioSequenceSlice(len(landmarks))
*ls = (*ls)[0:len(landmarks)]
for k, l := range landmarks {
(*ls)[k] = l
}
return ls
}
// ExtractTaxonSet extracts a set of taxa from the given references and taxonomy.
//
// If a reference sequence has a taxid absent from the taxonomy, the function will panic.
//
// The function takes two parameters:
// - references: a pointer to a BioSequenceSlice, which is a slice of BioSequence objects.
// - taxonomy: a pointer to a Taxonomy object.
//
// The function returns a pointer to a TaxonSet, which is a set of taxa.
func ExtractTaxonSet(references *obiseq.BioSequenceSlice, taxonomy *obitax.Taxonomy) *obitax.TaxonSet {
var err error
taxa := make(obitax.TaxonSet, len(*references))
for i, ref := range *references {
taxid := ref.Taxid()
taxa[i], err = taxonomy.Taxon(taxid)
if err != nil {
log.Panicf("Taxid %d, for sequence %s not found in taxonomy", taxid, ref.Id())
}
}
return &taxa
}
// MapOnLandmarkSequences calculates the coordinates of landmarks on a given sequence.
//
// It takes in three parameters:
// - sequence: a pointer to a BioSequence object representing the sequence.
// - landmarks: a pointer to a BioSequenceSlice object representing the landmarks.
// - buffer: a pointer to a slice of uint64, used as a buffer for calculations.
//
// It returns a slice of integers representing the coordinates of the landmarks on the sequence.
func MapOnLandmarkSequences(sequence *obiseq.BioSequence, landmarks *obiseq.BioSequenceSlice, buffer *[]uint64) []int {
coords := make([]int, len(*landmarks))
for i, l := range *landmarks {
lcs, length := obialign.FastLCSEGFScore(sequence, l, -1, buffer)
coords[i] = length - lcs
}
return coords
}
// FindGeomClosest finds the closest geometric sequence in a given set of reference sequences to a query sequence.
//
// Parameters:
// - sequence: A pointer to a BioSequence object representing the query sequence.
// - landmarks: A pointer to a BioSequenceSlice object representing the landmarks.
// - references: A pointer to a BioSequenceSlice object representing the reference sequences.
// - buffer: A pointer to a slice of uint64 representing a buffer.
//
// Returns:
// - A pointer to a BioSequence object representing the closest sequence.
// - An int representing the minimum distance.
// - A float64 representing the best identity score.
// - An array of int representing the indices of the closest sequences.
// - A pointer to a BioSequenceSlice object representing the matched sequences.
func FindGeomClosest(sequence *obiseq.BioSequence,
landmarks *obiseq.BioSequenceSlice,
references *obiseq.BioSequenceSlice,
buffer *[]uint64) (*obiseq.BioSequence, int, float64, []int, *obiseq.BioSequenceSlice) {
min_dist := math.MaxInt64
min_idx := make([]int, 0)
query_location := MapOnLandmarkSequences(sequence, landmarks, buffer)
for i, l := range *references {
coord := l.GetCoordinate()
if len(coord) == 0 {
log.Panicf("Empty coordinate for reference sequence %s", l.Id())
}
dist := 0
for j := 0; j < len(coord); j++ {
diff := query_location[j] - coord[j]
dist += diff * diff
}
if dist == min_dist {
min_idx = append(min_idx, i)
}
if dist < min_dist {
min_dist = dist
min_idx = make([]int, 0)
min_idx = append(min_idx, i)
}
}
best_seq := (*references)[min_idx[0]]
best_id := 0.0
for _, i := range min_idx {
seq := (*references)[i]
lcs, length := obialign.FastLCSEGFScore(sequence, seq, -1, buffer)
ident := float64(lcs) / float64(length)
if ident > best_id {
best_id = ident
best_seq = seq
}
}
matches := obiseq.MakeBioSequenceSlice(len(min_idx))
matches = matches[0:len(min_idx)]
for i, j := range min_idx {
matches[i] = (*references)[j]
}
return best_seq, min_dist, best_id, query_location, &matches
}
func GeomIdentify(sequence *obiseq.BioSequence,
landmarks *obiseq.BioSequenceSlice,
references *obiseq.BioSequenceSlice,
taxa *obitax.TaxonSet,
taxo *obitax.Taxonomy,
buffer *[]uint64) *obiseq.BioSequence {
best_seq, min_dist, best_id, query_location, matches := FindGeomClosest(sequence, landmarks, references, buffer)
taxon := (*obitax.TaxNode)(nil)
var err error
if best_id > 0.5 {
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[0].OBITagGeomRefIndex())
taxon, _ = taxo.Taxon(taxid)
for i := 1; i < len(*matches); i++ {
taxid, _, _ := MatchDistanceIndex(min_dist, (*matches)[i].OBITagGeomRefIndex())
newTaxon, _ := taxo.Taxon(taxid)
taxon, err = newTaxon.LCA(taxon)
if err != nil {
log.Panicf("LCA error: %v", err)
}
}
sequence.SetTaxid(taxon.Taxid())
} else {
taxon, _ = taxo.Taxon(1)
sequence.SetTaxid(1)
}
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", best_id)
sequence.SetAttribute("obitag_bestmatch", best_seq.Id())
sequence.SetAttribute("obitag_min_dist", min_dist)
sequence.SetAttribute("obitag_coord", query_location)
sequence.SetAttribute("obitag_match_count", len(*matches))
sequence.SetAttribute("obitag_similarity_method", "geometric")
return sequence
}
func GeomIdentifySeqWorker(references *obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy) obiseq.SeqWorker {
landmarks := ExtractLandmarkSeqs(references)
taxa := ExtractTaxonSet(references, taxo)
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
buffer := make([]uint64, 100)
return GeomIdentify(sequence, landmarks, references, taxa, taxo, &buffer)
}
}
func CLIGeomAssignTaxonomy(iterator obiiter.IBioSequence,
references obiseq.BioSequenceSlice,
taxo *obitax.Taxonomy,
) obiiter.IBioSequence {
worker := GeomIdentifySeqWorker(&references, taxo)
return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0)
}

View File

@@ -1,6 +1,7 @@
package obitag
import (
"sort"
"strconv"
"strings"
@@ -16,6 +17,61 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
// MatchDistanceIndex returns the taxid, rank, and scientificName based on the given distance and distanceIdx.
//
// Parameters:
// - distance: The distance to match against the keys in distanceIdx.
// - distanceIdx: A map containing distances as keys and corresponding values in the format "taxid@rank@scientificName".
//
// Returns:
// - taxid: The taxid associated with the matched distance.
// - rank: The rank associated with the matched distance.
// - scientificName: The scientific name associated with the matched distance.
func MatchDistanceIndex(distance int, distanceIdx map[int]string) (int, string, string) {
keys := make([]int, 0, len(distanceIdx))
for k := range distanceIdx {
keys = append(keys, k)
}
sort.Ints(keys)
i := sort.Search(len(keys), func(i int) bool {
return distance <= keys[i]
})
var taxid int
var rank string
var scientificName string
if i == len(keys) || distance > keys[len(keys)-1] {
taxid = 1
rank = "no rank"
scientificName = "root"
} else {
parts := strings.Split(distanceIdx[keys[i]], "@")
taxid, _ = strconv.Atoi(parts[0])
rank = parts[1]
scientificName = parts[2]
}
// log.Info("taxid:", taxid, " rank:", rank, " scientificName:", scientificName)
return taxid, rank, scientificName
}
// FindClosests finds the closest bio sequence from a given sequence and a slice of reference sequences.
//
// Parameters:
// - sequence: the bio sequence to find the closest matches for.
// - references: a slice of reference sequences to compare against.
// - refcounts: a slice of reference sequence counts.
// - runExact: a boolean flag indicating whether to run an exact match.
//
// Returns:
// - bests: a slice of the closest bio sequences.
// - maxe: the maximum score.
// - bestId: the best ID.
// - bestmatch: the best match.
// - bestidxs: a slice of the best indexes.
func FindClosests(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
@@ -94,6 +150,18 @@ func FindClosests(sequence *obiseq.BioSequence,
return bests, maxe, bestId, bestmatch, bestidxs
}
// Identify makes the taxonomic identification of a BioSequence.
//
// Parameters:
// - sequence: A pointer to a BioSequence to identify.
// - references: A BioSequenceSlice.
// - refcounts: A slice of pointers to Table4mer.
// - taxa: A TaxonSet.
// - taxo: A pointer to a Taxonomy.
// - runExact: A boolean value indicating whether to run exact matching.
//
// Returns:
// - A pointer to a BioSequence.
func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
@@ -171,24 +239,19 @@ func Identify(sequence *obiseq.BioSequence,
log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx)
sequence.SetTaxid(taxon.Taxid())
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_difference", differences)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests))
} else {
taxon, _ = taxo.Taxon(1)
sequence.SetTaxid(1)
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_difference", differences)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests))
}
sequence.SetAttribute("scientific_name", taxon.ScientificName())
sequence.SetAttribute("obitag_rank", taxon.Rank())
sequence.SetAttribute("obitag_bestid", identity)
sequence.SetAttribute("obitag_bestmatch", bestmatch)
sequence.SetAttribute("obitag_match_count", len(bests))
sequence.SetAttribute("obitag_similarity_method", "lcs")
return sequence
}

View File

@@ -15,6 +15,7 @@ import (
var _RefDB = ""
var _SaveRefDB = ""
var _RunExact = false
var _GeomSim = false
func TagOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_RefDB, "reference-db", _RefDB,
@@ -27,6 +28,10 @@ func TagOptionSet(options *getoptions.GetOpt) {
options.ArgName("FILENAME"),
options.Description("The name of a file where to save the reference DB with its indices"))
options.BoolVar(&_GeomSim, "geometric", _GeomSim,
options.Alias("G"),
options.Description("Activate the experimental geometric similarity heuristic"))
// options.BoolVar(&_RunExact, "exact", _RunExact,
// options.Alias("E"),
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
@@ -55,6 +60,10 @@ func CLIRefDB() obiseq.BioSequenceSlice {
return refdb.Load()
}
func CLIGeometricMode() bool {
return _GeomSim
}
func CLIShouldISaveRefDB() bool {
return _SaveRefDB != ""
}