Changes to be committed:

modified:   cmd/obitools/obitag/main.go
	modified:   cmd/obitools/obitag2/main.go
	modified:   go.mod
	modified:   go.sum
	modified:   pkg/obiformats/ncbitaxdump/read.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obiseq/attributes.go
	modified:   pkg/obiseq/taxonomy_lca.go
	modified:   pkg/obiseq/taxonomy_methods.go
	modified:   pkg/obiseq/taxonomy_predicate.go
	modified:   pkg/obitax/inner.go
	modified:   pkg/obitax/lca.go
	new file:   pkg/obitax/taxid.go
	modified:   pkg/obitax/taxon.go
	modified:   pkg/obitax/taxonomy.go
	modified:   pkg/obitax/taxonslice.go
	modified:   pkg/obitools/obicleandb/obicleandb.go
	modified:   pkg/obitools/obigrep/options.go
	modified:   pkg/obitools/obilandmark/obilandmark.go
	modified:   pkg/obitools/obilandmark/options.go
	modified:   pkg/obitools/obirefidx/famlilyindexing.go
	modified:   pkg/obitools/obirefidx/geomindexing.go
	modified:   pkg/obitools/obirefidx/obirefidx.go
	modified:   pkg/obitools/obirefidx/options.go
	modified:   pkg/obitools/obitag/obigeomtag.go
	modified:   pkg/obitools/obitag/obitag.go
	modified:   pkg/obitools/obitag/options.go
	modified:   pkg/obiutils/strings.go
This commit is contained in:
Eric Coissac
2024-12-19 13:36:59 +01:00
parent f41a6fbb60
commit 795df34d1a
28 changed files with 590 additions and 280 deletions

View File

@@ -12,102 +12,171 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obifind"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3"
)
// IndexSequence processes a biological sequence and indexes it based on taxonomic information.
// It computes the least common ancestors (LCA) for the sequence and its references,
// evaluates common k-mers, and calculates differences in alignment scores.
//
// Parameters:
// - seqidx: The index of the sequence to process within the references slice.
// - references: A slice of biological sequences to compare against.
// - kmers: A pointer to a slice of k-mer tables used for common k-mer calculations.
// - taxa: A slice of taxonomic information corresponding to the sequences.
// - taxo: A taxonomy object used for LCA calculations.
//
// Returns:
//
// A map where the keys are integers representing alignment differences,
// and the values are strings formatted as "Taxon@Rank" indicating the taxonomic
// classification of the sequence based on the computed differences.
func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice,
kmers *[]*obikmer.Table4mer,
taxa *obitax.TaxonSet,
taxa *obitax.TaxonSlice,
taxo *obitax.Taxonomy) map[int]string {
// Retrieve the biological sequence at the specified index from the references slice.
sequence := references[seqidx]
seq_len := sequence.Len()
// Get the taxon corresponding to the current sequence index.
tseq := taxa.Taxon(seqidx)
// Get the taxonomic path for the current sequence.
pseq := tseq.Path()
path_len := pseq.Len()
// For each taxonomic ancestor in the path, a biosequence slice is created to store
// the reference sequences having that ancestor as their LCA with the current sequence.
refs := make(map[*obitax.TaxNode]*[]int, path_len)
for i := 0; i < path_len; i++ {
temp := make([]int, 0, 100)
refs[pseq.Taxon(i).Node] = &temp
}
// log.Infof("%s length of path: %d", sequence.Id(), len(refs))
n := taxa.Len()
lcaCache := make(map[*obitax.TaxNode]*obitax.TaxNode, n)
for i := 0; i < n; i++ {
taxon := taxa.Taxon(i) // Get the taxon at index i.
// Compute the LCA between the current taxon and the taxon of the sequence.
node, ok := lcaCache[taxon.Node]
if !ok {
lca, err := tseq.LCA(taxon)
if err != nil {
// Log a fatal error if the LCA computation fails, including the taxon details.
log.Fatalf("(%s,%s): %+v", tseq.String(), taxon.String(), err)
}
node = lca.Node
lcaCache[taxon.Node] = node
}
// log.Infof("%s Processing taxon: %s x %s -> %s", sequence.Id(), tseq.String(), taxon.String(), node.String(taxo.Code()))
// Append the current sequence to the LCA's reference sequence slice.
*refs[node] = append(*refs[node], i)
}
closest := make([]int, path_len)
closest[0] = 0
// Initialize a matrix to store alignment scores
var matrix []uint64
lca := make(obitax.TaxonSet, len(references))
tseq := (*taxa)[seqidx]
// log.Warnf("%s : %s", sequence.Id(), pseq.String())
for idx_path := 1; idx_path < path_len; idx_path++ {
mini := -1
seqidcs := refs[pseq.Taxon(idx_path).Node]
for i, taxon := range *taxa {
lca[i], _ = tseq.LCA(taxon)
}
ns := len(*seqidcs)
cw := make([]int, len(references))
sw := (*kmers)[seqidx]
for i, ref := range *kmers {
cw[i] = obikmer.Common4Mer(sw, ref)
}
if ns > 0 {
ow := obiutils.Reverse(obiutils.IntOrder(cw), true)
pseq, _ := tseq.Path()
obiutils.Reverse(*pseq, true)
// score := make([]int, len(references))
mindiff := make([]int, len(*pseq))
/*
nseq := make([]int, len(*pseq))
nali := make([]int, len(*pseq))
nok := make([]int, len(*pseq))
nfast := make([]int, len(*pseq))
nfastok := make([]int, len(*pseq))
*/
shared := make([]int, ns)
lseq := sequence.Len()
for j, is := range *seqidcs {
shared[j] = obikmer.Common4Mer((*kmers)[seqidx], (*kmers)[is])
}
mini := -1
wordmin := 0
ow := obiutils.Reverse(obiutils.IntOrder(shared), true)
for _, order := range ow {
is := (*seqidcs)[order]
suject := references[is]
for i, ancestor := range *pseq {
for _, order := range ow {
if lca[order] == ancestor {
// nseq[i]++
if mini != -1 {
wordmin = max(sequence.Len(), references[order].Len()) - 3 - 4*mini
}
if cw[order] < wordmin {
break
wordmin := max(seq_len, suject.Len()) - 3 - 4*mini
// If the common k-mer count for the current order is less than the
// minimum word length, break the loop.
if shared[order] < wordmin {
break
}
}
// Initialize variables for Longest Common Subsequence (LCS) score and alignment length.
lcs, alilength := -1, -1
errs := int(1e9)
if mini != -1 && mini <= 1 {
// nfast[i]++
d, _, _, _ := obialign.D1Or0(sequence, references[order])
if d >= 0 {
errs = d
// nfastok[i]++
errs := int(1e9) // Initialize errors to a large number.
// If mini is set and less than or equal to 1, perform a specific alignment.
if mini == 0 || mini == 1 {
// Perform a specific alignment and get the distance.
d, _, _, _ := obialign.D1Or0(sequence, suject)
if d >= 0 { // If the distance is valid (non-negative).
errs = d // Update errors with the distance.
}
} else {
// nali[i]++
lcs, alilength = obialign.FastLCSScore(sequence, references[order], mini, &matrix)
if lcs >= 0 {
// nok[i]++
errs = alilength - lcs
// Perform a Fast LCS score calculation for the sequence and reference.
lcs, alilength = obialign.FastLCSScore(sequence, suject, mini, &matrix)
if lcs >= 0 { // If LCS score is valid (non-negative).
errs = alilength - lcs // Calculate errors based on alignment length.
}
}
// Update mini with the minimum errors found.
if mini == -1 || errs < mini {
mini = errs
}
if mini == 0 {
// log.Warnf("%s: %s", sequence.Id(), sequence.String())
// log.Warnf("%s: %s", suject.Id(), suject.String())
break
}
}
if mini == -1 {
log.Fatalf("(%s,%s): No alignment found.", sequence.Id(), pseq.Taxon(idx_path).String())
}
closest[idx_path] = mini
// ensure that closest is strictly increasing
for k := idx_path - 1; k >= 0 && mini < closest[k]; k-- {
closest[k] = mini
// log.Warnf("(%s,%s) Smaller alignment found than previous (%d,%d). Resetting closest.", sequence.Id(), pseq.Taxon(idx_path).String(), mini, closest[k])
}
} else {
closest[idx_path] = seq_len
}
mindiff[i] = mini
}
obitag_index := make(map[int]string, len(*pseq))
obitag_index := make(map[int]string, pseq.Len())
old := lseq
for i, d := range mindiff {
if d != -1 && d < old {
current_taxid := (*pseq)[i]
// log.Warnf("(%s,%s): %v", sequence.Id(), pseq.Taxon(0).String(), closest)
for i, d := range closest {
if i < (len(closest)-1) && d < closest[i+1] {
current_taxon := pseq.Taxon(i)
obitag_index[d] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
old = d
"%s@%s",
current_taxon.String(),
current_taxon.Rank(),
)
}
}
@@ -128,22 +197,21 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
source, references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy()
taxo, error := obioptions.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
log.Infoln("Indexing sequence taxids...")
taxa := make(
obitax.TaxonSet,
len(references))
n := len(references)
taxa := taxo.NewTaxonSlice(n, n)
j := 0
for i, seq := range references {
taxon, err := taxo.Taxon(seq.Taxid())
if err == nil {
taxa[j] = taxon
taxon := seq.Taxon(taxo)
if taxon != nil {
taxa.Set(j, taxon)
references[j] = references[i]
j++
}
@@ -198,7 +266,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for l := range limits {
sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ {
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
idx := IndexSequence(i, references, &refcounts, taxa, taxo)
iref := references[i].Copy()
iref.SetOBITagRefIndex(idx)
sl = append(sl, iref)