Add a first version of obitag, the successor of ecotag

This commit is contained in:
2022-10-26 13:16:56 +02:00
parent e17d1fbca6
commit 8aa323dad5
17 changed files with 884 additions and 5 deletions

View File

@ -0,0 +1,22 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
)
func main() {
optionParser := obioptions.GenerateOptionParser(obirefidx.OptionSet)
_, args, _ := optionParser(os.Args)
fs, _ := obiconvert.ReadBioSequencesBatch(args...)
indexed := obirefidx.IndexReferenceDB(fs)
written, _ := obiconvert.WriteBioSequencesBatch(indexed, false)
written.Consume()
}

View File

@ -0,0 +1,21 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
)
func main() {
optionParser := obioptions.GenerateOptionParser(obitag.OptionSet)
_, args, _ := optionParser(os.Args)
fs, _ := obiconvert.ReadBioSequencesBatch(args...)
identified := obitag.AssignTaxonomy(fs)
obiconvert.WriteBioSequencesBatch(identified, true)
}

View File

@ -4,6 +4,7 @@ import (
"bufio" "bufio"
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt"
"io" "io"
"os" "os"
"reflect" "reflect"
@ -12,6 +13,15 @@ import (
"github.com/barkimedes/go-deepcopy" "github.com/barkimedes/go-deepcopy"
) )
// InterfaceToString converts an interface{} value to its string
// representation using the default formatting of the fmt package.
// The err return value is always nil.
func InterfaceToString(i interface{}) (val string, err error) {
err = nil
val = fmt.Sprintf("%v", i)
return
}
// NotAnInteger defines a new type of Error : "NotAnInteger"
type NotAnInteger struct {
message string
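
A minimal usage sketch of the new helper (with the %v verb, any value is rendered in its default fmt format):

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
)

func main() {
    s, _ := goutils.InterfaceToString(42)
    fmt.Println(s) // prints "42"
    s, _ = goutils.InterfaceToString([]int{1, 2})
    fmt.Println(s) // prints "[1 2]"
}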

29
pkg/goutils/minmax.go Normal file
View File

@ -0,0 +1,29 @@
package goutils
func MinInt(x, y int) int {
if x < y {
return x
}
return y
}
func MaxInt(x, y int) int {
if x < y {
return y
}
return x
}
func MinUInt16(x, y uint16) uint16 {
if x < y {
return x
}
return y
}
func MaxUInt16(x, y uint16) uint16 {
if x < y {
return y
}
return x
}

33
pkg/goutils/ranks.go Normal file
View File

@ -0,0 +1,33 @@
package goutils
import "sort"
// intRanker is a helper type for the IntOrder function.
type intRanker struct {
x []int // Data to be ranked.
r []int // A list of indexes into x that reflects rank order after sorting.
}
// intRanker satisfies sort.Interface without mutating the data slice, x.
func (r intRanker) Len() int { return len(r.x) }
func (r intRanker) Less(i, j int) bool { return r.x[r.r[i]] < r.x[r.r[j]] }
func (r intRanker) Swap(i, j int) { r.r[i], r.r[j] = r.r[j], r.r[i] }
func IntOrder(data []int) []int {
if len(data) == 0 {
return nil
}
rk := intRanker{
x: data,
r: make([]int, len(data)),
}
for i := 0; i < len(data); i++ {
rk.r[i] = i
}
sort.Sort(rk)
return rk.r
}
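
IntOrder returns the permutation of indexes that sorts the data (an order, not a rank vector); a minimal sketch:

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
)

func main() {
    scores := []int{5, 2, 9, 1}
    order := goutils.IntOrder(scores)
    fmt.Println(order) // [3 1 0 2]: scores[3] < scores[1] < scores[0] < scores[2]
}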

118
pkg/obialign/fulllcs.go Normal file
View File

@ -0,0 +1,118 @@
package obialign
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type FullLCSMatrix struct {
matrix []int16 // Score matrix
lenght []int16 // Alignment length matrix
ll int // Length of the longest sequence
l int // Length of the shortest sequence
}
func NewFullLCSMatrix(matrix *FullLCSMatrix, L, l int) *FullLCSMatrix {
if matrix == nil {
matrix = &FullLCSMatrix{}
}
if l > L {
log.Panicf("L (%d) must be greater or equal to l (%d)", L, l)
}
needed := (L) * (l)
if needed > matrix.Cap() {
matrix.matrix = make([]int16, needed)
matrix.lenght = make([]int16, needed)
}
matrix.matrix = matrix.matrix[:needed]
matrix.lenght = matrix.lenght[:needed]
matrix.ll = L
matrix.l = l
return matrix
}
func (matrix *FullLCSMatrix) Cap() int {
return cap(matrix.matrix)
}
func (matrix *FullLCSMatrix) Length() int {
return len(matrix.matrix)
}
func (matrix *FullLCSMatrix) Get(i, j int) (int16, int16) {
if i == 0 {
return 0, int16(j)
}
if j == 0 {
return 0, int16(i)
}
pos := (i-1)*matrix.ll + j - 1
return matrix.matrix[pos], matrix.lenght[pos]
}
func (matrix *FullLCSMatrix) Set(i, j int, score, length int16) {
if i > 0 && j > 0 {
pos := (i-1)*matrix.ll + j - 1
matrix.matrix[pos] = score
matrix.lenght[pos] = length
}
}
// FullLCSScore computes the length of the longest common subsequence (LCS)
// between two DNA sequences and the length of the corresponding alignment.
func FullLCSScore(seqA, seqB *obiseq.BioSequence,
matrix *FullLCSMatrix) (int, int) {
if seqA.Length() == 0 {
log.Fatal("Sequence A has a length of 0")
}
if seqB.Length() == 0 {
log.Fatal("Sequence B has a length of 0")
}
// swapped := false
if seqA.Length() < seqB.Length() {
seqA, seqB = seqB, seqA
// swapped = true
}
la := seqA.Length()
lb := seqB.Length()
sa := seqA.Sequence()
sb := seqB.Sequence()
matrix = NewFullLCSMatrix(matrix, la, lb)
for i := 1; i <= matrix.l; i++ {
for j := 1; j <= matrix.ll; j++ {
sd, ld := matrix.Get(i-1, j-1)
if sb[i-1] == sa[j-1] {
sd++
}
sup, lup := matrix.Get(i-1, j)
sleft, lleft := matrix.Get(i, j-1)
switch {
case sd >= sup && sd >= sleft:
matrix.Set(i, j, sd, ld+1)
case sup >= sleft && sup >= sd:
matrix.Set(i, j, sup, lup+1)
default:
matrix.Set(i, j, sleft, lleft+1)
}
}
}
s, l := matrix.Get(lb, la)
return int(s), int(l)
}
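
A minimal sketch of calling the new full-matrix LCS directly, assuming the obiseq.NewBioSequence(id, sequence, definition) constructor; passing a nil matrix allocates one, and the same matrix can be reused across calls to avoid reallocations:

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)

func main() {
    // obiseq.NewBioSequence(id, sequence, definition) is assumed here.
    seqA := obiseq.NewBioSequence("A", []byte("gattacagatta"), "")
    seqB := obiseq.NewBioSequence("B", []byte("gatcacagatta"), "")

    // score is the LCS length, length the length of the corresponding alignment.
    score, length := obialign.FullLCSScore(seqA, seqB, nil)
    fmt.Println(score, length)
}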

View File

@ -96,8 +96,18 @@ func (matrix *LCSMatrix) Set(i, j int, score, length int16) {
}
}
// LCSScore computes the length of the longest common subsequence (LCS)
// between two DNA sequences and the length of the corresponding alignment.
// If the alignment implies more than maxError differences, (-1, -1) is returned.
func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int,
matrix *LCSMatrix) (int, int) {
if seqA.Length() == 0 {
log.Fatal("Sequence A has a length of 0")
}
if seqB.Length() == 0 {
log.Fatal("Sequence B has a length of 0")
}
// swapped := false
if seqA.Length() < seqB.Length() {
@ -105,18 +115,39 @@ func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int,
// swapped = true
}
if (seqA.Length() - seqB.Length()) > maxError {
return -1, -1
}
-matrix = NewLCSMatrix(matrix, seqA.Length(), seqB.Length(), maxError)
la := seqA.Length()
lb := seqB.Length()
sa := seqA.Sequence()
sb := seqB.Sequence()
matrix = NewLCSMatrix(matrix, la, lb, maxError)
for i := 1; i <= matrix.l; i++ {
ij := max(1, i-matrix.extra)
sj := min(i+matrix.delta+matrix.extra, matrix.ll)
for j := ij; j <= sj; j++ {
sd, ld := matrix.Get(i-1, j-1)
-if seqB.Sequence()[i-1] == seqA.Sequence()[j-1] {
if i > lb {
log.Println("Error on seq B ", 1, matrix.l)
log.Println(i)
log.Println(seqB.Length(), "/", lb)
log.Println(string(sa))
log.Fatalln(string(sb))
}
if j > la {
log.Println("Error on seq A ", ij, sj)
log.Println(j)
log.Println(seqA.Length(), "/", la)
log.Println(string(sa))
log.Fatalln(string(sb))
}
if sb[i-1] == sa[j-1] {
sd++
}
sup, lup := matrix.Get(i-1, j)
@ -132,11 +163,13 @@ func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int,
}
}
-s, l := matrix.Get(seqB.Length(), seqA.Length())
s, l := matrix.Get(lb, la)
if (l - s) > int16(maxError) {
// log.Println(l,s,l-s,maxError)
return -1, -1
}
return int(s), int(l)
}
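
By contrast, the banded LCSScore gives up and returns (-1, -1) as soon as the alignment implies more than maxError differences; a minimal sketch, again assuming the obiseq.NewBioSequence constructor:

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)

func main() {
    seqA := obiseq.NewBioSequence("A", []byte("gattacagattaca"), "")
    seqB := obiseq.NewBioSequence("B", []byte("gatcacagattaca"), "")

    // Only alignments with at most 2 differences are accepted.
    score, length := obialign.LCSScore(seqA, seqB, 2, nil)
    if score < 0 {
        fmt.Println("more than 2 differences")
    } else {
        fmt.Println(score, length)
    }
}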

View File

@ -465,6 +465,14 @@ func (iterator IBioSequenceBatch) Recycle() {
log.Debugf("End of the recycling of %d Bioseq objects", recycled)
}
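// Consume exhausts the iterator, recycling every batch it produces.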
func (iterator IBioSequenceBatch) Consume() {
for iterator.Next() {
batch := iterator.Get()
batch.Recycle()
}
}
func (iterator IBioSequenceBatch) Count(recycle bool) (int, int, int) {
variants := 0
reads := 0

View File

@ -0,0 +1,17 @@
package obiiter
// func MakeSetAttributeWorker(rank string) obiiter.SeqWorker {
// if !goutils.Contains(taxonomy.RankList(), rank) {
// log.Fatalf("%s is not a valid rank (allowed ranks are %v)",
// rank,
// taxonomy.RankList())
// }
// w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
// taxonomy.SetTaxonAtRank(sequence, rank)
// return sequence
// }
// return w
// }

76
pkg/obikmer/counting.go Normal file
View File

@ -0,0 +1,76 @@
package obikmer
import (
"math"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type Table4mer [256]uint16
func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer {
iternal_buffer := Encode4mer(seq, buffer)
if counts == nil {
var w Table4mer
counts = &w
}
// Every cell of the counter is set to zero
for i := 0; i < 256; i++ {
(*counts)[i] = 0
}
for _, code := range iternal_buffer {
(*counts)[code]++
}
return counts
}
func Common4Mer(count1, count2 *Table4mer) int {
sum := 0
for i := 0; i < 256; i++ {
sum += int(goutils.MinUInt16((*count1)[i], (*count2)[i]))
}
return sum
}
func Sum4Mer(count *Table4mer) int {
sum := 0
for i := 0; i < 256; i++ {
sum += int((*count)[i])
}
return sum
}
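// LCS4MerBounds estimates, from the 4-mer count tables of two sequences,
// a lower and an upper bound of the length of their longest common
// subsequence.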
func LCS4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smin := goutils.MinInt(s1, s2)
cw := Common4Mer(count1, count2)
lcsMax := smin + 3 - int(math.Ceil(float64(smin-cw)/4.0))
lcsMin := cw
if cw > 0 {
lcsMin += 3
}
return lcsMin, lcsMax
}
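// Error4MerBounds estimates, from the 4-mer count tables of two sequences,
// a lower and an upper bound of the number of differences implied by their
// LCS alignment.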
func Error4MerBounds(count1, count2 *Table4mer) (int, int) {
s1 := Sum4Mer(count1)
s2 := Sum4Mer(count2)
smax := goutils.MaxInt(s1, s2)
cw := Common4Mer(count1, count2)
errorMax := smax - cw + 2*int(math.Floor(float64(cw+5)/8.0))
errorMin := int(math.Ceil(float64(errorMax) / 4.0))
return errorMin, errorMax
}
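
A worked sketch of the bound computations on two hand-filled count tables (in real use the tables come from Count4Mer; the values below are chosen only for illustration):

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
)

func main() {
    var t1, t2 obikmer.Table4mer
    t1[0] = 10 // ten occurrences of the 4-mer "aaaa" (code 0)
    t2[0] = 10

    fmt.Println(obikmer.Common4Mer(&t1, &t2)) // 10 shared 4-mers

    lcsMin, lcsMax := obikmer.LCS4MerBounds(&t1, &t2)
    errMin, errMax := obikmer.Error4MerBounds(&t1, &t2)
    fmt.Println(lcsMin, lcsMax) // 13 13
    fmt.Println(errMin, errMax) // 1 2
}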

View File

@ -21,7 +21,7 @@ var __single_base_code__ = []byte{0,
}
// Encode4mer transforms an obiseq.BioSequence into a sequence
-// of kmer of length 4. Each letter of the sequence noot belonging
// of kmer of length 4. Each letter of the sequence not belonging
// A, C, G, T, U are considered as a A. The kmer is encoded as a byte
// value ranging from 0 to 255. Each nucleotite is represented by
// two bits. The values 0, 1, 2, 3 correspond respectively to A, C, G,
@ -65,15 +65,24 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
return *buffer
}
// Index4mer returns an index where the occurrence position of every fourmer
// is stored. The index is returned as an array of slices of integers. The
// first dimension corresponds to the code of the 4mer, the second to the
// positions at which that 4mer occurs in the sequence.
func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int {
iternal_buffer := Encode4mer(seq, buffer)
if index == nil || cap(*index) < 256 {
// A new index is created
i := make([][]int, 256)
-index = &i
if index == nil {
index = &i
} else {
*index = i
}
}
// Every cell of the index is emptied
for i := 0; i < 256; i++ {
(*index)[i] = (*index)[i][:0]
}
@ -85,6 +94,9 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
return *index
}
// FastShiftFourMer runs a fast algorithm (similar to the one used in FASTA)
// to compare two sequences. It returns two integers: the shift between the
// two sequences that maximizes the number of matching 4mers, and the count
// of matching 4mers for that shift.
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) {
iternal_buffer := Encode4mer(seq, buffer)
@ -115,3 +127,4 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
return maxshift, maxcount
}
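
A minimal sketch combining Index4mer and FastShiftFourMer, assuming the obiseq.NewBioSequence(id, sequence, definition) constructor:

package main

import (
    "fmt"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)

func main() {
    ref := obiseq.NewBioSequence("ref", []byte("gattacagattacagattaca"), "")
    query := obiseq.NewBioSequence("qry", []byte("acagattacagatt"), "")

    buffer := make([]byte, 0, 1000)
    index := obikmer.Index4mer(ref, nil, &buffer)

    shift, count := obikmer.FastShiftFourMer(index, query, &buffer)
    fmt.Println(shift, count) // best shift of query against ref and matching 4-mer count
}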

View File

@ -13,6 +13,7 @@ package obiseq
import (
"crypto/md5"
"fmt"
"strconv"
"sync/atomic"
log "github.com/sirupsen/logrus"
@ -298,6 +299,49 @@ func (s *BioSequence) Taxid() int {
return taxid
}
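// EcotagRefIndex returns the "ecotag_ref_index" attribute of the sequence
// as a map[int]string, converting it from the representations a parser may
// have produced. It returns nil when the attribute is absent.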
func (s *BioSequence) EcotagRefIndex() map[int]string {
var val map[int]string
i, ok := s.GetAttribute("ecotag_ref_index")
if !ok {
return nil
}
switch i := i.(type) {
case map[int]string:
val = i
case map[string]interface{}:
val = make(map[int]string, len(i))
for k, v := range i {
score, err := strconv.Atoi(k)
if err != nil {
log.Panicln(err)
}
val[score], err = goutils.InterfaceToString(v)
if err != nil {
log.Panicln(err)
}
}
case map[string]string:
val = make(map[int]string, len(i))
for k, v := range i {
score, err := strconv.Atoi(k)
if err != nil {
log.Panicln(err)
}
val[score] = v
}
default:
log.Panicln("value of attribute ecotag_ref_index cannot be casted to a map[int]string")
}
return val
}
func (s *BioSequence) SetTaxid(taxid int) {
annot := s.Annotations()
annot["taxid"] = taxid

25
pkg/obitax/lca.go Normal file
View File

@ -0,0 +1,25 @@
package obitax
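// LCA returns the lowest common ancestor of the taxa t1 and t2, i.e. the
// deepest node shared by the paths leading from both taxa to the root.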
func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) {
p1, err1 := t1.Path()
if err1 != nil {
return nil, err1
}
p2, err2 := t2.Path()
if err2 != nil {
return nil, err2
}
i1 := len(*p1) - 1
i2 := len(*p2) - 1
for i1 >= 0 && i2 >= 0 && (*p1)[i1].taxid == (*p2)[i2].taxid {
i1--
i2--
}
return (*p1)[i1+1], nil
}
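
A hedged sketch of how LCA composes with the taxonomy accessors used elsewhere in this commit; the taxonomy is assumed to be already loaded (for instance via obifind.CLILoadSelectedTaxonomy), and the taxids are standard NCBI identifiers used purely as an illustration:

package obitaxexample

import (
    "log"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
)

// PrintLCA is a hypothetical helper printing the lowest common ancestor
// of two taxa identified by their taxids.
func PrintLCA(taxo *obitax.Taxonomy, taxidA, taxidB int) {
    ta, err := taxo.Taxon(taxidA)
    if err != nil {
        log.Panicln(err)
    }
    tb, err := taxo.Taxon(taxidB)
    if err != nil {
        log.Panicln(err)
    }
    lca, err := ta.LCA(tb)
    if err != nil {
        log.Panicln(err)
    }
    // PrintLCA(taxo, 9606, 10090) would report the Homo sapiens / Mus musculus
    // ancestor (Euarchontoglires in the NCBI taxonomy).
    log.Println(lca.Taxid(), lca.ScientificName(), lca.Rank())
}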

View File

@ -0,0 +1,167 @@
package obirefidx
import (
"fmt"
"log"
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"github.com/schollz/progressbar/v3"
)
func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
matrix := obialign.NewFullLCSMatrix(nil,
sequence.Length(),
sequence.Length())
score := make([]int, len(references))
for i, ref := range references {
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
// mine := 0
// if refcounts != nil {
// mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
// }
lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix)
// if lcs < 0 {
// log.Print("Max error wrongly estimated", mine, maxe)
// log.Println(string(sequence.Sequence()))
// log.Fatalln(string(ref.Sequence()))
// maxe := goutils.MaxInt(sequence.Length(), ref.Length())
// lcs, alilength = obialign.LCSScore(sequence, ref, matrix)
// }
score[i] = alilength - lcs
}
o := goutils.IntOrder(score)
current_taxid, err := taxo.Taxon(references[o[0]].Taxid())
current_score := score[o[0]]
current_idx := o[0]
if err != nil {
log.Panicln(err)
}
ecotag_index := make(map[int]string)
for _, idx := range o {
new_taxid, err := taxo.Taxon(references[idx].Taxid())
if err != nil {
log.Panicln(err)
}
new_taxid, err = current_taxid.LCA(new_taxid)
if err != nil {
log.Panicln(err)
}
new_score := score[idx]
if current_taxid.Taxid() != new_taxid.Taxid() {
if new_score > current_score {
ecotag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
current_score = new_score
}
current_taxid = new_taxid
current_idx = idx
}
}
ecotag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
sequence.SetAttribute("ecotag_ref_index", ecotag_index)
return ecotag_index
}
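
Roughly, the map built above associates an error count with the LCA of every reference lying at most that many differences away from the indexed sequence, encoded as "taxid@scientific_name@rank". A hypothetical attribute value could look like this (taxids and thresholds are illustrative only):

var hypotheticalRefIndex = map[int]string{
    0:  "9606@Homo sapiens@species", // identical references are all H. sapiens
    4:  "9604@Hominidae@family",     // within 4 differences the LCA widens to Hominidae
    12: "40674@Mammalia@class",      // within 12 differences it widens to Mammalia
}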
func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
references := iterator.Load()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
}
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowCount(),
progressbar.OptionShowIts(),
progressbar.OptionSetDescription("[Sequence Processing]"),
)
bar := progressbar.NewOptions(len(references), pbopt...)
limits := make(chan [2]int)
indexed := obiiter.MakeIBioSequenceBatch()
go func() {
for i := 0; i < len(references); i += 10 {
limits <- [2]int{i, goutils.MinInt(i+10, len(references))}
}
close(limits)
}()
f := func() {
for l := range limits {
sl := obiseq.MakeBioSequenceSlice()
for i := l[0]; i < l[1]; i++ {
IndexSequence(i, references, refcounts, taxo)
sl = append(sl, references[i])
}
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
bar.Add(len(sl))
}
indexed.Done()
}
nworkers := obioptions.CLIParallelWorkers()
indexed.Add(nworkers)
go func() {
indexed.WaitAndClose()
}()
for w := 0; w < nworkers; w++ {
go f()
}
return indexed.Rebatch(1000)
}

View File

@ -0,0 +1,14 @@
package obirefidx
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
// OptionSet adds to the basic option set every option declared for
// the obirefidx command.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options, true, false)
}

View File

@ -0,0 +1,204 @@
package obitag
import (
"fmt"
"log"
"strconv"
"strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
)
func IndexSequence(seqidx int,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) map[int]string {
sequence := references[seqidx]
matrix := obialign.NewLCSMatrix(nil,
sequence.Length(),
sequence.Length(),
sequence.Length())
score := make([]int, len(references))
for i, ref := range references {
maxe := goutils.MaxInt(sequence.Length(), ref.Length())
mine := 0
if refcounts != nil {
mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i])
}
lcs, alilength := obialign.LCSScore(sequence, ref, (maxe+1)*2, matrix)
if lcs < 0 {
log.Print("Max error wrongly estimated", mine, maxe)
log.Println(string(sequence.Sequence()))
log.Fatalln(string(ref.Sequence()))
maxe := goutils.MaxInt(sequence.Length(), ref.Length())
lcs, alilength = obialign.LCSScore(sequence, ref, maxe, matrix)
}
score[i] = alilength - lcs
}
o := goutils.IntOrder(score)
current_taxid, err := taxo.Taxon(references[o[0]].Taxid())
current_score := score[o[0]]
current_idx := o[0]
if err != nil {
log.Panicln(err)
}
ecotag_index := make(map[int]string)
for _, idx := range o {
new_taxid, err := taxo.Taxon(references[idx].Taxid())
if err != nil {
log.Panicln(err)
}
new_taxid, err = current_taxid.LCA(new_taxid)
if err != nil {
log.Panicln(err)
}
new_score := score[idx]
if current_taxid.Taxid() != new_taxid.Taxid() {
if new_score > current_score {
ecotag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
current_score = new_score
}
current_taxid = new_taxid
current_idx = idx
}
}
ecotag_index[score[current_idx]] = fmt.Sprintf(
"%d@%s@%s",
current_taxid.Taxid(),
current_taxid.ScientificName(),
current_taxid.Rank())
sequence.SetAttribute("ecotag_ref_index", ecotag_index)
return ecotag_index
}
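// FindClosest returns the reference sequence with the smallest LCS-based
// distance to the query, together with that distance (number of differences),
// the corresponding identity and the index of the reference in the slice.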
func FindClosest(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice) (*obiseq.BioSequence, int, float64, int) {
matrix := obialign.NewLCSMatrix(nil,
sequence.Length(),
sequence.Length(),
sequence.Length())
maxe := goutils.MaxInt(sequence.Length(), references[0].Length())
best := references[0]
bestidx := 0
bestId := 0.0
for i, ref := range references {
lcs, alilength := obialign.LCSScore(sequence, ref, maxe, matrix)
if lcs == -1 {
// That alignment is worse than maxe, go to the next sequence
continue
}
score := alilength - lcs
if score < maxe {
best = references[i]
bestidx = i
maxe = score
bestId = float64(lcs) / float64(alilength)
// log.Println(best.Id(), maxe, bestId)
}
if maxe == 0 {
// We have found an identical sequence, no need to continue the search
break
}
}
return best, maxe, bestId, bestidx
}
func Identify(sequence *obiseq.BioSequence,
references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) *obiseq.BioSequence {
best, differences, identity, seqidx := FindClosest(sequence, references)
idx := best.EcotagRefIndex()
if idx == nil {
idx = IndexSequence(seqidx, references, refcounts, taxo)
}
d := differences
identification, ok := idx[d]
for !ok && d >= 0 {
identification, ok = idx[d]
d--
}
parts := strings.Split(identification, "@")
taxid, err := strconv.Atoi(parts[0])
if err != nil {
log.Panicln("Cannot extract taxid from :", identification)
}
sequence.SetTaxid(taxid)
sequence.SetAttribute("scientific_name", parts[1])
sequence.SetAttribute("ecotag_rank", parts[2])
sequence.SetAttribute("ecotag_id", identity)
sequence.SetAttribute("ecotag_difference", differences)
sequence.SetAttribute("ecotag_match", best.Id())
return sequence
}
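
A hedged sketch of reading back the annotations written by Identify through the obiseq accessors used elsewhere in this commit (ReportAssignment is a hypothetical helper):

package obitagexample

import (
    "log"

    "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)

// ReportAssignment logs the taxonomic assignment stored on a query sequence.
func ReportAssignment(sequence *obiseq.BioSequence) {
    rank, _ := sequence.GetAttribute("ecotag_rank")
    identity, _ := sequence.GetAttribute("ecotag_id")
    match, _ := sequence.GetAttribute("ecotag_match")
    log.Println(sequence.Id(), sequence.Taxid(), rank, identity, match)
}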
func IdentifySeqWorker(references obiseq.BioSequenceSlice,
refcounts []*obikmer.Table4mer,
taxo *obitax.Taxonomy) obiiter.SeqWorker {
return func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
return Identify(sequence, references, refcounts, taxo)
}
}
func AssignTaxonomy(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
references := CLIRefDB()
refcounts := make(
[]*obikmer.Table4mer,
len(references))
buffer := make([]byte, 0, 1000)
for i, seq := range references {
refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil)
}
taxo, error := obifind.CLILoadSelectedTaxonomy()
if error != nil {
log.Panicln(error)
}
worker := IdentifySeqWorker(references, refcounts, taxo)
return iterator.Rebatch(10).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
}

View File

@ -0,0 +1,45 @@
package obitag
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
"github.com/DavidGamba/go-getoptions"
)
var _RefDB = ""
func TagOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_RefDB, "reference-db", _RefDB,
options.Alias("R"),
options.Required(),
options.ArgName("FILENAME"),
options.Description("The name of the file containing the reference DB"))
}
// OptionSet adds to the basic option set every option declared for
// the obitag command.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obifind.LoadTaxonomyOptionSet(options, true, false)
TagOptionSet(options)
}
func CLIRefDBName() string {
return _RefDB
}
func CLIRefDB() obiseq.BioSequenceSlice {
refdb, err := obiformats.ReadSequencesBatchFromFile(_RefDB)
if err != nil {
log.Panicf("Cannot open the reference library file: %s\n", _RefDB)
}
return refdb.Load()
}