From 8aa323dad5e5f89bcbf201c235b55ef7da42cbc5 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 26 Oct 2022 13:16:56 +0200 Subject: [PATCH] Add a first version of obitag the successor of ecotag --- cmd/obitools/obirefidx/main.go | 22 +++ cmd/obitools/obitag/main.go | 21 +++ pkg/goutils/goutils.go | 10 ++ pkg/goutils/minmax.go | 29 ++++ pkg/goutils/ranks.go | 33 +++++ pkg/obialign/fulllcs.go | 118 ++++++++++++++++ pkg/obialign/lcs.go | 39 +++++- pkg/obiiter/batchiterator.go | 8 ++ pkg/obiiter/sequence_workers.go | 17 +++ pkg/obikmer/counting.go | 76 +++++++++++ pkg/obikmer/encodefourmer.go | 17 ++- pkg/obiseq/biosequence.go | 44 ++++++ pkg/obitax/lca.go | 25 ++++ pkg/obitools/obirefidx/obirefidx.go | 167 +++++++++++++++++++++++ pkg/obitools/obirefidx/options.go | 14 ++ pkg/obitools/obitag/obitag.go | 204 ++++++++++++++++++++++++++++ pkg/obitools/obitag/options.go | 45 ++++++ 17 files changed, 884 insertions(+), 5 deletions(-) create mode 100644 cmd/obitools/obirefidx/main.go create mode 100644 cmd/obitools/obitag/main.go create mode 100644 pkg/goutils/minmax.go create mode 100644 pkg/goutils/ranks.go create mode 100644 pkg/obialign/fulllcs.go create mode 100644 pkg/obiiter/sequence_workers.go create mode 100644 pkg/obikmer/counting.go create mode 100644 pkg/obitax/lca.go create mode 100644 pkg/obitools/obirefidx/obirefidx.go create mode 100644 pkg/obitools/obirefidx/options.go create mode 100644 pkg/obitools/obitag/obitag.go create mode 100644 pkg/obitools/obitag/options.go diff --git a/cmd/obitools/obirefidx/main.go b/cmd/obitools/obirefidx/main.go new file mode 100644 index 0000000..47d5bcc --- /dev/null +++ b/cmd/obitools/obirefidx/main.go @@ -0,0 +1,22 @@ +package main + +import ( + "os" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" +) + +func main() { + optionParser := obioptions.GenerateOptionParser(obirefidx.OptionSet) + + _, args, _ := optionParser(os.Args) + + fs, _ := obiconvert.ReadBioSequencesBatch(args...) + indexed := obirefidx.IndexReferenceDB(fs) + + written, _ := obiconvert.WriteBioSequencesBatch(indexed, false) + written.Consume() +} diff --git a/cmd/obitools/obitag/main.go b/cmd/obitools/obitag/main.go new file mode 100644 index 0000000..713cdda --- /dev/null +++ b/cmd/obitools/obitag/main.go @@ -0,0 +1,21 @@ +package main + +import ( + "os" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obitag" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" +) + +func main() { + optionParser := obioptions.GenerateOptionParser(obitag.OptionSet) + + _, args, _ := optionParser(os.Args) + + fs, _ := obiconvert.ReadBioSequencesBatch(args...) + identified := obitag.AssignTaxonomy(fs) + + obiconvert.WriteBioSequencesBatch(identified, true) +} diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index e0cbf62..e42c5db 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -4,6 +4,7 @@ import ( "bufio" "bytes" "encoding/json" + "fmt" "io" "os" "reflect" @@ -12,6 +13,15 @@ import ( "github.com/barkimedes/go-deepcopy" ) +// InterfaceToInt converts a interface{} to an integer value if possible. +// If not a "NotAnInteger" error is returned via the err +// return value and val is set to 0. +func InterfaceToString(i interface{}) (val string, err error) { + err = nil + val = fmt.Sprintf("%V", i) + return +} + // NotAnInteger defines a new type of Error : "NotAnInteger" type NotAnInteger struct { message string diff --git a/pkg/goutils/minmax.go b/pkg/goutils/minmax.go new file mode 100644 index 0000000..e99cf1f --- /dev/null +++ b/pkg/goutils/minmax.go @@ -0,0 +1,29 @@ +package goutils + +func MinInt(x, y int) int { + if x < y { + return x + } + return y +} + +func MaxInt(x, y int) int { + if x < y { + return y + } + return x +} + +func MinUInt16(x, y uint16) uint16 { + if x < y { + return x + } + return y +} + +func MaxUInt16(x, y uint16) uint16 { + if x < y { + return y + } + return x +} diff --git a/pkg/goutils/ranks.go b/pkg/goutils/ranks.go new file mode 100644 index 0000000..af94aac --- /dev/null +++ b/pkg/goutils/ranks.go @@ -0,0 +1,33 @@ +package goutils + +import "sort" + +// intRanker is a helper type for the rank function. +type intRanker struct { + x []int // Data to be ranked. + r []int // A list of indexes into f that reflects rank order after sorting. +} + +// ranker satisfies the sort.Interface without mutating the reference slice, f. +func (r intRanker) Len() int { return len(r.x) } +func (r intRanker) Less(i, j int) bool { return r.x[r.r[i]] < r.x[r.r[j]] } +func (r intRanker) Swap(i, j int) { r.r[i], r.r[j] = r.r[j], r.r[i] } + +func IntOrder(data []int) []int { + if len(data) == 0 { + return nil + } + + rk := intRanker{ + x: data, + r: make([]int, len(data)), + } + + for i := 0; i < len(data); i++ { + rk.r[i] = i + } + + sort.Sort(rk) + + return rk.r +} diff --git a/pkg/obialign/fulllcs.go b/pkg/obialign/fulllcs.go new file mode 100644 index 0000000..2d2189e --- /dev/null +++ b/pkg/obialign/fulllcs.go @@ -0,0 +1,118 @@ +package obialign + +import ( + "log" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +) + +type FullLCSMatrix struct { + matrix []int16 // Score matrix + lenght []int16 // Alignment length matrix + ll int // Length of the longest sequence + l int // Length of the shortest sequence +} + +func NewFullLCSMatrix(matrix *FullLCSMatrix, L, l int) *FullLCSMatrix { + if matrix == nil { + matrix = &FullLCSMatrix{} + } + + if l > L { + log.Panicf("L (%d) must be greater or equal to l (%d)", L, l) + } + + needed := (L) * (l) + + if needed > matrix.Cap() { + matrix.matrix = make([]int16, needed) + matrix.lenght = make([]int16, needed) + } + + matrix.matrix = matrix.matrix[:needed] + matrix.lenght = matrix.lenght[:needed] + + matrix.ll = L + matrix.l = l + + return matrix +} + +func (matrix *FullLCSMatrix) Cap() int { + return cap(matrix.matrix) +} + +func (matrix *FullLCSMatrix) Length() int { + return len(matrix.matrix) +} + +func (matrix *FullLCSMatrix) Get(i, j int) (int16, int16) { + if i == 0 { + return 0, int16(j) + } + if j == 0 { + return 0, int16(i) + } + + pos := (i-1)*matrix.ll + j - 1 + return matrix.matrix[pos], matrix.lenght[pos] +} + +func (matrix *FullLCSMatrix) Set(i, j int, score, length int16) { + if i > 0 && j > 0 { + pos := (i-1)*matrix.ll + j - 1 + matrix.matrix[pos] = score + matrix.lenght[pos] = length + } +} + +// Computes the LCS between two DNA sequences and the length of the +// corresponding alignment +func FullLCSScore(seqA, seqB *obiseq.BioSequence, + matrix *FullLCSMatrix) (int, int) { + + if seqA.Length() == 0 { + log.Fatal("Sequence A has a length of 0") + } + + if seqB.Length() == 0 { + log.Fatal("Sequence B has a length of 0") + } + // swapped := false + + if seqA.Length() < seqB.Length() { + seqA, seqB = seqB, seqA + // swapped = true + } + + la := seqA.Length() + lb := seqB.Length() + sa := seqA.Sequence() + sb := seqB.Sequence() + + matrix = NewFullLCSMatrix(matrix, la, lb) + + for i := 1; i <= matrix.l; i++ { + for j := 1; j <= matrix.ll; j++ { + sd, ld := matrix.Get(i-1, j-1) + + if sb[i-1] == sa[j-1] { + sd++ + } + sup, lup := matrix.Get(i-1, j) + sleft, lleft := matrix.Get(i, j-1) + switch { + case sd >= sup && sd >= sleft: + matrix.Set(i, j, sd, ld+1) + case sup >= sleft && sup >= sd: + matrix.Set(i, j, sup, lup+1) + default: + matrix.Set(i, j, sleft, lleft+1) + } + } + } + + s, l := matrix.Get(lb, la) + + return int(s), int(l) +} diff --git a/pkg/obialign/lcs.go b/pkg/obialign/lcs.go index 41835b6..ecce1f3 100644 --- a/pkg/obialign/lcs.go +++ b/pkg/obialign/lcs.go @@ -96,8 +96,18 @@ func (matrix *LCSMatrix) Set(i, j int, score, length int16) { } } +// Computes the LCS between two DNA sequences and the length of the +// corresponding alignment func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int, matrix *LCSMatrix) (int, int) { + + if seqA.Length() == 0 { + log.Fatal("Sequence A has a length of 0") + } + + if seqB.Length() == 0 { + log.Fatal("Sequence B has a length of 0") + } // swapped := false if seqA.Length() < seqB.Length() { @@ -105,18 +115,39 @@ func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int, // swapped = true } + + if (seqA.Length() - seqB.Length()) > maxError { return -1, -1 } - matrix = NewLCSMatrix(matrix, seqA.Length(), seqB.Length(), maxError) + la := seqA.Length() + lb := seqB.Length() + sa := seqA.Sequence() + sb := seqB.Sequence() + + matrix = NewLCSMatrix(matrix, la, lb, maxError) for i := 1; i <= matrix.l; i++ { ij := max(1, i-matrix.extra) sj := min(i+matrix.delta+matrix.extra, matrix.ll) for j := ij; j <= sj; j++ { sd, ld := matrix.Get(i-1, j-1) - if seqB.Sequence()[i-1] == seqA.Sequence()[j-1] { + if i > lb { + log.Println("Error on seq B ", 1, matrix.l) + log.Println(i) + log.Println(seqB.Length(), "/", lb) + log.Println(string(sa)) + log.Fatalln(string(sb)) + } + if j > la { + log.Println("Error on seq A ", ij, sj) + log.Println(j) + log.Println(seqA.Length(), "/", la) + log.Println(string(sa)) + log.Fatalln(string(sb)) + } + if sb[i-1] == sa[j-1] { sd++ } sup, lup := matrix.Get(i-1, j) @@ -132,11 +163,13 @@ func LCSScore(seqA, seqB *obiseq.BioSequence, maxError int, } } - s, l := matrix.Get(seqB.Length(), seqA.Length()) + s, l := matrix.Get(lb, la) if (l - s) > int16(maxError) { + // log.Println(l,s,l-s,maxError) return -1, -1 } return int(s), int(l) } + diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index 5721b97..086fb3b 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -465,6 +465,14 @@ func (iterator IBioSequenceBatch) Recycle() { log.Debugf("End of the recycling of %d Bioseq objects", recycled) } +func (iterator IBioSequenceBatch) Consume() { + for iterator.Next() { + batch := iterator.Get() + batch.Recycle() + } +} + + func (iterator IBioSequenceBatch) Count(recycle bool) (int, int, int) { variants := 0 reads := 0 diff --git a/pkg/obiiter/sequence_workers.go b/pkg/obiiter/sequence_workers.go new file mode 100644 index 0000000..302ae9d --- /dev/null +++ b/pkg/obiiter/sequence_workers.go @@ -0,0 +1,17 @@ +package obiiter + +// func MakeSetAttributeWorker(rank string) obiiter.SeqWorker { + +// if !goutils.Contains(taxonomy.RankList(), rank) { +// log.Fatalf("%s is not a valid rank (allowed ranks are %v)", +// rank, +// taxonomy.RankList()) +// } + +// w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence { +// taxonomy.SetTaxonAtRank(sequence, rank) +// return sequence +// } + +// return w +// } \ No newline at end of file diff --git a/pkg/obikmer/counting.go b/pkg/obikmer/counting.go new file mode 100644 index 0000000..1a2fb9a --- /dev/null +++ b/pkg/obikmer/counting.go @@ -0,0 +1,76 @@ +package obikmer + +import ( + "math" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +) + +type Table4mer [256]uint16 + +func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer { + + iternal_buffer := Encode4mer(seq, buffer) + + if counts == nil { + var w Table4mer + counts = &w + } + + // Every cells of the counter is set to zero + for i := 0; i < 256; i++ { + (*counts)[i] = 0 + } + + for _, code := range iternal_buffer { + (*counts)[code]++ + } + return counts +} + +func Common4Mer(count1, count2 *Table4mer) int { + sum := 0 + for i := 0; i < 256; i++ { + sum += int(goutils.MinUInt16((*count1)[i], (*count2)[i])) + } + return sum +} + +func Sum4Mer(count *Table4mer) int { + sum := 0 + for i := 0; i < 256; i++ { + sum += int((*count)[i]) + } + return sum +} + +func LCS4MerBounds(count1, count2 *Table4mer) (int, int) { + s1 := Sum4Mer(count1) + s2 := Sum4Mer(count2) + smin := goutils.MinInt(s1, s2) + + cw := Common4Mer(count1, count2) + + lcsMax := smin + 3 - int(math.Ceil(float64(smin-cw)/4.0)) + lcsMin := cw + + if cw > 0 { + lcsMin += 3 + } + + return lcsMin, lcsMax +} + +func Error4MerBounds(count1, count2 *Table4mer) (int, int) { + s1 := Sum4Mer(count1) + s2 := Sum4Mer(count2) + smax := goutils.MaxInt(s1, s2) + + cw := Common4Mer(count1, count2) + + errorMax := smax - cw + 2* int(math.Floor(float64(cw+5)/8.0)) + errorMin := int(math.Ceil(float64(errorMax) / 4.0)) + + return errorMin, errorMax +} diff --git a/pkg/obikmer/encodefourmer.go b/pkg/obikmer/encodefourmer.go index 6bb735d..b2db7e1 100644 --- a/pkg/obikmer/encodefourmer.go +++ b/pkg/obikmer/encodefourmer.go @@ -21,7 +21,7 @@ var __single_base_code__ = []byte{0, } // Encode4mer transforms an obiseq.BioSequence into a sequence -// of kmer of length 4. Each letter of the sequence noot belonging +// of kmer of length 4. Each letter of the sequence not belonging // A, C, G, T, U are considered as a A. The kmer is encoded as a byte // value ranging from 0 to 255. Each nucleotite is represented by // two bits. The values 0, 1, 2, 3 correspond respectively to A, C, G, @@ -65,15 +65,24 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte { return *buffer } +// Index4mer returns an index where the occurrence position of every fourmer is +// stored. The index is returned as an array of slices of integer. The first +// dimention corresponds to the code of the 4mer, the second func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int { iternal_buffer := Encode4mer(seq, buffer) if index == nil || cap(*index) < 256 { + // A new index is created i := make([][]int, 256) - index = &i + if index == nil { + index = &i + } else { + *index = i + } } + // Every cells of the index is emptied for i := 0; i < 256; i++ { (*index)[i] = (*index)[i][:0] } @@ -85,6 +94,9 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int return *index } +// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences. +// The returned values are two integer values. The shift between both the sequences and the count of +// matching 4mer when this shift is applied between both the sequences. func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) { iternal_buffer := Encode4mer(seq, buffer) @@ -115,3 +127,4 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i return maxshift, maxcount } + diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 0855e21..b383bdd 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -13,6 +13,7 @@ package obiseq import ( "crypto/md5" "fmt" + "strconv" "sync/atomic" log "github.com/sirupsen/logrus" @@ -298,6 +299,49 @@ func (s *BioSequence) Taxid() int { return taxid } +func (s *BioSequence) EcotagRefIndex() map[int]string { + + var val map[int]string + + i, ok := s.GetAttribute("ecotag_ref_index") + + if !ok { + return nil + } + + switch i := i.(type) { + case map[int]string: + val = i + case map[string]interface{}: + val = make(map[int]string, len(i)) + for k, v := range i { + score, err := strconv.Atoi(k) + if err != nil { + log.Panicln(err) + } + + val[score], err = goutils.InterfaceToString(v) + if err != nil { + log.Panicln(err) + } + } + case map[string]string: + val = make(map[int]string, len(i)) + for k, v := range i { + score, err := strconv.Atoi(k) + if err != nil { + log.Panicln(err) + } + val[score] = v + + } + default: + log.Panicln("value of attribute ecotag_ref_index cannot be casted to a map[int]string") + } + + return val +} + func (s *BioSequence) SetTaxid(taxid int) { annot := s.Annotations() annot["taxid"] = taxid diff --git a/pkg/obitax/lca.go b/pkg/obitax/lca.go new file mode 100644 index 0000000..2bd14de --- /dev/null +++ b/pkg/obitax/lca.go @@ -0,0 +1,25 @@ +package obitax + +func (t1 *TaxNode) LCA(t2 *TaxNode) (*TaxNode, error) { + p1, err1 := t1.Path() + + if err1 != nil { + return nil, err1 + } + + p2, err2 := t2.Path() + + if err2 != nil { + return nil, err2 + } + + i1 := len(*p1) - 1 + i2 := len(*p2) - 1 + + for i1 >= 0 && i2 >= 0 && (*p1)[i1].taxid == (*p2)[i2].taxid { + i1-- + i2-- + } + + return (*p1)[i1+1], nil +} diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go new file mode 100644 index 0000000..7ee8edc --- /dev/null +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -0,0 +1,167 @@ +package obirefidx + +import ( + "fmt" + "log" + "os" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" + "github.com/schollz/progressbar/v3" +) + +func IndexSequence(seqidx int, + references obiseq.BioSequenceSlice, + refcounts []*obikmer.Table4mer, + taxo *obitax.Taxonomy) map[int]string { + + sequence := references[seqidx] + matrix := obialign.NewFullLCSMatrix(nil, + sequence.Length(), + sequence.Length()) + + score := make([]int, len(references)) + for i, ref := range references { + // maxe := goutils.MaxInt(sequence.Length(), ref.Length()) + // mine := 0 + // if refcounts != nil { + // mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i]) + // } + + + lcs, alilength := obialign.FullLCSScore(sequence, ref, matrix) + + // if lcs < 0 { + // log.Print("Max error wrongly estimated", mine, maxe) + // log.Println(string(sequence.Sequence())) + // log.Fatalln(string(ref.Sequence())) + + // maxe := goutils.MaxInt(sequence.Length(), ref.Length()) + // lcs, alilength = obialign.LCSScore(sequence, ref, matrix) + // } + score[i] = alilength - lcs + } + + o := goutils.IntOrder(score) + + current_taxid, err := taxo.Taxon(references[o[0]].Taxid()) + current_score := score[o[0]] + current_idx := o[0] + + if err != nil { + log.Panicln(err) + } + + ecotag_index := make(map[int]string) + + for _, idx := range o { + new_taxid, err := taxo.Taxon(references[idx].Taxid()) + if err != nil { + log.Panicln(err) + } + + new_taxid, err = current_taxid.LCA(new_taxid) + if err != nil { + log.Panicln(err) + } + + new_score := score[idx] + + if current_taxid.Taxid() != new_taxid.Taxid() { + + if new_score > current_score { + ecotag_index[score[current_idx]] = fmt.Sprintf( + "%d@%s@%s", + current_taxid.Taxid(), + current_taxid.ScientificName(), + current_taxid.Rank()) + current_score = new_score + } + + current_taxid = new_taxid + current_idx = idx + } + } + + ecotag_index[score[current_idx]] = fmt.Sprintf( + "%d@%s@%s", + current_taxid.Taxid(), + current_taxid.ScientificName(), + current_taxid.Rank()) + + sequence.SetAttribute("ecotag_ref_index", ecotag_index) + + return ecotag_index +} + +func IndexReferenceDB(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { + + references := iterator.Load() + refcounts := make( + []*obikmer.Table4mer, + len(references)) + + buffer := make([]byte, 0, 1000) + + for i, seq := range references { + refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) + } + + taxo, error := obifind.CLILoadSelectedTaxonomy() + if error != nil { + log.Panicln(error) + } + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + progressbar.OptionSetDescription("[Sequence Processing]"), + ) + + bar := progressbar.NewOptions(len(references), pbopt...) + + limits := make(chan [2]int) + indexed := obiiter.MakeIBioSequenceBatch() + go func() { + for i := 0; i < len(references); i += 10 { + limits <- [2]int{i, goutils.MinInt(i+10, len(references))} + } + close(limits) + }() + + f := func() { + for l := range limits { + sl := obiseq.MakeBioSequenceSlice() + for i := l[0]; i < l[1]; i++ { + IndexSequence(i, references, refcounts, taxo) + sl = append(sl, references[i]) + } + indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl)) + bar.Add(len(sl)) + } + + indexed.Done() + } + + nworkers := obioptions.CLIParallelWorkers() + indexed.Add(nworkers) + + go func() { + indexed.WaitAndClose() + }() + + for w := 0; w < nworkers; w++ { + go f() + } + + return indexed.Rebatch(1000) +} diff --git a/pkg/obitools/obirefidx/options.go b/pkg/obitools/obirefidx/options.go new file mode 100644 index 0000000..d7a0754 --- /dev/null +++ b/pkg/obitools/obirefidx/options.go @@ -0,0 +1,14 @@ +package obirefidx + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" + "github.com/DavidGamba/go-getoptions" +) + +// OptionSet adds to the basic option set every options declared for +// the obiuniq command +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + obifind.LoadTaxonomyOptionSet(options, true, false) +} diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go new file mode 100644 index 0000000..f63ed28 --- /dev/null +++ b/pkg/obitools/obitag/obitag.go @@ -0,0 +1,204 @@ +package obitag + +import ( + "fmt" + "log" + "strconv" + "strings" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" +) + +func IndexSequence(seqidx int, + references obiseq.BioSequenceSlice, + refcounts []*obikmer.Table4mer, + taxo *obitax.Taxonomy) map[int]string { + + sequence := references[seqidx] + matrix := obialign.NewLCSMatrix(nil, + sequence.Length(), + sequence.Length(), + sequence.Length()) + + score := make([]int, len(references)) + for i, ref := range references { + maxe := goutils.MaxInt(sequence.Length(), ref.Length()) + mine := 0 + if refcounts != nil { + mine, maxe = obikmer.Error4MerBounds(refcounts[seqidx], refcounts[i]) + } + lcs, alilength := obialign.LCSScore(sequence, ref, (maxe+1)*2, matrix) + + if lcs < 0 { + log.Print("Max error wrongly estimated", mine, maxe) + log.Println(string(sequence.Sequence())) + log.Fatalln(string(ref.Sequence())) + + maxe := goutils.MaxInt(sequence.Length(), ref.Length()) + lcs, alilength = obialign.LCSScore(sequence, ref, maxe, matrix) + } + score[i] = alilength - lcs + } + + o := goutils.IntOrder(score) + + current_taxid, err := taxo.Taxon(references[o[0]].Taxid()) + current_score := score[o[0]] + current_idx := o[0] + + if err != nil { + log.Panicln(err) + } + + ecotag_index := make(map[int]string) + + for _, idx := range o { + new_taxid, err := taxo.Taxon(references[idx].Taxid()) + if err != nil { + log.Panicln(err) + } + + new_taxid, err = current_taxid.LCA(new_taxid) + if err != nil { + log.Panicln(err) + } + + new_score := score[idx] + + if current_taxid.Taxid() != new_taxid.Taxid() { + + if new_score > current_score { + ecotag_index[score[current_idx]] = fmt.Sprintf( + "%d@%s@%s", + current_taxid.Taxid(), + current_taxid.ScientificName(), + current_taxid.Rank()) + current_score = new_score + } + + current_taxid = new_taxid + current_idx = idx + } + } + + ecotag_index[score[current_idx]] = fmt.Sprintf( + "%d@%s@%s", + current_taxid.Taxid(), + current_taxid.ScientificName(), + current_taxid.Rank()) + + sequence.SetAttribute("ecotag_ref_index", ecotag_index) + + return ecotag_index +} + +func FindClosest(sequence *obiseq.BioSequence, + references obiseq.BioSequenceSlice) (*obiseq.BioSequence, int, float64, int) { + + matrix := obialign.NewLCSMatrix(nil, + sequence.Length(), + sequence.Length(), + sequence.Length()) + + maxe := goutils.MaxInt(sequence.Length(), references[0].Length()) + best := references[0] + bestidx := 0 + bestId := 0.0 + + for i, ref := range references { + lcs, alilength := obialign.LCSScore(sequence, ref, maxe, matrix) + if lcs == -1 { + // That aligment is worst than maxe, go to the next sequence + continue + } + + score := alilength - lcs + if score < maxe { + best = references[i] + bestidx = i + maxe = score + bestId = float64(lcs) / float64(alilength) + // log.Println(best.Id(), maxe, bestId) + } + + if maxe == 0 { + // We have found identity no need to continue to search + break + } + } + return best, maxe, bestId, bestidx +} + +func Identify(sequence *obiseq.BioSequence, + references obiseq.BioSequenceSlice, + refcounts []*obikmer.Table4mer, + taxo *obitax.Taxonomy) *obiseq.BioSequence { + best, differences, identity, seqidx := FindClosest(sequence, references) + + idx := best.EcotagRefIndex() + if idx == nil { + idx = IndexSequence(seqidx, references, refcounts, taxo) + } + + d := differences + identification, ok := idx[d] + for !ok && d >= 0 { + identification, ok = idx[d] + d-- + } + + parts := strings.Split(identification, "@") + taxid, err := strconv.Atoi(parts[0]) + + if err != nil { + log.Panicln("Cannot extract taxid from :", identification) + } + + sequence.SetTaxid(taxid) + sequence.SetAttribute("scientific_name", parts[1]) + sequence.SetAttribute("ecotag_rank", parts[2]) + sequence.SetAttribute("ecotag_id", identity) + sequence.SetAttribute("ecotag_difference", differences) + sequence.SetAttribute("ecotag_match", best.Id()) + + return sequence +} + +func IdentifySeqWorker(references obiseq.BioSequenceSlice, + refcounts []*obikmer.Table4mer, + taxo *obitax.Taxonomy) obiiter.SeqWorker { + return func(sequence *obiseq.BioSequence) *obiseq.BioSequence { + return Identify(sequence, references, refcounts, taxo) + } +} + +func AssignTaxonomy(iterator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { + + references := CLIRefDB() + refcounts := make( + []*obikmer.Table4mer, + len(references)) + + buffer := make([]byte, 0, 1000) + + for i, seq := range references { + refcounts[i] = obikmer.Count4Mer(seq, &buffer, nil) + } + + taxo, error := obifind.CLILoadSelectedTaxonomy() + + if error != nil { + log.Panicln(error) + } + + worker := IdentifySeqWorker(references, refcounts, taxo) + + return iterator.Rebatch(10).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000) +} diff --git a/pkg/obitools/obitag/options.go b/pkg/obitools/obitag/options.go new file mode 100644 index 0000000..5f7e07d --- /dev/null +++ b/pkg/obitools/obitag/options.go @@ -0,0 +1,45 @@ +package obitag + +import ( + "log" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind" + "github.com/DavidGamba/go-getoptions" +) + +var _RefDB = "" + +func TagOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_RefDB, "reference-db",_RefDB, + options.Alias("R"), + options.Required(), + options.ArgName("FILENAME"), + options.Description("The name of the file containing the reference DB")) + + +} + +// OptionSet adds to the basic option set every options declared for +// the obiuniq command +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + obifind.LoadTaxonomyOptionSet(options,true,false) + TagOptionSet(options) +} + +func CLIRefDBName() string { + return _RefDB +} + +func CLIRefDB() obiseq.BioSequenceSlice { + refdb,err := obiformats.ReadSequencesBatchFromFile(_RefDB) + + if err != nil { + log.Panicf("Cannot open the reference library file : %s\n",_RefDB) + } + + return refdb.Load() +} \ No newline at end of file