diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go index d59ac46..bce5d06 100644 --- a/pkg/obialign/fastlcsegf.go +++ b/pkg/obialign/fastlcsegf.go @@ -4,33 +4,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) -var _iupac = [26]byte{ - // a b c d e f - 1, 14, 2, 13, 0, 0, - // g h i j k l - 4, 11, 0, 0, 12, 0, - // m n o p q r - 3, 15, 0, 0, 0, 5, - // s t u v w x - 6, 8, 8, 13, 9, 0, - // y z - 10, 0, -} - -func _samenuc(a, b byte) bool { - if (a >= 'A') && (a <= 'Z') { - a |= 32 - } - if (b >= 'A') && (b <= 'Z') { - b |= 32 - } - - if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') { - return (_iupac[a-'a'] & _iupac[b-'a']) > 0 - } - return a == b -} - // FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices. // // The score is calculated using the following scoring matrix: @@ -165,7 +138,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ default: // We are in the middle of the matrix Sdiag = _incpath(previous[x]) - if _samenuc(bA[j-1], bB[i-1]) { + if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) { Sdiag = _incscore(Sdiag) } @@ -265,7 +238,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ Sleft = _notavail default: Sdiag = _incpath(previous[x]) - if _samenuc(bA[j-1], bB[i-1]) { + if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) { Sdiag = _incscore(Sdiag) } diff --git a/pkg/obialign/locatepattern.go b/pkg/obialign/locatepattern.go index 23b5538..d5170e6 100644 --- a/pkg/obialign/locatepattern.go +++ b/pkg/obialign/locatepattern.go @@ -1,6 +1,9 @@ package obialign -import log "github.com/sirupsen/logrus" +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + log "github.com/sirupsen/logrus" +) // buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix // of size width x width. The coordinates are (-1)-indexed, and the linear index @@ -69,7 +72,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) { // Mismatch score = -1 // Match score = 0 match := -1 - if _samenuc(pattern[j], sequence[i]) { + if obiseq.SameIUPACNuc(pattern[j], sequence[i]) { match = 0 } @@ -103,7 +106,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) { // Mismatch score = -1 // Match score = 0 match := -1 - if _samenuc(pattern[jmax], sequence[i]) { + if obiseq.SameIUPACNuc(pattern[jmax], sequence[i]) { match = 0 } diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 7a2605a..76d4cf6 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "d7ed9d3" +var _Commit = "29bf4ce" var _Version = "Release 4.4.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/iupac_nog.go b/pkg/obiseq/iupac_nog.go new file mode 100644 index 0000000..a3fba9b --- /dev/null +++ b/pkg/obiseq/iupac_nog.go @@ -0,0 +1,28 @@ +package obiseq + +var _iupac = [26]byte{ + // a b c d e f + 1, 14, 2, 13, 0, 0, + // g h i j k l + 4, 11, 0, 0, 12, 0, + // m n o p q r + 3, 15, 0, 0, 0, 5, + // s t u v w x + 6, 8, 8, 13, 9, 0, + // y z + 10, 0, +} + +func SameIUPACNuc(a, b byte) bool { + if (a >= 'A') && (a <= 'Z') { + a |= 32 + } + if (b >= 'A') && (b <= 'Z') { + b |= 32 + } + + if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') { + return (_iupac[a-'a'] & _iupac[b-'a']) > 0 + } + return a == b +} diff --git a/pkg/obitools/obiclean/chimera.go b/pkg/obitools/obiclean/chimera.go index b13637a..fda821e 100644 --- a/pkg/obitools/obiclean/chimera.go +++ b/pkg/obitools/obiclean/chimera.go @@ -2,12 +2,10 @@ package obiclean import ( "fmt" - "os" "sort" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" - "github.com/schollz/progressbar/v3" log "github.com/sirupsen/logrus" ) @@ -21,13 +19,13 @@ func commonPrefix(a, b *obiseq.BioSequence) int { as := a.Sequence() bs := b.Sequence() - for i < l && as[i] == bs[i] { + for i < l && obiseq.SameIUPACNuc(as[i], bs[i]) { i++ } - if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { - log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) - } + // if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { + // log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) + // } return i } @@ -44,29 +42,97 @@ func commonSuffix(a, b *obiseq.BioSequence) int { bs := b.Sequence() l := 0 - for i >= 0 && j >= 0 && as[i] == bs[j] { + for i >= 0 && j >= 0 && obiseq.SameIUPACNuc(as[i], bs[j]) { i-- j-- l++ } - if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { - log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) - } - // obilog.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) + // if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { + // log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) + // } + // log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) return l } +// oneDifference return true if s1 and s2 differ by exactly 1 operation +// (substitution, insertion or deletion) +func oneDifference(s1, s2 []byte) bool { + l1 := len(s1) + l2 := len(s2) + + // Case 1: same lengths → test substitution + if l1 == l2 { + diff := 0 + for i := 0; i < l1; i++ { + if s1[i] != s2[i] { + diff++ + if diff > 1 { + return false + } + } + } + return diff == 1 + } + + // Case 2: difference of 1 character → insertion/deletion + if l1 == l2+1 { + for i := 0; i < l1; i++ { + if string(s1[:i])+string(s1[i+1:]) == string(s2) { + return true + } + } + return false + } + if l2 == l1+1 { + for i := 0; i < l2; i++ { + if string(s2[:i])+string(s2[i+1:]) == string(s1) { + return true + } + } + return false + } + + // Case 3: difference > 1 character + return false +} + +func GetChimera(sequence *obiseq.BioSequence) map[string]string { + annotation := sequence.Annotations() + iobistatus, ok := annotation["chimera"] + var chimera map[string]string + var err error + + if ok { + switch iobistatus := iobistatus.(type) { + case map[string]string: + chimera = iobistatus + case map[string]interface{}: + chimera = make(map[string]string) + for k, v := range iobistatus { + chimera[k], err = obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("chimera attribute of sequence %s must be castable to a map[string]string", sequence.Id()) + } + } + } + } else { + chimera = make(map[string]string) + annotation["chimera"] = chimera + } + + return chimera +} + +// add the suffix/prefix comparisons to detect chimeras (handle IUPAC codes) func AnnotateChimera(samples map[string]*[]*seqPCR) { w := func(sample string, seqs *[]*seqPCR) { ls := len(*seqs) - cp := make([]int, ls) - cs := make([]int, ls) - pcrs := make([]*seqPCR, 0, ls) + // select only sequences without edges (head sequences) for _, s := range *seqs { if len(s.Edges) == 0 { pcrs = append(pcrs, s) @@ -75,66 +141,63 @@ func AnnotateChimera(samples map[string]*[]*seqPCR) { lp := len(pcrs) + // sort by increasing weight (like increasing abundance) sort.Slice(pcrs, func(i, j int) bool { return pcrs[i].Weight < pcrs[j].Weight }) for i, s := range pcrs { - for j := i + 1; j < lp; j++ { - s2 := pcrs[j] - cp[j] = commonPrefix(s.Sequence, s2.Sequence) - cs[j] = commonSuffix(s.Sequence, s2.Sequence) - } + seqRef := s.Sequence + L := seqRef.Len() - var cm map[string]string - var err error + maxLeft, maxRight := 0, 0 + var nameLeft, nameRight string - chimera, ok := s.Sequence.GetAttribute("chimera") + // looking for potential parents + for j := 0; j < lp; j++ { + if j == i { + continue + } + seqParent := pcrs[j].Sequence - if !ok { - cm = map[string]string{} - } else { - cm, err = obiutils.InterfaceToStringMap(chimera) - if err != nil { - log.Fatalf("type of chimera not map[string]string: %T (%v)", - chimera, err) + // Check abundance: parent must be more abundant + if pcrs[j].Weight <= s.Weight { + continue + } + + // Check edit distance (skip if only one diff, supposed to never happen) + if oneDifference(seqRef.Sequence(), seqParent.Sequence()) { + continue + } + + // Common prefix + left := commonPrefix(seqRef, seqParent) + if left > maxLeft { + maxLeft = left + nameLeft = seqParent.Id() + } + + // Common suffix + right := commonSuffix(seqRef, seqParent) + if right > maxRight { + maxRight = right + nameRight = seqParent.Id() } } - ls := s.Sequence.Len() + // Select parents with longuest prefix/suffix + // Condition prefix+suffix covers the sequence and sequence not include into parent + if maxLeft >= L-maxRight && maxLeft > 0 && maxRight < L { - for k := i + 1; k < lp; k++ { - for l := i + 1; l < lp; l++ { - if k != l && cp[k]+cs[l] == ls { - cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)", - pcrs[k].Sequence.Id(), - pcrs[l].Sequence.Id(), - cp[k]) - } - } - } - - if len(cm) > 0 { - s.Sequence.SetAttribute("chimera", cm) + chimeraMap := GetChimera(s.Sequence) + // overlap sequence + overlap := seqRef.Sequence()[L-maxRight : maxLeft] + chimeraMap[sample] = fmt.Sprintf("{%s}/{%s}@(%s)(%d)(%d)(%d)", nameLeft, nameRight, overlap, L-maxRight, maxLeft, len(overlap)) } } - } - pbopt := make([]progressbar.Option, 0, 5) - pbopt = append(pbopt, - progressbar.OptionSetWriter(os.Stderr), - progressbar.OptionSetWidth(15), - progressbar.OptionShowIts(), - progressbar.OptionSetPredictTime(true), - progressbar.OptionSetDescription("[Chimera detection]"), - ) - - bar := progressbar.NewOptions(len(samples), pbopt...) - for sn, sqs := range samples { w(sn, sqs) - bar.Add(1) } - } diff --git a/pkg/obitools/obimatrix/obimatrix.go b/pkg/obitools/obimatrix/obimatrix.go index 3ad44f8..0a77159 100644 --- a/pkg/obitools/obimatrix/obimatrix.go +++ b/pkg/obitools/obimatrix/obimatrix.go @@ -265,17 +265,18 @@ func CLIWriteCSVToStdout(matrix *MatrixData) { osamples := samples.Members() sort.Strings(osamples) - columns := make([]string, 0, len(osamples)+len(matrix.attributeList)) columns = append(columns, matrix.attributeList...) columns = append(columns, osamples...) + header := slices.Clone(columns) + csvwriter.Write(columns) nattribs := len(matrix.attributeList) for k, data := range matrix.matrix { attrs := matrix.attributes[k] - for i, kk := range osamples { + for i, kk := range header { if i < nattribs { if v, ok := attrs[kk]; ok { vs, err := obiutils.InterfaceToString(v)