work on obiclean chimera detection

2026-06-24 01:31:00 +00:00 · 2025-10-20 16:35:19 +02:00
parent 29bf4ce871
commit d17a9520b9
6 changed files with 160 additions and 92 deletions
@@ -4,33 +4,6 @@ import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 )

-var _iupac = [26]byte{
-	//  a   b  c  d   e  f
-	1, 14, 2, 13, 0, 0,
-	//  g   h  i  j  k   l
-	4, 11, 0, 0, 12, 0,
-	//  m   n  o  p  q  r
-	3, 15, 0, 0, 0, 5,
-	//  s  t  u   v  w  x
-	6, 8, 8, 13, 9, 0,
-	//  y   z
-	10, 0,
-}
-
-func _samenuc(a, b byte) bool {
-	if (a >= 'A') && (a <= 'Z') {
-		a |= 32
-	}
-	if (b >= 'A') && (b <= 'Z') {
-		b |= 32
-	}
-
-	if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
-		return (_iupac[a-'a'] & _iupac[b-'a']) > 0
-	}
-	return a == b
-}
-
 // FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
 //
 // The score is calculated using the following scoring matrix:
@@ -165,7 +138,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
 			default:
 				// We are in the middle of the matrix
 				Sdiag = _incpath(previous[x])
-				if _samenuc(bA[j-1], bB[i-1]) {
+				if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
 					Sdiag = _incscore(Sdiag)
 				}

@@ -265,7 +238,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
 				Sleft = _notavail
 			default:
 				Sdiag = _incpath(previous[x])
-				if _samenuc(bA[j-1], bB[i-1]) {
+				if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
 					Sdiag = _incscore(Sdiag)
 				}

@@ -1,6 +1,9 @@
 package obialign

-import log "github.com/sirupsen/logrus"
+import (
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	log "github.com/sirupsen/logrus"
+)

 // buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix
 // of size width x width. The coordinates are (-1)-indexed, and the linear index
@@ -69,7 +72,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
 			// Mismatch score = -1
 			// Match score = 0
 			match := -1
-			if _samenuc(pattern[j], sequence[i]) {
+			if obiseq.SameIUPACNuc(pattern[j], sequence[i]) {
 				match = 0
 			}

@@ -103,7 +106,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
 		// Mismatch score = -1
 		// Match score = 0
 		match := -1
-		if _samenuc(pattern[jmax], sequence[i]) {
+		if obiseq.SameIUPACNuc(pattern[jmax], sequence[i]) {
 			match = 0
 		}

@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited

-var _Commit = "d7ed9d3"
+var _Commit = "29bf4ce"
 var _Version = "Release 4.4.0"

 // Version returns the version of the obitools package.
@@ -0,0 +1,28 @@
+package obiseq
+
+var _iupac = [26]byte{
+	//  a   b  c  d   e  f
+	1, 14, 2, 13, 0, 0,
+	//  g   h  i  j  k   l
+	4, 11, 0, 0, 12, 0,
+	//  m   n  o  p  q  r
+	3, 15, 0, 0, 0, 5,
+	//  s  t  u   v  w  x
+	6, 8, 8, 13, 9, 0,
+	//  y   z
+	10, 0,
+}
+
+func SameIUPACNuc(a, b byte) bool {
+	if (a >= 'A') && (a <= 'Z') {
+		a |= 32
+	}
+	if (b >= 'A') && (b <= 'Z') {
+		b |= 32
+	}
+
+	if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
+		return (_iupac[a-'a'] & _iupac[b-'a']) > 0
+	}
+	return a == b
+}
@@ -2,12 +2,10 @@ package obiclean

 import (
 	"fmt"
-	"os"
 	"sort"

 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
-	"github.com/schollz/progressbar/v3"
 	log "github.com/sirupsen/logrus"
 )

@@ -21,13 +19,13 @@ func commonPrefix(a, b *obiseq.BioSequence) int {
 	as := a.Sequence()
 	bs := b.Sequence()

-	for i < l && as[i] == bs[i] {
+	for i < l && obiseq.SameIUPACNuc(as[i], bs[i]) {
 		i++
 	}

-	if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
-		log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
-	}
+	// if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
+	// 	log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
+	// }

 	return i
 }
@@ -44,29 +42,97 @@ func commonSuffix(a, b *obiseq.BioSequence) int {
 	bs := b.Sequence()

 	l := 0
-	for i >= 0 && j >= 0 && as[i] == bs[j] {
+	for i >= 0 && j >= 0 && obiseq.SameIUPACNuc(as[i], bs[j]) {
 		i--
 		j--
 		l++
 	}

-	if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
-		log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
-	}
-	// obilog.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
+	// if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
+	// 	log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
+	// }
+	// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])

 	return l
 }

+// oneDifference return true if s1 and s2 differ by exactly 1 operation
+// (substitution, insertion or deletion)
+func oneDifference(s1, s2 []byte) bool {
+	l1 := len(s1)
+	l2 := len(s2)
+
+	// Case 1: same lengths → test substitution
+	if l1 == l2 {
+		diff := 0
+		for i := 0; i < l1; i++ {
+			if s1[i] != s2[i] {
+				diff++
+				if diff > 1 {
+					return false
+				}
+			}
+		}
+		return diff == 1
+	}
+
+	// Case 2: difference of 1 character → insertion/deletion
+	if l1 == l2+1 {
+		for i := 0; i < l1; i++ {
+			if string(s1[:i])+string(s1[i+1:]) == string(s2) {
+				return true
+			}
+		}
+		return false
+	}
+	if l2 == l1+1 {
+		for i := 0; i < l2; i++ {
+			if string(s2[:i])+string(s2[i+1:]) == string(s1) {
+				return true
+			}
+		}
+		return false
+	}
+
+	// Case 3: difference > 1 character
+	return false
+}
+
+func GetChimera(sequence *obiseq.BioSequence) map[string]string {
+	annotation := sequence.Annotations()
+	iobistatus, ok := annotation["chimera"]
+	var chimera map[string]string
+	var err error
+
+	if ok {
+		switch iobistatus := iobistatus.(type) {
+		case map[string]string:
+			chimera = iobistatus
+		case map[string]interface{}:
+			chimera = make(map[string]string)
+			for k, v := range iobistatus {
+				chimera[k], err = obiutils.InterfaceToString(v)
+				if err != nil {
+					log.Panicf("chimera attribute of sequence %s must be castable to a map[string]string", sequence.Id())
+				}
+			}
+		}
+	} else {
+		chimera = make(map[string]string)
+		annotation["chimera"] = chimera
+	}
+
+	return chimera
+}
+
+// add the suffix/prefix comparisons to detect chimeras (handle IUPAC codes)
 func AnnotateChimera(samples map[string]*[]*seqPCR) {

 	w := func(sample string, seqs *[]*seqPCR) {
 		ls := len(*seqs)
-		cp := make([]int, ls)
-		cs := make([]int, ls)
-
 		pcrs := make([]*seqPCR, 0, ls)

+		// select only sequences without edges (head sequences)
 		for _, s := range *seqs {
 			if len(s.Edges) == 0 {
 				pcrs = append(pcrs, s)
@@ -75,66 +141,63 @@ func AnnotateChimera(samples map[string]*[]*seqPCR) {

 		lp := len(pcrs)

+		// sort by increasing weight (like increasing abundance)
 		sort.Slice(pcrs, func(i, j int) bool {
 			return pcrs[i].Weight < pcrs[j].Weight
 		})

 		for i, s := range pcrs {
-			for j := i + 1; j < lp; j++ {
-				s2 := pcrs[j]
-				cp[j] = commonPrefix(s.Sequence, s2.Sequence)
-				cs[j] = commonSuffix(s.Sequence, s2.Sequence)
-			}
+			seqRef := s.Sequence
+			L := seqRef.Len()

-			var cm map[string]string
-			var err error
+			maxLeft, maxRight := 0, 0
+			var nameLeft, nameRight string

-			chimera, ok := s.Sequence.GetAttribute("chimera")
+			// looking for potential parents
+			for j := 0; j < lp; j++ {
+				if j == i {
+					continue
+				}
+				seqParent := pcrs[j].Sequence

-			if !ok {
-				cm = map[string]string{}
-			} else {
-				cm, err = obiutils.InterfaceToStringMap(chimera)
-				if err != nil {
-					log.Fatalf("type of chimera not map[string]string: %T (%v)",
-						chimera, err)
+				// Check abundance: parent must be more abundant
+				if pcrs[j].Weight <= s.Weight {
+					continue
+				}
+
+				// Check edit distance (skip if only one diff, supposed to never happen)
+				if oneDifference(seqRef.Sequence(), seqParent.Sequence()) {
+					continue
+				}
+
+				// Common prefix
+				left := commonPrefix(seqRef, seqParent)
+				if left > maxLeft {
+					maxLeft = left
+					nameLeft = seqParent.Id()
+				}
+
+				// Common suffix
+				right := commonSuffix(seqRef, seqParent)
+				if right > maxRight {
+					maxRight = right
+					nameRight = seqParent.Id()
 				}
 			}

-			ls := s.Sequence.Len()
+			// Select parents with longuest prefix/suffix
+			// Condition prefix+suffix covers the sequence and sequence not include into parent
+			if maxLeft >= L-maxRight && maxLeft > 0 && maxRight < L {

-			for k := i + 1; k < lp; k++ {
-				for l := i + 1; l < lp; l++ {
-					if k != l && cp[k]+cs[l] == ls {
-						cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
-							pcrs[k].Sequence.Id(),
-							pcrs[l].Sequence.Id(),
-							cp[k])
-					}
-				}
-			}
-
-			if len(cm) > 0 {
-				s.Sequence.SetAttribute("chimera", cm)
+				chimeraMap := GetChimera(s.Sequence)
+				// overlap sequence
+				overlap := seqRef.Sequence()[L-maxRight : maxLeft]
+				chimeraMap[sample] = fmt.Sprintf("{%s}/{%s}@(%s)(%d)(%d)(%d)", nameLeft, nameRight, overlap, L-maxRight, maxLeft, len(overlap))
 			}
 		}
-
 	}

-	pbopt := make([]progressbar.Option, 0, 5)
-	pbopt = append(pbopt,
-		progressbar.OptionSetWriter(os.Stderr),
-		progressbar.OptionSetWidth(15),
-		progressbar.OptionShowIts(),
-		progressbar.OptionSetPredictTime(true),
-		progressbar.OptionSetDescription("[Chimera detection]"),
-	)
-
-	bar := progressbar.NewOptions(len(samples), pbopt...)
-
 	for sn, sqs := range samples {
 		w(sn, sqs)
-		bar.Add(1)
 	}
-
 }
@@ -265,17 +265,18 @@ func CLIWriteCSVToStdout(matrix *MatrixData) {

 	osamples := samples.Members()
 	sort.Strings(osamples)
-
 	columns := make([]string, 0, len(osamples)+len(matrix.attributeList))
 	columns = append(columns, matrix.attributeList...)
 	columns = append(columns, osamples...)

+	header := slices.Clone(columns)
+
 	csvwriter.Write(columns)
 	nattribs := len(matrix.attributeList)

 	for k, data := range matrix.matrix {
 		attrs := matrix.attributes[k]
-		for i, kk := range osamples {
+		for i, kk := range header {
 			if i < nattribs {
 				if v, ok := attrs[kk]; ok {
 					vs, err := obiutils.InterfaceToString(v)