work on obiclean chimera detection

This commit is contained in:
Eric Coissac
2025-10-20 16:35:19 +02:00
parent 29bf4ce871
commit d17a9520b9
6 changed files with 160 additions and 92 deletions

View File

@@ -4,33 +4,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
) )
var _iupac = [26]byte{
// a b c d e f
1, 14, 2, 13, 0, 0,
// g h i j k l
4, 11, 0, 0, 12, 0,
// m n o p q r
3, 15, 0, 0, 0, 5,
// s t u v w x
6, 8, 8, 13, 9, 0,
// y z
10, 0,
}
func _samenuc(a, b byte) bool {
if (a >= 'A') && (a <= 'Z') {
a |= 32
}
if (b >= 'A') && (b <= 'Z') {
b |= 32
}
if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
return (_iupac[a-'a'] & _iupac[b-'a']) > 0
}
return a == b
}
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices. // FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
// //
// The score is calculated using the following scoring matrix: // The score is calculated using the following scoring matrix:
@@ -165,7 +138,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
default: default:
// We are in the middle of the matrix // We are in the middle of the matrix
Sdiag = _incpath(previous[x]) Sdiag = _incpath(previous[x])
if _samenuc(bA[j-1], bB[i-1]) { if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
Sdiag = _incscore(Sdiag) Sdiag = _incscore(Sdiag)
} }
@@ -265,7 +238,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
Sleft = _notavail Sleft = _notavail
default: default:
Sdiag = _incpath(previous[x]) Sdiag = _incpath(previous[x])
if _samenuc(bA[j-1], bB[i-1]) { if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
Sdiag = _incscore(Sdiag) Sdiag = _incscore(Sdiag)
} }

View File

@@ -1,6 +1,9 @@
package obialign package obialign
import log "github.com/sirupsen/logrus" import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
log "github.com/sirupsen/logrus"
)
// buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix // buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix
// of size width x width. The coordinates are (-1)-indexed, and the linear index // of size width x width. The coordinates are (-1)-indexed, and the linear index
@@ -69,7 +72,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
// Mismatch score = -1 // Mismatch score = -1
// Match score = 0 // Match score = 0
match := -1 match := -1
if _samenuc(pattern[j], sequence[i]) { if obiseq.SameIUPACNuc(pattern[j], sequence[i]) {
match = 0 match = 0
} }
@@ -103,7 +106,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
// Mismatch score = -1 // Mismatch score = -1
// Match score = 0 // Match score = 0
match := -1 match := -1
if _samenuc(pattern[jmax], sequence[i]) { if obiseq.SameIUPACNuc(pattern[jmax], sequence[i]) {
match = 0 match = 0
} }

View File

@@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "d7ed9d3" var _Commit = "29bf4ce"
var _Version = "Release 4.4.0" var _Version = "Release 4.4.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

28
pkg/obiseq/iupac_nog.go Normal file
View File

@@ -0,0 +1,28 @@
package obiseq
var _iupac = [26]byte{
// a b c d e f
1, 14, 2, 13, 0, 0,
// g h i j k l
4, 11, 0, 0, 12, 0,
// m n o p q r
3, 15, 0, 0, 0, 5,
// s t u v w x
6, 8, 8, 13, 9, 0,
// y z
10, 0,
}
func SameIUPACNuc(a, b byte) bool {
if (a >= 'A') && (a <= 'Z') {
a |= 32
}
if (b >= 'A') && (b <= 'Z') {
b |= 32
}
if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
return (_iupac[a-'a'] & _iupac[b-'a']) > 0
}
return a == b
}

View File

@@ -2,12 +2,10 @@ package obiclean
import ( import (
"fmt" "fmt"
"os"
"sort" "sort"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
@@ -21,13 +19,13 @@ func commonPrefix(a, b *obiseq.BioSequence) int {
as := a.Sequence() as := a.Sequence()
bs := b.Sequence() bs := b.Sequence()
for i < l && as[i] == bs[i] { for i < l && obiseq.SameIUPACNuc(as[i], bs[i]) {
i++ i++
} }
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { // if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) // log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
} // }
return i return i
} }
@@ -44,29 +42,97 @@ func commonSuffix(a, b *obiseq.BioSequence) int {
bs := b.Sequence() bs := b.Sequence()
l := 0 l := 0
for i >= 0 && j >= 0 && as[i] == bs[j] { for i >= 0 && j >= 0 && obiseq.SameIUPACNuc(as[i], bs[j]) {
i-- i--
j-- j--
l++ l++
} }
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { // if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) // log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
} // }
// obilog.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) // log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
return l return l
} }
// oneDifference return true if s1 and s2 differ by exactly 1 operation
// (substitution, insertion or deletion)
func oneDifference(s1, s2 []byte) bool {
l1 := len(s1)
l2 := len(s2)
// Case 1: same lengths → test substitution
if l1 == l2 {
diff := 0
for i := 0; i < l1; i++ {
if s1[i] != s2[i] {
diff++
if diff > 1 {
return false
}
}
}
return diff == 1
}
// Case 2: difference of 1 character → insertion/deletion
if l1 == l2+1 {
for i := 0; i < l1; i++ {
if string(s1[:i])+string(s1[i+1:]) == string(s2) {
return true
}
}
return false
}
if l2 == l1+1 {
for i := 0; i < l2; i++ {
if string(s2[:i])+string(s2[i+1:]) == string(s1) {
return true
}
}
return false
}
// Case 3: difference > 1 character
return false
}
func GetChimera(sequence *obiseq.BioSequence) map[string]string {
annotation := sequence.Annotations()
iobistatus, ok := annotation["chimera"]
var chimera map[string]string
var err error
if ok {
switch iobistatus := iobistatus.(type) {
case map[string]string:
chimera = iobistatus
case map[string]interface{}:
chimera = make(map[string]string)
for k, v := range iobistatus {
chimera[k], err = obiutils.InterfaceToString(v)
if err != nil {
log.Panicf("chimera attribute of sequence %s must be castable to a map[string]string", sequence.Id())
}
}
}
} else {
chimera = make(map[string]string)
annotation["chimera"] = chimera
}
return chimera
}
// add the suffix/prefix comparisons to detect chimeras (handle IUPAC codes)
func AnnotateChimera(samples map[string]*[]*seqPCR) { func AnnotateChimera(samples map[string]*[]*seqPCR) {
w := func(sample string, seqs *[]*seqPCR) { w := func(sample string, seqs *[]*seqPCR) {
ls := len(*seqs) ls := len(*seqs)
cp := make([]int, ls)
cs := make([]int, ls)
pcrs := make([]*seqPCR, 0, ls) pcrs := make([]*seqPCR, 0, ls)
// select only sequences without edges (head sequences)
for _, s := range *seqs { for _, s := range *seqs {
if len(s.Edges) == 0 { if len(s.Edges) == 0 {
pcrs = append(pcrs, s) pcrs = append(pcrs, s)
@@ -75,66 +141,63 @@ func AnnotateChimera(samples map[string]*[]*seqPCR) {
lp := len(pcrs) lp := len(pcrs)
// sort by increasing weight (like increasing abundance)
sort.Slice(pcrs, func(i, j int) bool { sort.Slice(pcrs, func(i, j int) bool {
return pcrs[i].Weight < pcrs[j].Weight return pcrs[i].Weight < pcrs[j].Weight
}) })
for i, s := range pcrs { for i, s := range pcrs {
for j := i + 1; j < lp; j++ { seqRef := s.Sequence
s2 := pcrs[j] L := seqRef.Len()
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
}
var cm map[string]string maxLeft, maxRight := 0, 0
var err error var nameLeft, nameRight string
chimera, ok := s.Sequence.GetAttribute("chimera") // looking for potential parents
for j := 0; j < lp; j++ {
if j == i {
continue
}
seqParent := pcrs[j].Sequence
if !ok { // Check abundance: parent must be more abundant
cm = map[string]string{} if pcrs[j].Weight <= s.Weight {
} else { continue
cm, err = obiutils.InterfaceToStringMap(chimera) }
if err != nil {
log.Fatalf("type of chimera not map[string]string: %T (%v)", // Check edit distance (skip if only one diff, supposed to never happen)
chimera, err) if oneDifference(seqRef.Sequence(), seqParent.Sequence()) {
continue
}
// Common prefix
left := commonPrefix(seqRef, seqParent)
if left > maxLeft {
maxLeft = left
nameLeft = seqParent.Id()
}
// Common suffix
right := commonSuffix(seqRef, seqParent)
if right > maxRight {
maxRight = right
nameRight = seqParent.Id()
} }
} }
ls := s.Sequence.Len() // Select parents with longuest prefix/suffix
// Condition prefix+suffix covers the sequence and sequence not include into parent
if maxLeft >= L-maxRight && maxLeft > 0 && maxRight < L {
for k := i + 1; k < lp; k++ { chimeraMap := GetChimera(s.Sequence)
for l := i + 1; l < lp; l++ { // overlap sequence
if k != l && cp[k]+cs[l] == ls { overlap := seqRef.Sequence()[L-maxRight : maxLeft]
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)", chimeraMap[sample] = fmt.Sprintf("{%s}/{%s}@(%s)(%d)(%d)(%d)", nameLeft, nameRight, overlap, L-maxRight, maxLeft, len(overlap))
pcrs[k].Sequence.Id(),
pcrs[l].Sequence.Id(),
cp[k])
}
}
}
if len(cm) > 0 {
s.Sequence.SetAttribute("chimera", cm)
} }
} }
} }
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Chimera detection]"),
)
bar := progressbar.NewOptions(len(samples), pbopt...)
for sn, sqs := range samples { for sn, sqs := range samples {
w(sn, sqs) w(sn, sqs)
bar.Add(1)
} }
} }

View File

@@ -265,17 +265,18 @@ func CLIWriteCSVToStdout(matrix *MatrixData) {
osamples := samples.Members() osamples := samples.Members()
sort.Strings(osamples) sort.Strings(osamples)
columns := make([]string, 0, len(osamples)+len(matrix.attributeList)) columns := make([]string, 0, len(osamples)+len(matrix.attributeList))
columns = append(columns, matrix.attributeList...) columns = append(columns, matrix.attributeList...)
columns = append(columns, osamples...) columns = append(columns, osamples...)
header := slices.Clone(columns)
csvwriter.Write(columns) csvwriter.Write(columns)
nattribs := len(matrix.attributeList) nattribs := len(matrix.attributeList)
for k, data := range matrix.matrix { for k, data := range matrix.matrix {
attrs := matrix.attributes[k] attrs := matrix.attributes[k]
for i, kk := range osamples { for i, kk := range header {
if i < nattribs { if i < nattribs {
if v, ok := attrs[kk]; ok { if v, ok := attrs[kk]; ok {
vs, err := obiutils.InterfaceToString(v) vs, err := obiutils.InterfaceToString(v)