Add an experimental chimera detection...

This commit is contained in:
Eric Coissac
2025-02-24 15:02:49 +01:00
parent 51b3e83d32
commit db284f1d44
4 changed files with 176 additions and 5 deletions

View File

@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// commited
var _Commit = "8671285"
var _Commit = "51b3e83"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -0,0 +1,129 @@
package obiclean
import (
"fmt"
"sort"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
func commonPrefix(a, b *obiseq.BioSequence) int {
i := 0
l := min(a.Len(), b.Len())
if l == 0 {
return 0
}
as := a.Sequence()
bs := b.Sequence()
for i < l && as[i] == bs[i] {
i++
}
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
}
return i
}
func commonSuffix(a, b *obiseq.BioSequence) int {
i := a.Len() - 1
j := b.Len() - 1
if i < 0 || j < 0 {
return -1
}
as := a.Sequence()
bs := b.Sequence()
for i >= 0 && j >= 0 && as[i] == bs[j] {
i--
j--
}
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
}
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
return i + 1
}
func AnnotateChimera(samples map[string]*[]*seqPCR) {
w := func(sample string, seqs *[]*seqPCR) {
ls := len(*seqs)
cp := make([]int, ls)
cs := make([]int, ls)
pcrs := make([]*seqPCR, 0, ls)
for _, s := range *seqs {
if len(s.Edges) == 0 {
pcrs = append(pcrs, s)
}
}
lp := len(pcrs)
sort.Slice(pcrs, func(i, j int) bool {
return pcrs[i].Weight < pcrs[j].Weight
})
for i, s := range pcrs {
for j := i + 1; j < lp; j++ {
s2 := pcrs[j]
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
}
var cm map[string]string
var err error
chimera, ok := s.Sequence.GetAttribute("chimera")
if !ok {
cm = map[string]string{}
} else {
cm, err = obiutils.InterfaceToStringMap(chimera)
if err != nil {
log.Fatalf("type of chimera not map[string]string: %T (%v)",
chimera, err)
}
}
for k := i + 1; k < lp; k++ {
for l := i + 1; l < lp; l++ {
if k != l &&
cs[k] >= 0 &&
obiutils.Abs(cp[k]-cs[l]) == 0 &&
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[:cp[k]]) !=
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[:cp[k]]) &&
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[cp[k]:]) !=
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[cp[k]:]) {
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
pcrs[k].Sequence.Id(),
pcrs[l].Sequence.Id(),
cp[k])
}
}
}
if len(cm) > 0 {
s.Sequence.SetAttribute("chimera", cm)
}
}
}
for sn, sqs := range samples {
w(sn, sqs)
}
}

View File

@ -2,6 +2,7 @@ package obiclean
import (
"fmt"
"maps"
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
@ -19,6 +20,7 @@ type seqPCR struct {
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
SonCount int
AddedSons int
IsHead bool
Edges []Edge
Cluster map[int]bool // used as the set of head sequences associated to that sequence
}
@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
Sequence: s,
SonCount: 0,
AddedSons: 0,
IsHead: false,
})
}
}
@ -112,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool {
return ishead
}
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
descriptor := obiseq.MakeStatsOnDescription(tag)
predicat := func(sequence *obiseq.BioSequence) bool {
chimera, ok := sequence.GetStringMap("chimera")
if !ok || len(chimera) == 0 {
return true
}
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
for s := range samples {
if _, ok := chimera[s]; !ok {
return true
}
}
return false
}
return predicat
}
func HeadCount(sequence *obiseq.BioSequence) int {
var err error
annotation := sequence.Annotations()
@ -235,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) {
}
func Status(sequence *obiseq.BioSequence) map[string]string {
var err error
annotation := sequence.Annotations()
iobistatus, ok := annotation["obiclean_status"]
var obistatus map[string]string
@ -244,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
case map[string]string:
obistatus = iobistatus
case map[string]interface{}:
obistatus = make(map[string]string)
for k, v := range iobistatus {
obistatus[k] = fmt.Sprint(v)
obistatus, err = obiutils.InterfaceToStringMap(obistatus)
if err != nil {
log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
}
}
} else {
@ -354,6 +380,10 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
}
}
if DetectChimera() {
AnnotateChimera(samples)
}
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
@ -366,7 +396,9 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
iter := annotateOBIClean(source, db)
if OnlyHead() {
iter = iter.FilterOn(IsHead, obidefault.BatchSize())
iter = iter.FilterOn(IsHead,
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
obidefault.BatchSize())
}
if MinSampleCount() > 1 {

View File

@ -17,6 +17,7 @@ var _onlyHead = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
var _minSample = 1
var _detectChimera = false
func ObicleanOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
@ -59,6 +60,10 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_minSample, "min-sample-count", _minSample,
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
)
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
options.Description("Detect chimera sequences."),
)
}
func OptionSet(options *getoptions.GetOpt) {
@ -120,3 +125,8 @@ func RatioTableFilename() string {
func MinSampleCount() int {
return _minSample
}
// It returns true if chimera detection is enabled
func DetectChimera() bool {
return _detectChimera
}