From db284f1d446b6d4df1e48445d494e963fd4aca12 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 24 Feb 2025 15:02:49 +0100 Subject: [PATCH] Add an experimental chimera detection... --- pkg/obioptions/version.go | 2 +- pkg/obitools/obiclean/chimera.go | 129 ++++++++++++++++++++++++++++++ pkg/obitools/obiclean/obiclean.go | 40 ++++++++- pkg/obitools/obiclean/options.go | 10 +++ 4 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 pkg/obitools/obiclean/chimera.go diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index a925e23..01ba7b8 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "8671285" +var _Commit = "51b3e83" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obiclean/chimera.go b/pkg/obitools/obiclean/chimera.go new file mode 100644 index 0000000..7aacbc7 --- /dev/null +++ b/pkg/obitools/obiclean/chimera.go @@ -0,0 +1,129 @@ +package obiclean + +import ( + "fmt" + "sort" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" + log "github.com/sirupsen/logrus" +) + +func commonPrefix(a, b *obiseq.BioSequence) int { + i := 0 + l := min(a.Len(), b.Len()) + + if l == 0 { + return 0 + } + as := a.Sequence() + bs := b.Sequence() + + for i < l && as[i] == bs[i] { + i++ + } + + if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { + log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) + } + + return i +} + +func commonSuffix(a, b *obiseq.BioSequence) int { + i := a.Len() - 1 + j := b.Len() - 1 + + if i < 0 || j < 0 { + return -1 + } + + as := a.Sequence() + bs := b.Sequence() + + for i >= 0 && j >= 0 && as[i] == bs[j] { + i-- + j-- + } + + if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { + log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) + } + // log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) + + return i + 1 +} + +func AnnotateChimera(samples map[string]*[]*seqPCR) { + + w := func(sample string, seqs *[]*seqPCR) { + ls := len(*seqs) + cp := make([]int, ls) + cs := make([]int, ls) + + pcrs := make([]*seqPCR, 0, ls) + + for _, s := range *seqs { + if len(s.Edges) == 0 { + pcrs = append(pcrs, s) + } + } + + lp := len(pcrs) + + sort.Slice(pcrs, func(i, j int) bool { + return pcrs[i].Weight < pcrs[j].Weight + }) + + for i, s := range pcrs { + for j := i + 1; j < lp; j++ { + s2 := pcrs[j] + cp[j] = commonPrefix(s.Sequence, s2.Sequence) + cs[j] = commonSuffix(s.Sequence, s2.Sequence) + } + + var cm map[string]string + var err error + + chimera, ok := s.Sequence.GetAttribute("chimera") + + if !ok { + cm = map[string]string{} + } else { + cm, err = obiutils.InterfaceToStringMap(chimera) + if err != nil { + log.Fatalf("type of chimera not map[string]string: %T (%v)", + chimera, err) + } + } + + for k := i + 1; k < lp; k++ { + for l := i + 1; l < lp; l++ { + if k != l && + cs[k] >= 0 && + obiutils.Abs(cp[k]-cs[l]) == 0 && + obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[:cp[k]]) != + obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[:cp[k]]) && + obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[cp[k]:]) != + obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[cp[k]:]) { + + cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)", + pcrs[k].Sequence.Id(), + pcrs[l].Sequence.Id(), + cp[k]) + } + } + } + + if len(cm) > 0 { + s.Sequence.SetAttribute("chimera", cm) + } + } + + } + + for sn, sqs := range samples { + w(sn, sqs) + } + +} diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go index bbad674..55dda02 100644 --- a/pkg/obitools/obiclean/obiclean.go +++ b/pkg/obitools/obiclean/obiclean.go @@ -2,6 +2,7 @@ package obiclean import ( "fmt" + "maps" "os" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" @@ -19,6 +20,7 @@ type seqPCR struct { Sequence *obiseq.BioSequence // pointer to the corresponding sequence SonCount int AddedSons int + IsHead bool Edges []Edge Cluster map[int]bool // used as the set of head sequences associated to that sequence } @@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice, Sequence: s, SonCount: 0, AddedSons: 0, + IsHead: false, }) } } @@ -112,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool { return ishead } +func NotAlwaysChimera(tag string) obiseq.SequencePredicate { + descriptor := obiseq.MakeStatsOnDescription(tag) + predicat := func(sequence *obiseq.BioSequence) bool { + + chimera, ok := sequence.GetStringMap("chimera") + if !ok || len(chimera) == 0 { + return true + } + samples := maps.Keys(sequence.StatsOn(descriptor, "NA")) + + for s := range samples { + if _, ok := chimera[s]; !ok { + return true + } + } + + return false + } + + return predicat +} + func HeadCount(sequence *obiseq.BioSequence) int { var err error annotation := sequence.Annotations() @@ -235,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) { } func Status(sequence *obiseq.BioSequence) map[string]string { + var err error annotation := sequence.Annotations() iobistatus, ok := annotation["obiclean_status"] var obistatus map[string]string @@ -244,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string { case map[string]string: obistatus = iobistatus case map[string]interface{}: - obistatus = make(map[string]string) - for k, v := range iobistatus { - obistatus[k] = fmt.Sprint(v) + obistatus, err = obiutils.InterfaceToStringMap(obistatus) + if err != nil { + log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id()) } } } else { @@ -354,6 +380,10 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { } } + if DetectChimera() { + AnnotateChimera(samples) + } + if SaveGraphToFiles() { SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate()) } @@ -366,7 +396,9 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence { iter := annotateOBIClean(source, db) if OnlyHead() { - iter = iter.FilterOn(IsHead, obidefault.BatchSize()) + iter = iter.FilterOn(IsHead, + obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()), + obidefault.BatchSize()) } if MinSampleCount() > 1 { diff --git a/pkg/obitools/obiclean/options.go b/pkg/obitools/obiclean/options.go index d87aa6c..f95bb64 100644 --- a/pkg/obitools/obiclean/options.go +++ b/pkg/obitools/obiclean/options.go @@ -17,6 +17,7 @@ var _onlyHead = false var _saveGraph = "__@@NOSAVE@@__" var _saveRatio = "__@@NOSAVE@@__" var _minSample = 1 +var _detectChimera = false func ObicleanOptionSet(options *getoptions.GetOpt) { options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, @@ -59,6 +60,10 @@ func ObicleanOptionSet(options *getoptions.GetOpt) { options.IntVar(&_minSample, "min-sample-count", _minSample, options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."), ) + + options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera, + options.Description("Detect chimera sequences."), + ) } func OptionSet(options *getoptions.GetOpt) { @@ -120,3 +125,8 @@ func RatioTableFilename() string { func MinSampleCount() int { return _minSample } + +// It returns true if chimera detection is enabled +func DetectChimera() bool { + return _detectChimera +}