mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add an experimental chimera detection...
This commit is contained in:
@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
|
||||
var _Commit = "8671285"
|
||||
var _Commit = "51b3e83"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
129
pkg/obitools/obiclean/chimera.go
Normal file
129
pkg/obitools/obiclean/chimera.go
Normal file
@ -0,0 +1,129 @@
|
||||
package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func commonPrefix(a, b *obiseq.BioSequence) int {
|
||||
i := 0
|
||||
l := min(a.Len(), b.Len())
|
||||
|
||||
if l == 0 {
|
||||
return 0
|
||||
}
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
for i < l && as[i] == bs[i] {
|
||||
i++
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
func commonSuffix(a, b *obiseq.BioSequence) int {
|
||||
i := a.Len() - 1
|
||||
j := b.Len() - 1
|
||||
|
||||
if i < 0 || j < 0 {
|
||||
return -1
|
||||
}
|
||||
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
for i >= 0 && j >= 0 && as[i] == bs[j] {
|
||||
i--
|
||||
j--
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
||||
}
|
||||
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
||||
|
||||
return i + 1
|
||||
}
|
||||
|
||||
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
||||
|
||||
w := func(sample string, seqs *[]*seqPCR) {
|
||||
ls := len(*seqs)
|
||||
cp := make([]int, ls)
|
||||
cs := make([]int, ls)
|
||||
|
||||
pcrs := make([]*seqPCR, 0, ls)
|
||||
|
||||
for _, s := range *seqs {
|
||||
if len(s.Edges) == 0 {
|
||||
pcrs = append(pcrs, s)
|
||||
}
|
||||
}
|
||||
|
||||
lp := len(pcrs)
|
||||
|
||||
sort.Slice(pcrs, func(i, j int) bool {
|
||||
return pcrs[i].Weight < pcrs[j].Weight
|
||||
})
|
||||
|
||||
for i, s := range pcrs {
|
||||
for j := i + 1; j < lp; j++ {
|
||||
s2 := pcrs[j]
|
||||
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
|
||||
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
|
||||
}
|
||||
|
||||
var cm map[string]string
|
||||
var err error
|
||||
|
||||
chimera, ok := s.Sequence.GetAttribute("chimera")
|
||||
|
||||
if !ok {
|
||||
cm = map[string]string{}
|
||||
} else {
|
||||
cm, err = obiutils.InterfaceToStringMap(chimera)
|
||||
if err != nil {
|
||||
log.Fatalf("type of chimera not map[string]string: %T (%v)",
|
||||
chimera, err)
|
||||
}
|
||||
}
|
||||
|
||||
for k := i + 1; k < lp; k++ {
|
||||
for l := i + 1; l < lp; l++ {
|
||||
if k != l &&
|
||||
cs[k] >= 0 &&
|
||||
obiutils.Abs(cp[k]-cs[l]) == 0 &&
|
||||
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[:cp[k]]) !=
|
||||
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[:cp[k]]) &&
|
||||
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[cp[k]:]) !=
|
||||
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[cp[k]:]) {
|
||||
|
||||
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
|
||||
pcrs[k].Sequence.Id(),
|
||||
pcrs[l].Sequence.Id(),
|
||||
cp[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(cm) > 0 {
|
||||
s.Sequence.SetAttribute("chimera", cm)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for sn, sqs := range samples {
|
||||
w(sn, sqs)
|
||||
}
|
||||
|
||||
}
|
@ -2,6 +2,7 @@ package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
@ -19,6 +20,7 @@ type seqPCR struct {
|
||||
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||
SonCount int
|
||||
AddedSons int
|
||||
IsHead bool
|
||||
Edges []Edge
|
||||
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
||||
}
|
||||
@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
Sequence: s,
|
||||
SonCount: 0,
|
||||
AddedSons: 0,
|
||||
IsHead: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -112,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool {
|
||||
return ishead
|
||||
}
|
||||
|
||||
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
|
||||
descriptor := obiseq.MakeStatsOnDescription(tag)
|
||||
predicat := func(sequence *obiseq.BioSequence) bool {
|
||||
|
||||
chimera, ok := sequence.GetStringMap("chimera")
|
||||
if !ok || len(chimera) == 0 {
|
||||
return true
|
||||
}
|
||||
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
|
||||
|
||||
for s := range samples {
|
||||
if _, ok := chimera[s]; !ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
return predicat
|
||||
}
|
||||
|
||||
func HeadCount(sequence *obiseq.BioSequence) int {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
@ -235,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) {
|
||||
}
|
||||
|
||||
func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
iobistatus, ok := annotation["obiclean_status"]
|
||||
var obistatus map[string]string
|
||||
@ -244,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
case map[string]string:
|
||||
obistatus = iobistatus
|
||||
case map[string]interface{}:
|
||||
obistatus = make(map[string]string)
|
||||
for k, v := range iobistatus {
|
||||
obistatus[k] = fmt.Sprint(v)
|
||||
obistatus, err = obiutils.InterfaceToStringMap(obistatus)
|
||||
if err != nil {
|
||||
log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -354,6 +380,10 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
}
|
||||
}
|
||||
|
||||
if DetectChimera() {
|
||||
AnnotateChimera(samples)
|
||||
}
|
||||
|
||||
if SaveGraphToFiles() {
|
||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||
}
|
||||
@ -366,7 +396,9 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
iter := annotateOBIClean(source, db)
|
||||
|
||||
if OnlyHead() {
|
||||
iter = iter.FilterOn(IsHead, obidefault.BatchSize())
|
||||
iter = iter.FilterOn(IsHead,
|
||||
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
|
||||
obidefault.BatchSize())
|
||||
}
|
||||
|
||||
if MinSampleCount() > 1 {
|
||||
|
@ -17,6 +17,7 @@ var _onlyHead = false
|
||||
var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _saveRatio = "__@@NOSAVE@@__"
|
||||
var _minSample = 1
|
||||
var _detectChimera = false
|
||||
|
||||
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||
@ -59,6 +60,10 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_minSample, "min-sample-count", _minSample,
|
||||
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
|
||||
)
|
||||
|
||||
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
|
||||
options.Description("Detect chimera sequences."),
|
||||
)
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@ -120,3 +125,8 @@ func RatioTableFilename() string {
|
||||
func MinSampleCount() int {
|
||||
return _minSample
|
||||
}
|
||||
|
||||
// It returns true if chimera detection is enabled
|
||||
func DetectChimera() bool {
|
||||
return _detectChimera
|
||||
}
|
||||
|
Reference in New Issue
Block a user