mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add an experimental chimera detection...
This commit is contained in:
@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// commited
|
// commited
|
||||||
|
|
||||||
var _Commit = "8671285"
|
var _Commit = "51b3e83"
|
||||||
var _Version = "Release 4.2.0"
|
var _Version = "Release 4.2.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
129
pkg/obitools/obiclean/chimera.go
Normal file
129
pkg/obitools/obiclean/chimera.go
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
package obiclean
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func commonPrefix(a, b *obiseq.BioSequence) int {
|
||||||
|
i := 0
|
||||||
|
l := min(a.Len(), b.Len())
|
||||||
|
|
||||||
|
if l == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
as := a.Sequence()
|
||||||
|
bs := b.Sequence()
|
||||||
|
|
||||||
|
for i < l && as[i] == bs[i] {
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
|
||||||
|
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
||||||
|
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
||||||
|
}
|
||||||
|
|
||||||
|
return i
|
||||||
|
}
|
||||||
|
|
||||||
|
func commonSuffix(a, b *obiseq.BioSequence) int {
|
||||||
|
i := a.Len() - 1
|
||||||
|
j := b.Len() - 1
|
||||||
|
|
||||||
|
if i < 0 || j < 0 {
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
|
||||||
|
as := a.Sequence()
|
||||||
|
bs := b.Sequence()
|
||||||
|
|
||||||
|
for i >= 0 && j >= 0 && as[i] == bs[j] {
|
||||||
|
i--
|
||||||
|
j--
|
||||||
|
}
|
||||||
|
|
||||||
|
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
||||||
|
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
||||||
|
}
|
||||||
|
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
||||||
|
|
||||||
|
return i + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
||||||
|
|
||||||
|
w := func(sample string, seqs *[]*seqPCR) {
|
||||||
|
ls := len(*seqs)
|
||||||
|
cp := make([]int, ls)
|
||||||
|
cs := make([]int, ls)
|
||||||
|
|
||||||
|
pcrs := make([]*seqPCR, 0, ls)
|
||||||
|
|
||||||
|
for _, s := range *seqs {
|
||||||
|
if len(s.Edges) == 0 {
|
||||||
|
pcrs = append(pcrs, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lp := len(pcrs)
|
||||||
|
|
||||||
|
sort.Slice(pcrs, func(i, j int) bool {
|
||||||
|
return pcrs[i].Weight < pcrs[j].Weight
|
||||||
|
})
|
||||||
|
|
||||||
|
for i, s := range pcrs {
|
||||||
|
for j := i + 1; j < lp; j++ {
|
||||||
|
s2 := pcrs[j]
|
||||||
|
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
|
||||||
|
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
|
||||||
|
}
|
||||||
|
|
||||||
|
var cm map[string]string
|
||||||
|
var err error
|
||||||
|
|
||||||
|
chimera, ok := s.Sequence.GetAttribute("chimera")
|
||||||
|
|
||||||
|
if !ok {
|
||||||
|
cm = map[string]string{}
|
||||||
|
} else {
|
||||||
|
cm, err = obiutils.InterfaceToStringMap(chimera)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("type of chimera not map[string]string: %T (%v)",
|
||||||
|
chimera, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for k := i + 1; k < lp; k++ {
|
||||||
|
for l := i + 1; l < lp; l++ {
|
||||||
|
if k != l &&
|
||||||
|
cs[k] >= 0 &&
|
||||||
|
obiutils.Abs(cp[k]-cs[l]) == 0 &&
|
||||||
|
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[:cp[k]]) !=
|
||||||
|
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[:cp[k]]) &&
|
||||||
|
obiutils.UnsafeString(pcrs[k].Sequence.Sequence()[cp[k]:]) !=
|
||||||
|
obiutils.UnsafeString(pcrs[l].Sequence.Sequence()[cp[k]:]) {
|
||||||
|
|
||||||
|
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
|
||||||
|
pcrs[k].Sequence.Id(),
|
||||||
|
pcrs[l].Sequence.Id(),
|
||||||
|
cp[k])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(cm) > 0 {
|
||||||
|
s.Sequence.SetAttribute("chimera", cm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
for sn, sqs := range samples {
|
||||||
|
w(sn, sqs)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -2,6 +2,7 @@ package obiclean
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"maps"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
@ -19,6 +20,7 @@ type seqPCR struct {
|
|||||||
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||||
SonCount int
|
SonCount int
|
||||||
AddedSons int
|
AddedSons int
|
||||||
|
IsHead bool
|
||||||
Edges []Edge
|
Edges []Edge
|
||||||
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
||||||
}
|
}
|
||||||
@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
|||||||
Sequence: s,
|
Sequence: s,
|
||||||
SonCount: 0,
|
SonCount: 0,
|
||||||
AddedSons: 0,
|
AddedSons: 0,
|
||||||
|
IsHead: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -112,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool {
|
|||||||
return ishead
|
return ishead
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
|
||||||
|
descriptor := obiseq.MakeStatsOnDescription(tag)
|
||||||
|
predicat := func(sequence *obiseq.BioSequence) bool {
|
||||||
|
|
||||||
|
chimera, ok := sequence.GetStringMap("chimera")
|
||||||
|
if !ok || len(chimera) == 0 {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
|
||||||
|
|
||||||
|
for s := range samples {
|
||||||
|
if _, ok := chimera[s]; !ok {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return predicat
|
||||||
|
}
|
||||||
|
|
||||||
func HeadCount(sequence *obiseq.BioSequence) int {
|
func HeadCount(sequence *obiseq.BioSequence) int {
|
||||||
var err error
|
var err error
|
||||||
annotation := sequence.Annotations()
|
annotation := sequence.Annotations()
|
||||||
@ -235,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func Status(sequence *obiseq.BioSequence) map[string]string {
|
func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||||
|
var err error
|
||||||
annotation := sequence.Annotations()
|
annotation := sequence.Annotations()
|
||||||
iobistatus, ok := annotation["obiclean_status"]
|
iobistatus, ok := annotation["obiclean_status"]
|
||||||
var obistatus map[string]string
|
var obistatus map[string]string
|
||||||
@ -244,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
|
|||||||
case map[string]string:
|
case map[string]string:
|
||||||
obistatus = iobistatus
|
obistatus = iobistatus
|
||||||
case map[string]interface{}:
|
case map[string]interface{}:
|
||||||
obistatus = make(map[string]string)
|
obistatus, err = obiutils.InterfaceToStringMap(obistatus)
|
||||||
for k, v := range iobistatus {
|
if err != nil {
|
||||||
obistatus[k] = fmt.Sprint(v)
|
log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -354,6 +380,10 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if DetectChimera() {
|
||||||
|
AnnotateChimera(samples)
|
||||||
|
}
|
||||||
|
|
||||||
if SaveGraphToFiles() {
|
if SaveGraphToFiles() {
|
||||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||||
}
|
}
|
||||||
@ -366,7 +396,9 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
iter := annotateOBIClean(source, db)
|
iter := annotateOBIClean(source, db)
|
||||||
|
|
||||||
if OnlyHead() {
|
if OnlyHead() {
|
||||||
iter = iter.FilterOn(IsHead, obidefault.BatchSize())
|
iter = iter.FilterOn(IsHead,
|
||||||
|
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
|
||||||
|
obidefault.BatchSize())
|
||||||
}
|
}
|
||||||
|
|
||||||
if MinSampleCount() > 1 {
|
if MinSampleCount() > 1 {
|
||||||
|
@ -17,6 +17,7 @@ var _onlyHead = false
|
|||||||
var _saveGraph = "__@@NOSAVE@@__"
|
var _saveGraph = "__@@NOSAVE@@__"
|
||||||
var _saveRatio = "__@@NOSAVE@@__"
|
var _saveRatio = "__@@NOSAVE@@__"
|
||||||
var _minSample = 1
|
var _minSample = 1
|
||||||
|
var _detectChimera = false
|
||||||
|
|
||||||
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||||
@ -59,6 +60,10 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.IntVar(&_minSample, "min-sample-count", _minSample,
|
options.IntVar(&_minSample, "min-sample-count", _minSample,
|
||||||
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
|
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
|
||||||
|
options.Description("Detect chimera sequences."),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
@ -120,3 +125,8 @@ func RatioTableFilename() string {
|
|||||||
func MinSampleCount() int {
|
func MinSampleCount() int {
|
||||||
return _minSample
|
return _minSample
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// It returns true if chimera detection is enabled
|
||||||
|
func DetectChimera() bool {
|
||||||
|
return _detectChimera
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user