2024-06-04 16:49:12 +02:00
|
|
|
package obingslibrary
|
|
|
|
|
|
|
|
import (
|
2024-06-04 21:46:09 +02:00
|
|
|
"fmt"
|
2024-06-04 16:49:12 +02:00
|
|
|
"sort"
|
|
|
|
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
2024-06-04 21:46:09 +02:00
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
2024-06-04 16:49:12 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
type PrimerMatch struct {
|
|
|
|
Begin int
|
|
|
|
End int
|
|
|
|
Mismatches int
|
|
|
|
Marker int
|
|
|
|
Forward bool
|
|
|
|
}
|
|
|
|
|
|
|
|
type TagMatcher func(
|
|
|
|
sequence *obiseq.BioSequence,
|
|
|
|
begin, end int, forward bool) (TagPair, error)
|
|
|
|
|
2024-06-04 21:46:09 +02:00
|
|
|
func (library *NGSLibrary) TagExtractorFixedLength(
|
|
|
|
sequence *obiseq.BioSequence,
|
|
|
|
annotations obiseq.Annotation,
|
|
|
|
primers PrimerPair,
|
|
|
|
begin, end int,
|
|
|
|
forward bool) *TagPair {
|
|
|
|
|
|
|
|
marker, ok := library.Markers[primers]
|
|
|
|
|
|
|
|
forward_tag := ""
|
|
|
|
reverse_tag := ""
|
|
|
|
|
|
|
|
if !ok {
|
|
|
|
log.Fatalf("marker not found : %v", primers)
|
|
|
|
}
|
|
|
|
|
|
|
|
fb := 0
|
|
|
|
|
|
|
|
if forward {
|
|
|
|
annotations["direction"] = "direct"
|
|
|
|
fb = begin - marker.Forward_spacer - marker.Forward_tag_length
|
|
|
|
if fb < 0 {
|
|
|
|
annotations["demultiplex_error"] = "Cannot extract forward tag"
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
forward_tag = sequence.String()[fb:(fb + marker.Forward_tag_length)]
|
|
|
|
|
|
|
|
fb = end + marker.Reverse_spacer
|
|
|
|
if (fb + marker.Reverse_tag_length) > sequence.Len() {
|
|
|
|
annotations["demultiplex_error"] = "Cannot extract reverse tag"
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_tag, err := sequence.Subsequence(fb, fb+marker.Forward_tag_length, false)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
annotations["demultiplex_error"] = "Cannot extract forward tag"
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
reverse_tag = seq_tag.ReverseComplement(true).String()
|
|
|
|
|
|
|
|
} else {
|
|
|
|
annotations["direction"] = "reverse"
|
|
|
|
fb = end + marker.Forward_spacer
|
|
|
|
if (fb + marker.Forward_tag_length) > sequence.Len() {
|
|
|
|
annotations["demultiplex_error"] = "Cannot extract forward tag"
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
seq_tag, err := sequence.Subsequence(fb, fb+marker.Forward_tag_length, false)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
annotations["demultiplex_error"] = "Cannot extract forward tag"
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
forward_tag = seq_tag.ReverseComplement(true).String()
|
|
|
|
|
|
|
|
fb = begin - marker.Reverse_spacer - marker.Reverse_tag_length
|
|
|
|
if fb < 0 {
|
|
|
|
sequence.SetAttribute("demultiplex_error", "Cannot extract reverse tag")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
reverse_tag = sequence.String()[fb:(fb + marker.Reverse_tag_length)]
|
|
|
|
}
|
|
|
|
|
|
|
|
annotations["forward_tag"] = forward_tag
|
|
|
|
annotations["reverse_tag"] = reverse_tag
|
|
|
|
|
|
|
|
return &TagPair{
|
|
|
|
Forward: forward_tag,
|
|
|
|
Reverse: reverse_tag,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (library *NGSLibrary) StrictSampleIdentifier(primerseqs PrimerPair, tags *TagPair, annotations obiseq.Annotation) *PCR {
|
|
|
|
marker := library.Markers[primerseqs]
|
|
|
|
pcr, ok := marker.samples[*tags]
|
|
|
|
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
return pcr
|
|
|
|
}
|
2024-06-04 16:49:12 +02:00
|
|
|
|
|
|
|
func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
|
|
|
i := 1
|
|
|
|
markers := make([]*Marker, len(library.Markers)+1)
|
|
|
|
primerseqs := make([]PrimerPair, len(library.Markers)+1)
|
|
|
|
matches := make([]PrimerMatch, 0, len(library.Markers)+1)
|
|
|
|
aseq, err := obiapat.MakeApatSequence(sequence, false)
|
|
|
|
|
2024-06-04 21:46:09 +02:00
|
|
|
results := obiseq.MakeBioSequenceSlice()
|
|
|
|
|
2024-06-04 16:49:12 +02:00
|
|
|
if err != nil {
|
|
|
|
log.Fatalf("error in building apat sequence : %v\n", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for primers, marker := range library.Markers {
|
|
|
|
markers[i] = marker
|
|
|
|
primerseqs[i] = primers
|
|
|
|
locs := marker.forward.AllMatches(aseq, 0, -1)
|
|
|
|
if len(locs) > 0 {
|
|
|
|
for _, loc := range locs {
|
|
|
|
matches = append(matches, PrimerMatch{
|
|
|
|
Begin: loc[0],
|
|
|
|
End: loc[1],
|
|
|
|
Mismatches: loc[2],
|
|
|
|
Marker: i,
|
|
|
|
Forward: true,
|
|
|
|
})
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
locs = marker.creverse.AllMatches(aseq, locs[0][0]+1, -1)
|
|
|
|
|
|
|
|
if len(locs) > 0 {
|
|
|
|
for _, loc := range locs {
|
|
|
|
matches = append(matches, PrimerMatch{
|
|
|
|
Begin: loc[0],
|
|
|
|
End: loc[1],
|
|
|
|
Mismatches: loc[2],
|
|
|
|
Marker: -i,
|
|
|
|
Forward: true,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
locs = marker.reverse.AllMatches(aseq, 0, -1)
|
|
|
|
if len(locs) > 0 {
|
|
|
|
for _, loc := range locs {
|
|
|
|
matches = append(matches, PrimerMatch{
|
|
|
|
Begin: loc[0],
|
|
|
|
End: loc[1],
|
|
|
|
Mismatches: loc[2],
|
|
|
|
Marker: i,
|
|
|
|
Forward: false,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
locs = marker.cforward.AllMatches(aseq, locs[0][0]+1, -1)
|
|
|
|
|
|
|
|
if len(locs) > 0 {
|
|
|
|
for _, loc := range locs {
|
|
|
|
matches = append(matches, PrimerMatch{
|
|
|
|
Begin: loc[0],
|
|
|
|
End: loc[1],
|
|
|
|
Mismatches: loc[2],
|
|
|
|
Marker: -i,
|
|
|
|
Forward: false,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i++
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(matches) > 0 {
|
|
|
|
sort.Slice(matches, func(i, j int) bool {
|
|
|
|
return matches[i].Begin < matches[j].Begin
|
|
|
|
})
|
|
|
|
|
|
|
|
state := 0
|
|
|
|
var from PrimerMatch
|
|
|
|
q := 0
|
|
|
|
for _, match := range matches {
|
|
|
|
|
|
|
|
switch state {
|
|
|
|
case 0:
|
|
|
|
if match.Marker > 0 {
|
|
|
|
from = match
|
|
|
|
state = 1
|
|
|
|
}
|
|
|
|
|
|
|
|
case 1:
|
|
|
|
if match.Marker == -from.Marker && match.Forward == from.Forward {
|
2024-06-04 21:46:09 +02:00
|
|
|
|
|
|
|
annotations := obiseq.GetAnnotation()
|
|
|
|
annotations["forward_primer"] = primerseqs[from.Marker].Forward
|
|
|
|
annotations["reverse_primer"] = primerseqs[from.Marker].Reverse
|
|
|
|
|
|
|
|
if from.Forward {
|
|
|
|
|
|
|
|
annotations["forward_match"] = sequence.String()[from.Begin:from.End]
|
|
|
|
sseq, err := sequence.Subsequence(match.Begin, match.End, false)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
annotations["multiplex_error"] = "Cannot extract reverse match"
|
|
|
|
}
|
|
|
|
annotations["reverse_match"] = sseq.ReverseComplement(true).String()
|
|
|
|
|
|
|
|
annotations["forward_error"] = from.Mismatches
|
|
|
|
annotations["reverse_error"] = match.Mismatches
|
|
|
|
} else {
|
|
|
|
annotations["reverse_match"] = sequence.String()[from.Begin:from.End]
|
|
|
|
sseq, err := sequence.Subsequence(match.Begin, match.End, false)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
annotations["multiplex_error"] = "Cannot extract forward match"
|
|
|
|
}
|
|
|
|
|
|
|
|
annotations["forward_match"] = sseq.ReverseComplement(true).String()
|
|
|
|
|
|
|
|
annotations["reverse_error"] = from.Mismatches
|
|
|
|
annotations["forward_error"] = match.Mismatches
|
|
|
|
}
|
|
|
|
|
|
|
|
tags := library.TagExtractorFixedLength(sequence, annotations, primerseqs[from.Marker], from.Begin, match.End, from.Forward)
|
|
|
|
|
|
|
|
barcode, err := sequence.Subsequence(from.End, match.Begin, false)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("%s [%s] : Cannot extract barcode %d : %v", sequence.Id(), sequence.Source(), q, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if !match.Forward {
|
|
|
|
barcode = barcode.ReverseComplement(true)
|
|
|
|
}
|
|
|
|
|
|
|
|
if tags != nil {
|
|
|
|
pcr := library.StrictSampleIdentifier(primerseqs[from.Marker], tags, annotations)
|
|
|
|
|
|
|
|
if pcr == nil {
|
|
|
|
annotations["demultiplex_error"] = "Cannot associate sample to the tag pair"
|
|
|
|
} else {
|
|
|
|
annotations["sample"] = pcr.Sample
|
|
|
|
annotations["experiment"] = pcr.Experiment
|
|
|
|
for k, v := range pcr.Annotations {
|
|
|
|
annotations[k] = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
barcode.AnnotationsLock()
|
|
|
|
obiutils.MustFillMap(barcode.Annotations(), annotations)
|
|
|
|
barcode.AnnotationsUnlock()
|
|
|
|
|
|
|
|
results = append(results, barcode)
|
2024-06-04 16:49:12 +02:00
|
|
|
state = 0
|
2024-06-04 21:46:09 +02:00
|
|
|
q++
|
2024-06-04 16:49:12 +02:00
|
|
|
} else if match.Marker > 0 {
|
|
|
|
log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker)
|
|
|
|
from = match
|
|
|
|
} else {
|
|
|
|
log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker)
|
|
|
|
state = 0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-06-04 21:46:09 +02:00
|
|
|
if len(results) == 0 {
|
|
|
|
sequence.SetAttribute("demultiplex_error", "No barcode identified")
|
|
|
|
results = append(results, sequence)
|
|
|
|
} else {
|
|
|
|
for i, result := range results {
|
|
|
|
result.SetAttribute("amplicon_rank", fmt.Sprintf("%d/%d", i+1, len(results)))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return results, nil
|
2024-06-04 16:49:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (library *NGSLibrary) ExtractMultiBarcodeSliceWorker(options ...WithOption) obiseq.SeqSliceWorker {
|
|
|
|
opt := MakeOptions(options)
|
|
|
|
|
|
|
|
library.Compile(opt.AllowedMismatch(), opt.AllowsIndel())
|
|
|
|
|
|
|
|
worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
|
|
|
return library.ExtractMultiBarcode(sequence)
|
|
|
|
}
|
|
|
|
|
|
|
|
return obiseq.SeqToSliceWorker(worker, true)
|
|
|
|
}
|