mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds the command obimultiplex
This commit is contained in:
284
pkg/obingslibrary/match.go
Normal file
284
pkg/obingslibrary/match.go
Normal file
@ -0,0 +1,284 @@
|
||||
package obingslibrary
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiapat"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
type DemultiplexMatch struct {
|
||||
ForwardMatch string
|
||||
ReverseMatch string
|
||||
ForwardTag string
|
||||
ReverseTag string
|
||||
BarcodeStart int
|
||||
BarcodeEnd int
|
||||
ForwardMismatches int
|
||||
ReverseMismatches int
|
||||
IsDirect bool
|
||||
Pcr *PCR
|
||||
ForwardPrimer string
|
||||
ReversePrimer string
|
||||
Error error
|
||||
}
|
||||
|
||||
func (library *NGSLibrary) Compile(maxError int) error {
|
||||
for primers, marker := range *library {
|
||||
err := marker.Compile(primers.Forward,
|
||||
primers.Reverse,
|
||||
maxError)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (library *NGSLibrary) Match(sequence obiseq.BioSequence) *DemultiplexMatch {
|
||||
for primers, marker := range *library {
|
||||
m := marker.Match(sequence)
|
||||
if m != nil {
|
||||
m.ForwardPrimer = strings.ToLower(primers.Forward)
|
||||
m.ReversePrimer = strings.ToLower(primers.Reverse)
|
||||
return m
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (library *NGSLibrary) ExtractBarcode(sequence obiseq.BioSequence, inplace bool) (obiseq.BioSequence, error) {
|
||||
match := library.Match(sequence)
|
||||
return match.ExtractBarcode(sequence, inplace)
|
||||
}
|
||||
|
||||
func (marker *Marker) Compile(forward, reverse string, maxError int) error {
|
||||
var err error
|
||||
marker.forward, err = obiapat.MakeApatPattern(forward,
|
||||
maxError)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
marker.reverse, err = obiapat.MakeApatPattern(reverse,
|
||||
maxError)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
marker.cforward, err = marker.forward.ReverseComplement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
marker.creverse, err = marker.reverse.ReverseComplement()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
marker.taglength = 0
|
||||
for tags := range marker.samples {
|
||||
lf := len(tags.Forward)
|
||||
lr := len(tags.Reverse)
|
||||
|
||||
l := lf
|
||||
if lf == 0 {
|
||||
l = lr
|
||||
}
|
||||
|
||||
if lr != 0 && l != lr {
|
||||
return fmt.Errorf("forward tag (%s) and reverse tag (%s) do not have the same length",
|
||||
tags.Forward, tags.Reverse)
|
||||
}
|
||||
|
||||
if marker.taglength != 0 && l != marker.taglength {
|
||||
return fmt.Errorf("tag pair (%s,%s) is not compatible with a tag length of %d",
|
||||
tags.Forward, tags.Reverse, marker.taglength)
|
||||
} else {
|
||||
marker.taglength = l
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (marker *Marker) Match(sequence obiseq.BioSequence) *DemultiplexMatch {
|
||||
aseq, _ := obiapat.MakeApatSequence(sequence, false)
|
||||
match := marker.forward.FindAllIndex(aseq, marker.taglength)
|
||||
|
||||
if len(match) > 0 {
|
||||
sseq := sequence.String()
|
||||
direct := sseq[match[0][0]:match[0][1]]
|
||||
ftag := sseq[(match[0][0] - marker.taglength):match[0][0]]
|
||||
|
||||
m := DemultiplexMatch{
|
||||
ForwardMatch: direct,
|
||||
ForwardTag: ftag,
|
||||
BarcodeStart: match[0][1],
|
||||
ForwardMismatches: match[0][2],
|
||||
IsDirect: true,
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
rmatch := marker.creverse.FindAllIndex(aseq, match[0][1])
|
||||
|
||||
if len(rmatch) > 0 {
|
||||
|
||||
// extracting primer matches
|
||||
reverse, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false)
|
||||
defer reverse.Recycle()
|
||||
reverse = reverse.ReverseComplement(true)
|
||||
rtag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false)
|
||||
defer rtag.Recycle()
|
||||
srtag := ""
|
||||
|
||||
if err != nil {
|
||||
rtag = obiseq.NilBioSequence
|
||||
} else {
|
||||
rtag.ReverseComplement(true)
|
||||
srtag = strings.ToLower(rtag.String())
|
||||
}
|
||||
|
||||
m.ReverseMatch = strings.ToLower(reverse.String())
|
||||
m.ReverseMismatches = rmatch[0][2]
|
||||
m.BarcodeEnd = rmatch[0][0]
|
||||
m.ReverseTag = srtag
|
||||
|
||||
sample, ok := marker.samples[TagPair{ftag, srtag}]
|
||||
|
||||
if ok {
|
||||
m.Pcr = sample
|
||||
}
|
||||
|
||||
return &m
|
||||
|
||||
}
|
||||
|
||||
err := fmt.Errorf("cannot locates reverse priming site")
|
||||
m.Error = err
|
||||
|
||||
return &m
|
||||
}
|
||||
|
||||
match = marker.reverse.FindAllIndex(aseq, marker.taglength)
|
||||
|
||||
if len(match) > 0 {
|
||||
sseq := sequence.String()
|
||||
|
||||
reverse := strings.ToLower(sseq[match[0][0]:match[0][1]])
|
||||
rtag := strings.ToLower(sseq[(match[0][0] - marker.taglength):match[0][0]])
|
||||
|
||||
m := DemultiplexMatch{
|
||||
ReverseMatch: reverse,
|
||||
ReverseTag: rtag,
|
||||
BarcodeStart: match[0][1],
|
||||
ReverseMismatches: match[0][2],
|
||||
IsDirect: false,
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
rmatch := marker.cforward.FindAllIndex(aseq, match[0][1])
|
||||
|
||||
if len(rmatch) > 0 {
|
||||
|
||||
direct, _ := sequence.Subsequence(rmatch[0][0], rmatch[0][1], false)
|
||||
defer direct.Recycle()
|
||||
direct = direct.ReverseComplement(true)
|
||||
|
||||
ftag, err := sequence.Subsequence(rmatch[0][1], rmatch[0][1]+marker.taglength, false)
|
||||
defer ftag.Recycle()
|
||||
sftag := ""
|
||||
if err != nil {
|
||||
ftag = obiseq.NilBioSequence
|
||||
|
||||
} else {
|
||||
ftag = ftag.ReverseComplement(true)
|
||||
sftag = ftag.String()
|
||||
}
|
||||
|
||||
m.ForwardMatch = direct.String()
|
||||
m.ForwardTag = sftag
|
||||
m.ReverseMismatches = rmatch[0][2]
|
||||
m.BarcodeEnd = rmatch[0][0]
|
||||
|
||||
sample, ok := marker.samples[TagPair{sftag, rtag}]
|
||||
|
||||
if ok {
|
||||
m.Pcr = sample
|
||||
}
|
||||
|
||||
return &m
|
||||
}
|
||||
|
||||
err := fmt.Errorf("cannot locates forward priming site")
|
||||
m.Error = err
|
||||
return &m
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (match *DemultiplexMatch) ExtractBarcode(sequence obiseq.BioSequence, inplace bool) (obiseq.BioSequence, error) {
|
||||
if !inplace {
|
||||
sequence = sequence.Copy()
|
||||
}
|
||||
|
||||
if match == nil {
|
||||
annot := sequence.Annotations()
|
||||
annot["demultiplex_error"] = "cannot match any primer pair"
|
||||
return sequence, errors.New("cannot match any primer pair")
|
||||
}
|
||||
|
||||
if match.ForwardMatch != "" && match.ReverseMatch != "" {
|
||||
var err error
|
||||
sequence, err = sequence.Subsequence(match.BarcodeStart, match.BarcodeEnd, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("cannot extract sub sequence %d..%d %v", match.BarcodeStart, match.BarcodeEnd, *match)
|
||||
}
|
||||
}
|
||||
|
||||
if !match.IsDirect {
|
||||
sequence.ReverseComplement(true)
|
||||
}
|
||||
|
||||
annot := sequence.Annotations()
|
||||
if annot == nil {
|
||||
log.Fatalf("nil annot %v", sequence)
|
||||
}
|
||||
annot["forward_primer"] = match.ForwardPrimer
|
||||
annot["reverse_primer"] = match.ReversePrimer
|
||||
|
||||
if match.IsDirect {
|
||||
annot["direction"] = "direct"
|
||||
} else {
|
||||
annot["direction"] = "reverse"
|
||||
}
|
||||
|
||||
if match.ForwardMatch != "" {
|
||||
annot["forward_match"] = match.ForwardMatch
|
||||
annot["forward_mismatches"] = match.ForwardMismatches
|
||||
annot["forward_tag"] = match.ForwardTag
|
||||
}
|
||||
|
||||
if match.ReverseMatch != "" {
|
||||
annot["reverse_match"] = match.ReverseMatch
|
||||
annot["reverse_mismatches"] = match.ReverseMismatches
|
||||
annot["reverse_tag"] = match.ReverseTag
|
||||
}
|
||||
|
||||
if match.Error != nil {
|
||||
annot["demultiplex_error"] = fmt.Sprintf("%v", match.Error)
|
||||
}
|
||||
|
||||
if match.Pcr != nil {
|
||||
annot["sample"] = match.Pcr.Sample
|
||||
annot["experiment"] = match.Pcr.Experiment
|
||||
for k, val := range match.Pcr.Annotations {
|
||||
annot[k] = val
|
||||
}
|
||||
}
|
||||
|
||||
return sequence, match.Error
|
||||
}
|
65
pkg/obingslibrary/ngslibrary.go
Normal file
65
pkg/obingslibrary/ngslibrary.go
Normal file
@ -0,0 +1,65 @@
|
||||
package obingslibrary
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiapat"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
type PrimerPair struct {
|
||||
Forward string
|
||||
Reverse string
|
||||
}
|
||||
|
||||
type TagPair struct {
|
||||
Forward string
|
||||
Reverse string
|
||||
}
|
||||
|
||||
type PCR struct {
|
||||
Experiment string
|
||||
Sample string
|
||||
Partial bool
|
||||
Annotations obiseq.Annotation
|
||||
}
|
||||
|
||||
type Marker struct {
|
||||
forward obiapat.ApatPattern
|
||||
cforward obiapat.ApatPattern
|
||||
reverse obiapat.ApatPattern
|
||||
creverse obiapat.ApatPattern
|
||||
taglength int
|
||||
samples map[TagPair]*PCR
|
||||
}
|
||||
type NGSLibrary map[PrimerPair]*Marker
|
||||
|
||||
func MakeNGSLibrary() NGSLibrary {
|
||||
return make(NGSLibrary, 10)
|
||||
}
|
||||
|
||||
func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) {
|
||||
pair := PrimerPair{forward, reverse}
|
||||
marker, ok := (*library)[pair]
|
||||
|
||||
if ok {
|
||||
return marker, true
|
||||
}
|
||||
|
||||
m := Marker{samples: make(map[TagPair]*PCR, 1000)}
|
||||
(*library)[pair] = &m
|
||||
|
||||
return &m, false
|
||||
}
|
||||
|
||||
func (marker *Marker) GetPCR(forward, reverse string) (*PCR, bool) {
|
||||
pair := TagPair{forward, reverse}
|
||||
pcr, ok := marker.samples[pair]
|
||||
|
||||
if ok {
|
||||
return pcr, ok
|
||||
}
|
||||
|
||||
ipcr := PCR{}
|
||||
marker.samples[pair] = &ipcr
|
||||
|
||||
return &ipcr, false
|
||||
}
|
182
pkg/obingslibrary/worker.go
Normal file
182
pkg/obingslibrary/worker.go
Normal file
@ -0,0 +1,182 @@
|
||||
package obingslibrary
|
||||
|
||||
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
|
||||
type _Options struct {
|
||||
discardErrors bool
|
||||
unidentified string
|
||||
allowedMismatch int
|
||||
withProgressBar bool
|
||||
parallelWorkers int
|
||||
batchSize int
|
||||
bufferSize int
|
||||
}
|
||||
|
||||
// Options stores a set of option usable by the
|
||||
// PCR simulation algotithm.
|
||||
type Options struct {
|
||||
pointer *_Options
|
||||
}
|
||||
|
||||
// WithOption is the standard type for function
|
||||
// declaring options.
|
||||
type WithOption func(Options)
|
||||
|
||||
func OptionDiscardErrors(yes bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.discardErrors = yes
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionUnidentified(filename string) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.unidentified = filename
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionWithProgressBar(yes bool) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.withProgressBar = yes
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func OptionAllowedMismatches(count int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.allowedMismatch = count
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionBufferSize sets the requested channel
|
||||
// buffer size.
|
||||
func OptionBufferSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.bufferSize = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionParallelWorkers sets how many search
|
||||
// jobs will be run in parallel.
|
||||
func OptionParallelWorkers(nworkers int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.parallelWorkers = nworkers
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
// OptionBatchSize sets the requested sequence
|
||||
// batch size.
|
||||
func OptionBatchSize(size int) WithOption {
|
||||
f := WithOption(func(opt Options) {
|
||||
opt.pointer.batchSize = size
|
||||
})
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func (options Options) DiscardErrors() bool {
|
||||
return options.pointer.unidentified == "" || options.pointer.discardErrors
|
||||
}
|
||||
|
||||
func (options Options) Unidentified() string {
|
||||
return options.pointer.unidentified
|
||||
}
|
||||
|
||||
func (options Options) AllowedMismatch() int {
|
||||
return options.pointer.allowedMismatch
|
||||
}
|
||||
|
||||
func (options Options) WithProgressBar() bool {
|
||||
return options.pointer.withProgressBar
|
||||
}
|
||||
|
||||
// BufferSize returns the size of the channel
|
||||
// buffer specified by the options
|
||||
func (options Options) BufferSize() int {
|
||||
return options.pointer.bufferSize
|
||||
}
|
||||
|
||||
// BatchSize returns the size of the
|
||||
// sequence batch used by the PCR algorithm
|
||||
func (options Options) BatchSize() int {
|
||||
return options.pointer.batchSize
|
||||
}
|
||||
|
||||
// ParallelWorkers returns how many search
|
||||
// jobs will be run in parallel.
|
||||
func (options Options) ParallelWorkers() int {
|
||||
return options.pointer.parallelWorkers
|
||||
}
|
||||
|
||||
// MakeOptions buils a new default option set for
|
||||
// the PCR simulation algoithm.
|
||||
func MakeOptions(setters []WithOption) Options {
|
||||
o := _Options{
|
||||
discardErrors: true,
|
||||
unidentified: "",
|
||||
allowedMismatch: 0,
|
||||
withProgressBar: false,
|
||||
parallelWorkers: 4,
|
||||
batchSize: 1000,
|
||||
bufferSize: 100,
|
||||
}
|
||||
|
||||
opt := Options{&o}
|
||||
|
||||
for _, set := range setters {
|
||||
set(opt)
|
||||
}
|
||||
|
||||
return opt
|
||||
}
|
||||
|
||||
func _ExtractBarcodeSlice(ngslibrary NGSLibrary,
|
||||
sequences obiseq.BioSequenceSlice,
|
||||
options Options) obiseq.BioSequenceSlice {
|
||||
newSlice := make(obiseq.BioSequenceSlice,0,len(sequences))
|
||||
|
||||
for _, seq := range sequences {
|
||||
s, err := ngslibrary.ExtractBarcode(seq,true)
|
||||
if err==nil || ! options.pointer.discardErrors {
|
||||
newSlice = append(newSlice, s)
|
||||
}
|
||||
}
|
||||
|
||||
return newSlice
|
||||
}
|
||||
|
||||
func ExtractBarcodeSlice(ngslibrary NGSLibrary,
|
||||
sequences obiseq.BioSequenceSlice,
|
||||
options ...WithOption) obiseq.BioSequenceSlice {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
|
||||
ngslibrary.Compile(opt.AllowedMismatch())
|
||||
|
||||
return _ExtractBarcodeSlice(ngslibrary, sequences, opt)
|
||||
}
|
||||
|
||||
func ExtractBarcodeSliceWorker(ngslibrary NGSLibrary,
|
||||
options ...WithOption) obiseq.SeqSliceWorker {
|
||||
|
||||
opt := MakeOptions(options)
|
||||
|
||||
ngslibrary.Compile(opt.AllowedMismatch())
|
||||
|
||||
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
|
||||
return _ExtractBarcodeSlice(ngslibrary, sequences, opt)
|
||||
}
|
||||
|
||||
return worker
|
||||
}
|
||||
|
Reference in New Issue
Block a user