Adds a reader for NGS filter files and change some API for the apat library

This commit is contained in:
2022-01-18 13:09:32 +01:00
parent 6571296bb2
commit 4551df08b1
13 changed files with 301 additions and 96 deletions

View File

@ -0,0 +1,34 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obipcr"
)
func main() {
// f, err := os.Create("cpu.pprof")
// if err != nil {
// log.Fatal(err)
// }
// pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile()
// ftrace, err := os.Create("cpu.trace")
// if err != nil {
// log.Fatal(err)
// }
// trace.Start(ftrace)
// defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obipcr.OptionSet)
_, args, _ := optionParser(os.Args)
sequences, _ := obiconvert.ReadBioSequencesBatch(args...)
amplicons, _ := obipcr.PCR(sequences)
obiconvert.WriteBioSequencesBatch(amplicons, true)
}

View File

@ -1,7 +1,9 @@
package main package main
import ( import (
"log"
"os" "os"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
@ -10,6 +12,7 @@ import (
func main() { func main() {
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
// f, err := os.Create("cpu.pprof") // f, err := os.Create("cpu.pprof")
// if err != nil { // if err != nil {
// log.Fatal(err) // log.Fatal(err)
@ -17,12 +20,13 @@ func main() {
// pprof.StartCPUProfile(f) // pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile() // defer pprof.StopCPUProfile()
// ftrace, err := os.Create("cpu.trace") // go tool trace cpu.trace
// if err != nil { ftrace, err := os.Create("cpu.trace")
// log.Fatal(err) if err != nil {
// } log.Fatal(err)
// trace.Start(ftrace) }
// defer trace.Stop() trace.Start(ftrace)
defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obipcr.OptionSet) optionParser := obioptions.GenerateOptionParser(obipcr.OptionSet)

View File

@ -6,7 +6,8 @@ import (
"os" "os"
"runtime/trace" "runtime/trace"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiapat"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
@ -41,27 +42,17 @@ func main() {
// } // }
A := []byte("ccgcctccttagaacaggctcctctagaaaaccatagtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg") A := []byte("ccgcctccttagaacaggctcctctagaaaaccatagtgggatatctaaagaaggcggagatagaaagagcggttcagcaggaatgccgagatggacggcgtgtgacg")
B := []byte("cgccaccaccgagatctacactctttccctacacgacgctcttccgatctccgcctccttagaacaggctcctctagaaaagcatagtggggtatctaaaggaggcgg") // B := []byte("cgccaccaccgagatctacactctttccctacacgacgctcttccgatctccgcctccttagaacaggctcctctagaaaagcatagtggggtatctaaaggaggcgg")
sA := obiseq.MakeBioSequence("A", A, "") sA := obiseq.MakeBioSequence("A", A, "")
sB := obiseq.MakeBioSequence("B", B, "") // sB := obiseq.MakeBioSequence("B", B, "")
fmt.Println(string(sA.Sequence())) pat, _ := obiapat.MakeApatPattern("TCCTTCCAACAGGCTCCTC", 3)
fmt.Println(sA.Qualities()) as, _ := obiapat.MakeApatSequence(sA, false)
fmt.Println(string(sB.Sequence())) fmt.Println(pat.FindAllIndex(as))
fmt.Println(sB.Qualities())
score, path := obialign.PELeftAlign(sA, sB, 2, obialign.NilPEAlignArena) file, _ := os.Open("sample/wolf_diet_ngsfilter.txt")
fmt.Printf("Score : %d Path : %v\n", score, path) xxx, _ := obiformats.ReadNGSFilter(file)
score, path = obialign.PERightAlign(sA, sB, 2, obialign.NilPEAlignArena)
fmt.Printf("Score : %d Path : %v\n", score, path)
fmt.Println(string(sA.Sequence())) fmt.Println(xxx)
sA.ReverseComplement(true)
fmt.Println(string(sA.Sequence()))
fmt.Println(string(sA.Id()))
sA.Reset()
fmt.Println(sA.Length())
fmt.Println(sA.String())
} }

View File

@ -202,8 +202,6 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, limits ...int) (l
C.int32_t(begin), C.int32_t(begin),
C.int32_t(length+C.MAX_PAT_LEN))) C.int32_t(length+C.MAX_PAT_LEN)))
//log.Printf("match count : %d\n", nhits)
if nhits == 0 { if nhits == 0 {
return nil return nil
} }

View File

@ -1,6 +1,8 @@
package obiapat package obiapat
import ( import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
) )
@ -14,6 +16,10 @@ type _Options struct {
bufferSize int bufferSize int
batchSize int batchSize int
parallelWorkers int parallelWorkers int
forward ApatPattern
cfwd ApatPattern
reverse ApatPattern
crev ApatPattern
} }
// Options stores a set of option usable by the // Options stores a set of option usable by the
@ -90,6 +96,10 @@ func MakeOptions(setters []WithOption) Options {
parallelWorkers: 4, parallelWorkers: 4,
batchSize: 100, batchSize: 100,
bufferSize: 100, bufferSize: 100,
forward: NilApatPattern,
cfwd: NilApatPattern,
reverse: NilApatPattern,
crev: NilApatPattern,
} }
opt := Options{&o} opt := Options{&o}
@ -126,19 +136,41 @@ func OptionMaxLength(maxLength int) WithOption {
// OptionForwardError sets the number of // OptionForwardError sets the number of
// error allowed when matching the forward // error allowed when matching the forward
// primer. // primer.
func OptionForwardError(max int) WithOption { func OptionForwardPrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
var err error
opt.pointer.forward, err = MakeApatPattern(primer, max)
if err != nil {
log.Fatalf("error : %v\n", err)
}
opt.pointer.cfwd, err = opt.pointer.forward.ReverseComplement()
if err != nil {
log.Fatalf("error : %v\n", err)
}
opt.pointer.forwardError = max opt.pointer.forwardError = max
}) })
return f return f
} }
// OptionReverseError sets the number of // OptionForwardError sets the number of
// error allowed when matching the reverse // error allowed when matching the forward
// primer. // primer.
func OptionReverseError(max int) WithOption { func OptionReversePrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
var err error
opt.pointer.reverse, err = MakeApatPattern(primer, max)
if err != nil {
log.Fatalf("error : %v\n", err)
}
opt.pointer.crev, err = opt.pointer.reverse.ReverseComplement()
if err != nil {
log.Fatalf("error : %v\n", err)
}
opt.pointer.reverseError = max opt.pointer.reverseError = max
}) })
@ -185,14 +217,37 @@ func OptionBatchSize(size int) WithOption {
return f return f
} }
func _Pcr(seq ApatSequence, sequence obiseq.BioSequence, func (options Options) Free() {
forward, cfwd, reverse, crev ApatPattern, if options.pointer.forward.pointer != nil {
options.pointer.forward.Free()
}
if options.pointer.cfwd.pointer != nil {
options.pointer.cfwd.Free()
}
if options.pointer.reverse.pointer != nil {
options.pointer.reverse.Free()
}
if options.pointer.crev.pointer != nil {
options.pointer.crev.Free()
}
}
func _Pcr(seq ApatSequence,
sequence obiseq.BioSequence,
opt Options) obiseq.BioSequenceSlice { opt Options) obiseq.BioSequenceSlice {
results := make(obiseq.BioSequenceSlice, 0, 10) results := make(obiseq.BioSequenceSlice, 0, 10)
forward := opt.pointer.forward
cfwd := opt.pointer.cfwd
reverse := opt.pointer.reverse
crev := opt.pointer.crev
forwardMatches := forward.FindAllIndex(seq) forwardMatches := forward.FindAllIndex(seq)
if forwardMatches != nil { if len(forwardMatches) > 0 {
begin := forwardMatches[0][0] begin := forwardMatches[0][0]
length := seq.Length() - begin length := seq.Length() - begin
@ -262,7 +317,6 @@ func _Pcr(seq ApatSequence, sequence obiseq.BioSequence,
} }
forwardMatches = reverse.FindAllIndex(seq) forwardMatches = reverse.FindAllIndex(seq)
if forwardMatches != nil { if forwardMatches != nil {
begin := forwardMatches[0][0] begin := forwardMatches[0][0]
@ -340,28 +394,15 @@ func _Pcr(seq ApatSequence, sequence obiseq.BioSequence,
// obiseq.BioSequence instance. PCR parameters are // obiseq.BioSequence instance. PCR parameters are
// specified using the corresponding Option functions // specified using the corresponding Option functions
// defined for the PCR algorithm. // defined for the PCR algorithm.
func PCR(sequence obiseq.BioSequence, func PCR(sequence obiseq.BioSequence, options ...WithOption) obiseq.BioSequenceSlice {
forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice {
opt := MakeOptions(options) opt := MakeOptions(options)
defer opt.Free()
seq, _ := MakeApatSequence(sequence, opt.Circular()) seq, _ := MakeApatSequence(sequence, opt.Circular())
defer seq.Free()
fwd, _ := MakeApatPattern(forward, opt.ForwardError()) results := _Pcr(seq, sequence, opt)
rev, _ := MakeApatPattern(reverse, opt.ReverseError())
cfwd, _ := fwd.ReverseComplement()
crev, _ := rev.ReverseComplement()
results := _Pcr(seq, sequence,
fwd, cfwd, rev, crev,
opt)
seq.Free()
fwd.Free()
rev.Free()
cfwd.Free()
crev.Free()
return results return results
} }
@ -372,32 +413,24 @@ func PCR(sequence obiseq.BioSequence,
// specified using the corresponding Option functions // specified using the corresponding Option functions
// defined for the PCR algorithm. // defined for the PCR algorithm.
func PCRSlice(sequences obiseq.BioSequenceSlice, func PCRSlice(sequences obiseq.BioSequenceSlice,
forward, reverse string, options ...WithOption) obiseq.BioSequenceSlice { options ...WithOption) obiseq.BioSequenceSlice {
results := make(obiseq.BioSequenceSlice, 0, len(sequences)) results := make(obiseq.BioSequenceSlice, 0, len(sequences))
opt := MakeOptions(options) opt := MakeOptions(options)
defer opt.Free()
fwd, _ := MakeApatPattern(forward, opt.ForwardError())
rev, _ := MakeApatPattern(reverse, opt.ReverseError())
cfwd, _ := fwd.ReverseComplement()
crev, _ := rev.ReverseComplement()
if len(sequences) > 0 { if len(sequences) > 0 {
seq, _ := MakeApatSequence(sequences[0], opt.Circular()) seq, _ := MakeApatSequence(sequences[0], opt.Circular())
amplicons := _Pcr(seq, sequences[0], amplicons := _Pcr(seq, sequences[0], opt)
fwd, cfwd, rev, crev,
opt)
if len(amplicons) > 0 { if len(amplicons) > 0 {
results = append(results, amplicons...) results = append(results, amplicons...)
} }
for _, sequence := range sequences[1:] { for _, sequence := range sequences[1:] {
seq, _ := MakeApatSequence(sequence, opt.Circular(), seq) seq, _ = MakeApatSequence(sequence, opt.Circular(), seq)
amplicons = _Pcr(seq, sequence, amplicons = _Pcr(seq, sequence, opt)
fwd, cfwd, rev, crev,
opt)
if len(amplicons) > 0 { if len(amplicons) > 0 {
results = append(results, amplicons...) results = append(results, amplicons...)
} }
@ -406,21 +439,15 @@ func PCRSlice(sequences obiseq.BioSequenceSlice,
seq.Free() seq.Free()
} }
fwd.Free()
rev.Free()
cfwd.Free()
crev.Free()
return results return results
} }
// PCRSliceWorker is a worker function builder which produce // PCRSliceWorker is a worker function builder which produce
// job function usable by the obiseq.MakeISliceWorker function. // job function usable by the obiseq.MakeISliceWorker function.
func PCRSliceWorker(forward, reverse string, func PCRSliceWorker(options ...WithOption) obiseq.SeqSliceWorker {
options ...WithOption) obiseq.SeqSliceWorker {
worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { worker := func(sequences obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
return PCRSlice(sequences, forward, reverse, options...) return PCRSlice(sequences, options...)
} }
return worker return worker

View File

@ -184,8 +184,8 @@ func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceB
newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize()) newIter := obiseq.MakeIBioSequenceBatch(opt.BufferSize())
// newIter.Add(opt.ParallelWorkers()) nworkers := opt.ParallelWorkers()
newIter.Add(2) newIter.Add(nworkers)
go func() { go func() {
newIter.Wait() newIter.Wait()
@ -196,7 +196,7 @@ func ReadEMBLBatch(reader io.Reader, options ...WithOption) obiseq.IBioSequenceB
}() }()
// for j := 0; j < opt.ParallelWorkers(); j++ { // for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < 2; j++ { for j := 0; j < nworkers; j++ {
go _ParseEmblFile(entry_channel, newIter) go _ParseEmblFile(entry_channel, newIter)
} }

View File

@ -173,16 +173,11 @@ func __is_false__(text []byte) bool {
bytes.Equal(text, __FALSE__) bytes.Equal(text, __FALSE__)
} }
func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) { func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
definition := []byte(sequence.Definition())
annotations := sequence.Annotations()
// all_matches := __obi_header_pattern__.FindAllSubmatchIndex(definition, -1)
definition := []byte(text)
d := definition d := definition
//for m := __obi_header_key_pattern__.FindIndex(definition); len(m) > 0; {
//fmt.Println(string(definition[0:20]), __match__key__(definition))
for m := __match__key__(definition); len(m) > 0; { for m := __match__key__(definition); len(m) > 0; {
var bvalue []byte var bvalue []byte
var value interface{} var value interface{}
@ -263,7 +258,16 @@ func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
m = __match__key__(d) m = __match__key__(d)
} }
sequence.SetDefinition(string(bytes.TrimSpace(d))) return string(bytes.TrimSpace(d))
}
func ParseFastSeqOBIHeader(sequence obiseq.BioSequence) {
annotations := sequence.Annotations()
definition := ParseOBIFeatures(sequence.Definition(),
annotations)
sequence.SetDefinition(definition)
} }
func FormatFastSeqOBIHeader(sequence obiseq.BioSequence) string { func FormatFastSeqOBIHeader(sequence obiseq.BioSequence) string {

View File

@ -120,10 +120,10 @@ func WriteFastaBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
} }
log.Println("Start of the fasta file writing") log.Println("Start of the fasta file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ { for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split()) go ff(iterator.Split())
} }
go ff(iterator)
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunck, 100)

View File

@ -122,10 +122,10 @@ func WriteFastqBatch(iterator obiseq.IBioSequenceBatch, file io.Writer, options
} }
log.Println("Start of the fastq file writing") log.Println("Start of the fastq file writing")
go ff(iterator)
for i := 0; i < nwriters-1; i++ { for i := 0; i < nwriters-1; i++ {
go ff(iterator.Split()) go ff(iterator.Split())
} }
go ff(iterator)
next_to_send := 0 next_to_send := 0
received := make(map[int]FileChunck, 100) received := make(map[int]FileChunck, 100)

View File

@ -0,0 +1,133 @@
package obiformats
import (
"bufio"
"fmt"
"io"
"strings"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
type PrimerPair struct {
Forward string
Reverse string
}
type TagPair struct {
Forward string
Reverse string
}
type PCR struct {
Experiment string
Sample string
Partial bool
Annotations obiseq.Annotation
}
type PCRs map[TagPair]PCR
type NGSFilter map[PrimerPair]PCRs
func _readLines(reader io.Reader) []string {
r := bufio.NewReader(reader)
bytes := []byte{}
lines := []string{}
for {
line, isPrefix, err := r.ReadLine()
if err != nil {
break
}
bytes = append(bytes, line...)
if !isPrefix {
str := strings.TrimSpace(string(bytes))
if len(str) > 0 {
lines = append(lines, str)
bytes = []byte{}
}
}
}
if len(bytes) > 0 {
lines = append(lines, string(bytes))
}
return lines
}
func _parseMainNGSFilterTags(text string) TagPair {
tags := strings.Split(text, ":")
if len(tags) == 1 {
return TagPair{tags[0], tags[0]}
}
if tags[0] == "-" {
tags[0] = ""
}
if tags[1] == "-" {
tags[1] = ""
}
return TagPair{tags[0], tags[1]}
}
func _parseMainNGSFilter(text string) (PrimerPair, TagPair, string, string, bool) {
fields := strings.Fields(text)
tags := _parseMainNGSFilterTags(fields[2])
partial := fields[5] == "T" || fields[5] == "t"
return PrimerPair{fields[3], fields[4]},
tags,
fields[0],
fields[1],
partial
}
func ReadNGSFilter(reader io.Reader) (NGSFilter, error) {
ngsfilter := make(NGSFilter, 10)
lines := _readLines(reader)
for _, line := range lines {
line = strings.TrimSpace(line)
if strings.HasPrefix(line, "#") || len(line) == 0 {
continue
}
split := strings.SplitN(line, "@", 2)
primers, tags, experiment, sample, partial := _parseMainNGSFilter(split[0])
newPCR := PCR{
Experiment: experiment,
Sample: sample,
Partial: partial,
Annotations: nil,
}
if len(split) > 1 && len(split[1]) > 0 {
newPCR.Annotations = obiseq.GetAnnotation()
ParseOBIFeatures(split[1], newPCR.Annotations)
}
samples, ok := ngsfilter[primers]
if ok {
pcr, ok := samples[tags]
if ok {
return nil, fmt.Errorf("pair of tags %v used for samples %s in %s and %s in %s",
tags, sample, experiment, pcr.Sample, pcr.Experiment)
}
samples[tags] = newPCR
} else {
ngsfilter[primers] = make(PCRs, 1000)
ngsfilter[primers][tags] = newPCR
}
}
return ngsfilter, nil
}

View File

@ -56,19 +56,26 @@ func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch,
file io.Writer, file io.Writer,
options ...WithOption) (obiseq.IBioSequenceBatch, error) { options ...WithOption) (obiseq.IBioSequenceBatch, error) {
var newIter obiseq.IBioSequenceBatch iterator = iterator.Rebatch(1000)
var err error
ok := iterator.Next() ok := iterator.Next()
if ok { if ok {
iterator.PushBack()
batch := iterator.Get() batch := iterator.Get()
iterator.PushBack()
var newIter obiseq.IBioSequenceBatch
var err error
if len(batch.Slice()) > 0 {
if batch.Slice()[0].HasQualities() { if batch.Slice()[0].HasQualities() {
newIter, err = WriteFastqBatch(iterator, file, options...) newIter, err = WriteFastqBatch(iterator, file, options...)
} else { } else {
newIter, err = WriteFastaBatch(iterator, file, options...) newIter, err = WriteFastaBatch(iterator, file, options...)
} }
} else {
newIter, err = WriteFastaBatch(iterator, file, options...)
}
return newIter, err return newIter, err
} }

View File

@ -0,0 +1 @@
package obimultiplex

View File

@ -10,12 +10,18 @@ import (
// obiseq.BioSequenceBatch containing the selected amplicon sequences. // obiseq.BioSequenceBatch containing the selected amplicon sequences.
func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) { func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) {
forward := ForwardPrimer()
reverse := ReversePrimer()
opts := make([]obiapat.WithOption, 0, 10) opts := make([]obiapat.WithOption, 0, 10)
opts = append(opts, obiapat.OptionForwardError(AllowedMismatch()), opts = append(opts,
obiapat.OptionReverseError(AllowedMismatch())) obiapat.OptionForwardPrimer(
ForwardPrimer(),
AllowedMismatch(),
),
obiapat.OptionReversePrimer(
ReversePrimer(),
AllowedMismatch(),
),
)
if MinLength() > 0 { if MinLength() > 0 {
opts = append(opts, obiapat.OptionMinLength(MinLength())) opts = append(opts, obiapat.OptionMinLength(MinLength()))
@ -29,7 +35,7 @@ func PCR(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) {
opts = append(opts, obiapat.OptionCircular(Circular())) opts = append(opts, obiapat.OptionCircular(Circular()))
} }
worker := obiapat.PCRSliceWorker(forward, reverse, opts...) worker := obiapat.PCRSliceWorker(opts...)
return iterator.MakeISliceWorker(worker), nil return iterator.MakeISliceWorker(worker), nil
} }