From 64676db3f403f7fa57404173321f121355b1a1c8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 16 Jan 2022 17:30:30 +0100 Subject: [PATCH] Connect the command line options to the algorithm of obipairing --- cmd/obitools/obipairing/main.go | 32 ++++++++++++--------- pkg/obialign/pairedendalign.go | 32 +++++++++++---------- pkg/obiseq/biosequence.go | 4 +++ pkg/obiseq/biosequenceslice.go | 1 - pkg/obitools/obipairing/options.go | 6 ++-- pkg/obitools/obipairing/pairing.go | 46 ++++++++++++++++++++++++------ pkg/obitools/obipcr/options.go | 11 +++++-- 7 files changed, 89 insertions(+), 43 deletions(-) diff --git a/cmd/obitools/obipairing/main.go b/cmd/obitools/obipairing/main.go index e11bcd9..351670b 100644 --- a/cmd/obitools/obipairing/main.go +++ b/cmd/obitools/obipairing/main.go @@ -3,7 +3,7 @@ package main import ( "log" "os" - "runtime/pprof" + "runtime/trace" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" @@ -13,25 +13,31 @@ import ( func main() { // go tool pprof -http=":8000" ./obipairing ./cpu.pprof - f, err := os.Create("cpu.pprof") - if err != nil { - log.Fatal(err) - } - pprof.StartCPUProfile(f) - defer pprof.StopCPUProfile() - - // go tool trace cpu.trace - // ftrace, err := os.Create("cpu.trace") + // f, err := os.Create("cpu.pprof") // if err != nil { // log.Fatal(err) // } - // trace.Start(ftrace) - // defer trace.Stop() + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + ftrace, err := os.Create("cpu.trace") + if err != nil { + log.Fatal(err) + } + trace.Start(ftrace) + defer trace.Stop() optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet) optionParser(os.Args) pairs, _ := obipairing.IBatchPairedSequence() - paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true) + paired := obipairing.IAssemblePESequencesBatch(pairs, + obipairing.GapPenality(), + obipairing.Delta(), + obipairing.MinOverlap(), + true, + obioptions.ParallelWorkers(), + ) obiconvert.WriteBioSequencesBatch(paired, true) } diff --git a/pkg/obialign/pairedendalign.go b/pkg/obialign/pairedendalign.go index 33696e0..1a60c2b 100644 --- a/pkg/obialign/pairedendalign.go +++ b/pkg/obialign/pairedendalign.go @@ -72,7 +72,7 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int { } } -func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int, +func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64, scoreMatrix, pathMatrix *[]int) int { la := len(seqA) @@ -80,7 +80,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int, // The actual gap score is the gap score times the mismatch between // two bases with a score of 40 - gap = gap * _NucScorePartMatchMismatch[40][40] + gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40])) needed := (la + 1) * (lb + 1) @@ -106,13 +106,13 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int, for j := 0; j < lb; j++ { - _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gap, 1) + _SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1) for i := 0; i < la1; i++ { left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j) diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j]) - left += gap - top += gap + left += gapPenalty + top += gapPenalty switch { case diag > left && diag > top: _SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0) @@ -127,7 +127,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int, left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j) diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j]) - top += gap + top += gapPenalty switch { case diag > left && diag > top: @@ -143,7 +143,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int, return _GetMatrix(scoreMatrix, la, la1, lb-1) } -func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, +func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64, scoreMatrix, pathMatrix *[]int) int { la := len(seqA) @@ -151,7 +151,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, // The actual gap score is the gap score times the mismatch between // two bases with a score of 40 - gap = gap * _NucScorePartMatchMismatch[40][40] + gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40])) needed := (la + 1) * (lb + 1) @@ -170,7 +170,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, // Fills the first column with score 0 for i := 0; i < la; i++ { - _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gap, -1) + _SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1) } lb1 := lb - 1 @@ -183,8 +183,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j) diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j]) - left += gap - top += gap + left += gapPenalty + top += gapPenalty switch { case diag > left && left > top: @@ -205,7 +205,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1) diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1]) - left += gap + left += gapPenalty switch { case diag > left && diag > top: @@ -220,7 +220,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int, return _GetMatrix(scoreMatrix, la, la-1, lb1) } -func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) { +func PELeftAlign(seqA, seqB obiseq.BioSequence, gap float64, + arena PEAlignArena) (int, []int) { if !_InitializedDnaScore { log.Println("Initializing the DNA Scoring matrix") @@ -243,7 +244,8 @@ func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (in return score, arena.pointer.path } -func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) { +func PERightAlign(seqA, seqB obiseq.BioSequence, gap float64, + arena PEAlignArena) (int, []int) { if !_InitializedDnaScore { log.Println("Initializing the DNA Scoring matrix") @@ -267,7 +269,7 @@ func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (i } func PEAlign(seqA, seqB obiseq.BioSequence, - gap, delta int, + gap float64, delta int, arena PEAlignArena) (int, []int) { var score, shift int var startA, startB int diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index f3bf1a0..dc0a8bd 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -127,6 +127,10 @@ func (s BioSequence) Features() string { return string(s.sequence.feature) } +func (s BioSequence) HasAnnotation() bool { + return len(s.sequence.annotations) > 0 +} + func (s BioSequence) Annotations() Annotation { if s.sequence.annotations == nil { s.sequence.annotations = GetAnnotation() diff --git a/pkg/obiseq/biosequenceslice.go b/pkg/obiseq/biosequenceslice.go index 9319095..034d9c5 100644 --- a/pkg/obiseq/biosequenceslice.go +++ b/pkg/obiseq/biosequenceslice.go @@ -1,4 +1,3 @@ package obiseq type BioSequenceSlice []BioSequence - diff --git a/pkg/obitools/obipairing/options.go b/pkg/obitools/obipairing/options.go index 74229e5..ed63f80 100644 --- a/pkg/obitools/obipairing/options.go +++ b/pkg/obitools/obipairing/options.go @@ -10,7 +10,7 @@ var _ForwardFiles = make([]string, 0, 10) var _ReverseFiles = make([]string, 0, 10) var _Delta = 5 var _MinOverlap = 20 -var _GapPenality = 2 +var _GapPenality = float64(2.0) var _WithoutStats = false func PairingOptionSet(options *getoptions.GetOpt) { @@ -28,7 +28,7 @@ func PairingOptionSet(options *getoptions.GetOpt) { options.IntVar(&_MinOverlap, "min-overlap", 20, options.Alias("O"), options.Description("Minimum ovelap between both the reads to consider the aligment (default 20).")) - options.IntVar(&_GapPenality, "gap-penality", 2, + options.Float64Var(&_GapPenality, "gap-penality", 2, options.Alias("G"), options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2).")) options.BoolVar(&_WithoutStats, "without-stat", false, @@ -65,7 +65,7 @@ func MinOverlap() int { return _MinOverlap } -func GapPenality() int { +func GapPenality() float64 { return _GapPenality } diff --git a/pkg/obitools/obipairing/pairing.go b/pkg/obitools/obipairing/pairing.go index 8c3219c..6a043b4 100644 --- a/pkg/obitools/obipairing/pairing.go +++ b/pkg/obitools/obipairing/pairing.go @@ -57,7 +57,7 @@ func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioS // If the inplace parameter is set to true, the seqA and seqB are // destroyed during the assembling process and cannot be reuse later on. func AssemblePESequences(seqA, seqB obiseq.BioSequence, - gap, delta, overlapMin int, withStats bool, + gap float64, delta, overlapMin int, withStats bool, inplace bool, arenaAlign obialign.PEAlignArena) obiseq.BioSequence { @@ -120,8 +120,42 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence, return cons } +// IAssemblePESequencesBatch aligns paired reads. +// +// The function consumes an iterator over batches of paired sequences and +// aligns each pair of sequences if they overlap. If they do not, both +// sequences are pasted together and a strech of ten dots is added at the +// juction of both the sequences. +// +// Parameters +// +// - iterator is an iterator of paired sequences as produced by the method +// IBioSequenceBatch.PairWith +// +// - gap the gap penality is expressed as a multiplicator factor of the cost +// of a mismatch between two bases having a quality score of 40. +// +// - delta the extension in number of base pairs added on both sides of the +// overlap detected by the FAST algorithm before the optimal alignment. +// +// - minOverlap the minimal length of the overlap to accept the alignment of +// the paired reads as correct. If the actual length is below this limit. The +// the alignment is discarded and both sequences are pasted. +// +// - withStats indicates (true value) if the algorithm adds annotation to each +// sequence on the quality of the aligned overlap. +// +// Two extra interger parameters can be added during the call of the function. +// The first one indicates how many parallel workers run for aligning the sequences. +// The second allows too specify the size of the channel buffer. +// +// Returns +// +// The function returns an iterator over batches of obiseq.Biosequence object. +// each pair of processed sequences produces one sequence in the result iterator. +// func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, - gap, delta, overlapMin int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch { + gap float64, delta, minOverlap int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch { nworkers := runtime.NumCPU() * 3 / 2 buffsize := iterator.BufferSize() @@ -158,17 +192,13 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) { arena := obialign.MakePEAlignArena(150, 150) - // log.Printf("\n==> %d Wait data to align\n", wid) - // start := time.Now() for iterator.Next() { - // elapsed := time.Since(start) - // log.Printf("\n==>%d got data to align after %s\n", wid, elapsed) batch := iterator.Get() cons := make(obiseq.BioSequenceSlice, len(batch.Forward())) processed := 0 for i, A := range batch.Forward() { B := batch.Reverse()[i] - cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena) + cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, withStats, true, arena) if i%59 == 0 { bar.Add(59) processed += 59 @@ -179,8 +209,6 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch, batch.Order(), cons..., ) - // log.Printf("\n==> %d Wait data to align\n", wid) - // start = time.Now() } newIter.Done() } diff --git a/pkg/obitools/obipcr/options.go b/pkg/obitools/obipcr/options.go index d81f3ff..79b04cb 100644 --- a/pkg/obitools/obipcr/options.go +++ b/pkg/obitools/obipcr/options.go @@ -15,8 +15,15 @@ var _AllowedMismatch = 0 var _MinimumLength = 0 var _MaximumLength = -1 -// PCROptionSet adds to a command line option set every options -// needed by the PCR algorithm. +// PCROptionSet defines every options related to a simulated PCR. +// +// The function adds to a CLI every options proposed to the user +// to tune the parametters of the PCR simulation algorithm. +// +// Parameters +// +// - option : is a pointer to a getoptions.GetOpt instance normaly +// produced by the func PCROptionSet(options *getoptions.GetOpt) { options.BoolVar(&_Circular, "circular", false, options.Alias("c"),