Connect the command line options to the algorithm of obipairing

This commit is contained in:
2022-01-16 17:30:30 +01:00
parent 576a9f4d2d
commit 64676db3f4
7 changed files with 89 additions and 43 deletions

View File

@ -3,7 +3,7 @@ package main
import (
"log"
"os"
"runtime/pprof"
"runtime/trace"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
@ -13,25 +13,31 @@ import (
func main() {
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
f, err := os.Create("cpu.pprof")
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
defer pprof.StopCPUProfile()
// go tool trace cpu.trace
// ftrace, err := os.Create("cpu.trace")
// f, err := os.Create("cpu.pprof")
// if err != nil {
// log.Fatal(err)
// }
// trace.Start(ftrace)
// defer trace.Stop()
// pprof.StartCPUProfile(f)
// defer pprof.StopCPUProfile()
// go tool trace cpu.trace
ftrace, err := os.Create("cpu.trace")
if err != nil {
log.Fatal(err)
}
trace.Start(ftrace)
defer trace.Stop()
optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet)
optionParser(os.Args)
pairs, _ := obipairing.IBatchPairedSequence()
paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
paired := obipairing.IAssemblePESequencesBatch(pairs,
obipairing.GapPenality(),
obipairing.Delta(),
obipairing.MinOverlap(),
true,
obioptions.ParallelWorkers(),
)
obiconvert.WriteBioSequencesBatch(paired, true)
}

View File

@ -72,7 +72,7 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int {
}
}
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
scoreMatrix, pathMatrix *[]int) int {
la := len(seqA)
@ -80,7 +80,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gap = gap * _NucScorePartMatchMismatch[40][40]
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
needed := (la + 1) * (lb + 1)
@ -106,13 +106,13 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
for j := 0; j < lb; j++ {
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gap, 1)
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
for i := 0; i < la1; i++ {
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
left += gap
top += gap
left += gapPenalty
top += gapPenalty
switch {
case diag > left && diag > top:
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
@ -127,7 +127,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j])
top += gap
top += gapPenalty
switch {
case diag > left && diag > top:
@ -143,7 +143,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
return _GetMatrix(scoreMatrix, la, la1, lb-1)
}
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
scoreMatrix, pathMatrix *[]int) int {
la := len(seqA)
@ -151,7 +151,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
// The actual gap score is the gap score times the mismatch between
// two bases with a score of 40
gap = gap * _NucScorePartMatchMismatch[40][40]
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
needed := (la + 1) * (lb + 1)
@ -170,7 +170,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
// Fills the first column with score 0
for i := 0; i < la; i++ {
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gap, -1)
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
}
lb1 := lb - 1
@ -183,8 +183,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
left += gap
top += gap
left += gapPenalty
top += gapPenalty
switch {
case diag > left && left > top:
@ -205,7 +205,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
left += gap
left += gapPenalty
switch {
case diag > left && diag > top:
@ -220,7 +220,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
return _GetMatrix(scoreMatrix, la, la-1, lb1)
}
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap float64,
arena PEAlignArena) (int, []int) {
if !_InitializedDnaScore {
log.Println("Initializing the DNA Scoring matrix")
@ -243,7 +244,8 @@ func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (in
return score, arena.pointer.path
}
func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
func PERightAlign(seqA, seqB obiseq.BioSequence, gap float64,
arena PEAlignArena) (int, []int) {
if !_InitializedDnaScore {
log.Println("Initializing the DNA Scoring matrix")
@ -267,7 +269,7 @@ func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (i
}
func PEAlign(seqA, seqB obiseq.BioSequence,
gap, delta int,
gap float64, delta int,
arena PEAlignArena) (int, []int) {
var score, shift int
var startA, startB int

View File

@ -127,6 +127,10 @@ func (s BioSequence) Features() string {
return string(s.sequence.feature)
}
func (s BioSequence) HasAnnotation() bool {
return len(s.sequence.annotations) > 0
}
func (s BioSequence) Annotations() Annotation {
if s.sequence.annotations == nil {
s.sequence.annotations = GetAnnotation()

View File

@ -1,4 +1,3 @@
package obiseq
type BioSequenceSlice []BioSequence

View File

@ -10,7 +10,7 @@ var _ForwardFiles = make([]string, 0, 10)
var _ReverseFiles = make([]string, 0, 10)
var _Delta = 5
var _MinOverlap = 20
var _GapPenality = 2
var _GapPenality = float64(2.0)
var _WithoutStats = false
func PairingOptionSet(options *getoptions.GetOpt) {
@ -28,7 +28,7 @@ func PairingOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_MinOverlap, "min-overlap", 20,
options.Alias("O"),
options.Description("Minimum ovelap between both the reads to consider the aligment (default 20)."))
options.IntVar(&_GapPenality, "gap-penality", 2,
options.Float64Var(&_GapPenality, "gap-penality", 2,
options.Alias("G"),
options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."))
options.BoolVar(&_WithoutStats, "without-stat", false,
@ -65,7 +65,7 @@ func MinOverlap() int {
return _MinOverlap
}
func GapPenality() int {
func GapPenality() float64 {
return _GapPenality
}

View File

@ -57,7 +57,7 @@ func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioS
// If the inplace parameter is set to true, the seqA and seqB are
// destroyed during the assembling process and cannot be reuse later on.
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
gap, delta, overlapMin int, withStats bool,
gap float64, delta, overlapMin int, withStats bool,
inplace bool,
arenaAlign obialign.PEAlignArena) obiseq.BioSequence {
@ -120,8 +120,42 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
return cons
}
// IAssemblePESequencesBatch aligns paired reads.
//
// The function consumes an iterator over batches of paired sequences and
// aligns each pair of sequences if they overlap. If they do not, both
// sequences are pasted together and a strech of ten dots is added at the
// juction of both the sequences.
//
// Parameters
//
// - iterator is an iterator of paired sequences as produced by the method
// IBioSequenceBatch.PairWith
//
// - gap the gap penality is expressed as a multiplicator factor of the cost
// of a mismatch between two bases having a quality score of 40.
//
// - delta the extension in number of base pairs added on both sides of the
// overlap detected by the FAST algorithm before the optimal alignment.
//
// - minOverlap the minimal length of the overlap to accept the alignment of
// the paired reads as correct. If the actual length is below this limit. The
// the alignment is discarded and both sequences are pasted.
//
// - withStats indicates (true value) if the algorithm adds annotation to each
// sequence on the quality of the aligned overlap.
//
// Two extra interger parameters can be added during the call of the function.
// The first one indicates how many parallel workers run for aligning the sequences.
// The second allows too specify the size of the channel buffer.
//
// Returns
//
// The function returns an iterator over batches of obiseq.Biosequence object.
// each pair of processed sequences produces one sequence in the result iterator.
//
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
gap, delta, overlapMin int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
gap float64, delta, minOverlap int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
nworkers := runtime.NumCPU() * 3 / 2
buffsize := iterator.BufferSize()
@ -158,17 +192,13 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) {
arena := obialign.MakePEAlignArena(150, 150)
// log.Printf("\n==> %d Wait data to align\n", wid)
// start := time.Now()
for iterator.Next() {
// elapsed := time.Since(start)
// log.Printf("\n==>%d got data to align after %s\n", wid, elapsed)
batch := iterator.Get()
cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
processed := 0
for i, A := range batch.Forward() {
B := batch.Reverse()[i]
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena)
cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, withStats, true, arena)
if i%59 == 0 {
bar.Add(59)
processed += 59
@ -179,8 +209,6 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
batch.Order(),
cons...,
)
// log.Printf("\n==> %d Wait data to align\n", wid)
// start = time.Now()
}
newIter.Done()
}

View File

@ -15,8 +15,15 @@ var _AllowedMismatch = 0
var _MinimumLength = 0
var _MaximumLength = -1
// PCROptionSet adds to a command line option set every options
// needed by the PCR algorithm.
// PCROptionSet defines every options related to a simulated PCR.
//
// The function adds to a CLI every options proposed to the user
// to tune the parametters of the PCR simulation algorithm.
//
// Parameters
//
// - option : is a pointer to a getoptions.GetOpt instance normaly
// produced by the
func PCROptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_Circular, "circular", false,
options.Alias("c"),