mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Connect the command line options to the algorithm of obipairing
This commit is contained in:
@ -3,7 +3,7 @@ package main
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"runtime/pprof"
|
||||
"runtime/trace"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
@ -13,25 +13,31 @@ import (
|
||||
func main() {
|
||||
|
||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||
f, err := os.Create("cpu.pprof")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
pprof.StartCPUProfile(f)
|
||||
defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
// ftrace, err := os.Create("cpu.trace")
|
||||
// f, err := os.Create("cpu.pprof")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// trace.Start(ftrace)
|
||||
// defer trace.Stop()
|
||||
// pprof.StartCPUProfile(f)
|
||||
// defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
ftrace, err := os.Create("cpu.trace")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
trace.Start(ftrace)
|
||||
defer trace.Stop()
|
||||
|
||||
optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet)
|
||||
|
||||
optionParser(os.Args)
|
||||
pairs, _ := obipairing.IBatchPairedSequence()
|
||||
paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
|
||||
paired := obipairing.IAssemblePESequencesBatch(pairs,
|
||||
obipairing.GapPenality(),
|
||||
obipairing.Delta(),
|
||||
obipairing.MinOverlap(),
|
||||
true,
|
||||
obioptions.ParallelWorkers(),
|
||||
)
|
||||
obiconvert.WriteBioSequencesBatch(paired, true)
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int {
|
||||
}
|
||||
}
|
||||
|
||||
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
|
||||
scoreMatrix, pathMatrix *[]int) int {
|
||||
|
||||
la := len(seqA)
|
||||
@ -80,7 +80,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
// The actual gap score is the gap score times the mismatch between
|
||||
// two bases with a score of 40
|
||||
gap = gap * _NucScorePartMatchMismatch[40][40]
|
||||
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
|
||||
|
||||
needed := (la + 1) * (lb + 1)
|
||||
|
||||
@ -106,13 +106,13 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
for j := 0; j < lb; j++ {
|
||||
|
||||
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gap, 1)
|
||||
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
|
||||
|
||||
for i := 0; i < la1; i++ {
|
||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
||||
left += gap
|
||||
top += gap
|
||||
left += gapPenalty
|
||||
top += gapPenalty
|
||||
switch {
|
||||
case diag > left && diag > top:
|
||||
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
|
||||
@ -127,7 +127,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
|
||||
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j])
|
||||
top += gap
|
||||
top += gapPenalty
|
||||
|
||||
switch {
|
||||
case diag > left && diag > top:
|
||||
@ -143,7 +143,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
return _GetMatrix(scoreMatrix, la, la1, lb-1)
|
||||
}
|
||||
|
||||
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
|
||||
scoreMatrix, pathMatrix *[]int) int {
|
||||
|
||||
la := len(seqA)
|
||||
@ -151,7 +151,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
// The actual gap score is the gap score times the mismatch between
|
||||
// two bases with a score of 40
|
||||
gap = gap * _NucScorePartMatchMismatch[40][40]
|
||||
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
|
||||
|
||||
needed := (la + 1) * (lb + 1)
|
||||
|
||||
@ -170,7 +170,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
// Fills the first column with score 0
|
||||
for i := 0; i < la; i++ {
|
||||
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gap, -1)
|
||||
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
|
||||
}
|
||||
|
||||
lb1 := lb - 1
|
||||
@ -183,8 +183,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
||||
|
||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
||||
left += gap
|
||||
top += gap
|
||||
left += gapPenalty
|
||||
top += gapPenalty
|
||||
|
||||
switch {
|
||||
case diag > left && left > top:
|
||||
@ -205,7 +205,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
|
||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
|
||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
|
||||
left += gap
|
||||
left += gapPenalty
|
||||
|
||||
switch {
|
||||
case diag > left && diag > top:
|
||||
@ -220,7 +220,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
||||
return _GetMatrix(scoreMatrix, la, la-1, lb1)
|
||||
}
|
||||
|
||||
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
|
||||
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap float64,
|
||||
arena PEAlignArena) (int, []int) {
|
||||
|
||||
if !_InitializedDnaScore {
|
||||
log.Println("Initializing the DNA Scoring matrix")
|
||||
@ -243,7 +244,8 @@ func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (in
|
||||
return score, arena.pointer.path
|
||||
}
|
||||
|
||||
func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
|
||||
func PERightAlign(seqA, seqB obiseq.BioSequence, gap float64,
|
||||
arena PEAlignArena) (int, []int) {
|
||||
|
||||
if !_InitializedDnaScore {
|
||||
log.Println("Initializing the DNA Scoring matrix")
|
||||
@ -267,7 +269,7 @@ func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (i
|
||||
}
|
||||
|
||||
func PEAlign(seqA, seqB obiseq.BioSequence,
|
||||
gap, delta int,
|
||||
gap float64, delta int,
|
||||
arena PEAlignArena) (int, []int) {
|
||||
var score, shift int
|
||||
var startA, startB int
|
||||
|
@ -127,6 +127,10 @@ func (s BioSequence) Features() string {
|
||||
return string(s.sequence.feature)
|
||||
}
|
||||
|
||||
func (s BioSequence) HasAnnotation() bool {
|
||||
return len(s.sequence.annotations) > 0
|
||||
}
|
||||
|
||||
func (s BioSequence) Annotations() Annotation {
|
||||
if s.sequence.annotations == nil {
|
||||
s.sequence.annotations = GetAnnotation()
|
||||
|
@ -1,4 +1,3 @@
|
||||
package obiseq
|
||||
|
||||
type BioSequenceSlice []BioSequence
|
||||
|
||||
|
@ -10,7 +10,7 @@ var _ForwardFiles = make([]string, 0, 10)
|
||||
var _ReverseFiles = make([]string, 0, 10)
|
||||
var _Delta = 5
|
||||
var _MinOverlap = 20
|
||||
var _GapPenality = 2
|
||||
var _GapPenality = float64(2.0)
|
||||
var _WithoutStats = false
|
||||
|
||||
func PairingOptionSet(options *getoptions.GetOpt) {
|
||||
@ -28,7 +28,7 @@ func PairingOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_MinOverlap, "min-overlap", 20,
|
||||
options.Alias("O"),
|
||||
options.Description("Minimum ovelap between both the reads to consider the aligment (default 20)."))
|
||||
options.IntVar(&_GapPenality, "gap-penality", 2,
|
||||
options.Float64Var(&_GapPenality, "gap-penality", 2,
|
||||
options.Alias("G"),
|
||||
options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."))
|
||||
options.BoolVar(&_WithoutStats, "without-stat", false,
|
||||
@ -65,7 +65,7 @@ func MinOverlap() int {
|
||||
return _MinOverlap
|
||||
}
|
||||
|
||||
func GapPenality() int {
|
||||
func GapPenality() float64 {
|
||||
return _GapPenality
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,7 @@ func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioS
|
||||
// If the inplace parameter is set to true, the seqA and seqB are
|
||||
// destroyed during the assembling process and cannot be reuse later on.
|
||||
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
gap, delta, overlapMin int, withStats bool,
|
||||
gap float64, delta, overlapMin int, withStats bool,
|
||||
inplace bool,
|
||||
arenaAlign obialign.PEAlignArena) obiseq.BioSequence {
|
||||
|
||||
@ -120,8 +120,42 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||
return cons
|
||||
}
|
||||
|
||||
// IAssemblePESequencesBatch aligns paired reads.
|
||||
//
|
||||
// The function consumes an iterator over batches of paired sequences and
|
||||
// aligns each pair of sequences if they overlap. If they do not, both
|
||||
// sequences are pasted together and a strech of ten dots is added at the
|
||||
// juction of both the sequences.
|
||||
//
|
||||
// Parameters
|
||||
//
|
||||
// - iterator is an iterator of paired sequences as produced by the method
|
||||
// IBioSequenceBatch.PairWith
|
||||
//
|
||||
// - gap the gap penality is expressed as a multiplicator factor of the cost
|
||||
// of a mismatch between two bases having a quality score of 40.
|
||||
//
|
||||
// - delta the extension in number of base pairs added on both sides of the
|
||||
// overlap detected by the FAST algorithm before the optimal alignment.
|
||||
//
|
||||
// - minOverlap the minimal length of the overlap to accept the alignment of
|
||||
// the paired reads as correct. If the actual length is below this limit. The
|
||||
// the alignment is discarded and both sequences are pasted.
|
||||
//
|
||||
// - withStats indicates (true value) if the algorithm adds annotation to each
|
||||
// sequence on the quality of the aligned overlap.
|
||||
//
|
||||
// Two extra interger parameters can be added during the call of the function.
|
||||
// The first one indicates how many parallel workers run for aligning the sequences.
|
||||
// The second allows too specify the size of the channel buffer.
|
||||
//
|
||||
// Returns
|
||||
//
|
||||
// The function returns an iterator over batches of obiseq.Biosequence object.
|
||||
// each pair of processed sequences produces one sequence in the result iterator.
|
||||
//
|
||||
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
gap, delta, overlapMin int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
|
||||
gap float64, delta, minOverlap int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
|
||||
|
||||
nworkers := runtime.NumCPU() * 3 / 2
|
||||
buffsize := iterator.BufferSize()
|
||||
@ -158,17 +192,13 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) {
|
||||
arena := obialign.MakePEAlignArena(150, 150)
|
||||
|
||||
// log.Printf("\n==> %d Wait data to align\n", wid)
|
||||
// start := time.Now()
|
||||
for iterator.Next() {
|
||||
// elapsed := time.Since(start)
|
||||
// log.Printf("\n==>%d got data to align after %s\n", wid, elapsed)
|
||||
batch := iterator.Get()
|
||||
cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
|
||||
processed := 0
|
||||
for i, A := range batch.Forward() {
|
||||
B := batch.Reverse()[i]
|
||||
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena)
|
||||
cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, withStats, true, arena)
|
||||
if i%59 == 0 {
|
||||
bar.Add(59)
|
||||
processed += 59
|
||||
@ -179,8 +209,6 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||
batch.Order(),
|
||||
cons...,
|
||||
)
|
||||
// log.Printf("\n==> %d Wait data to align\n", wid)
|
||||
// start = time.Now()
|
||||
}
|
||||
newIter.Done()
|
||||
}
|
||||
|
@ -15,8 +15,15 @@ var _AllowedMismatch = 0
|
||||
var _MinimumLength = 0
|
||||
var _MaximumLength = -1
|
||||
|
||||
// PCROptionSet adds to a command line option set every options
|
||||
// needed by the PCR algorithm.
|
||||
// PCROptionSet defines every options related to a simulated PCR.
|
||||
//
|
||||
// The function adds to a CLI every options proposed to the user
|
||||
// to tune the parametters of the PCR simulation algorithm.
|
||||
//
|
||||
// Parameters
|
||||
//
|
||||
// - option : is a pointer to a getoptions.GetOpt instance normaly
|
||||
// produced by the
|
||||
func PCROptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_Circular, "circular", false,
|
||||
options.Alias("c"),
|
||||
|
Reference in New Issue
Block a user