mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Connect the command line options to the algorithm of obipairing
This commit is contained in:
@ -3,7 +3,7 @@ package main
|
|||||||
import (
|
import (
|
||||||
"log"
|
"log"
|
||||||
"os"
|
"os"
|
||||||
"runtime/pprof"
|
"runtime/trace"
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
@ -13,25 +13,31 @@ import (
|
|||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||||
f, err := os.Create("cpu.pprof")
|
// f, err := os.Create("cpu.pprof")
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
pprof.StartCPUProfile(f)
|
|
||||||
defer pprof.StopCPUProfile()
|
|
||||||
|
|
||||||
// go tool trace cpu.trace
|
|
||||||
// ftrace, err := os.Create("cpu.trace")
|
|
||||||
// if err != nil {
|
// if err != nil {
|
||||||
// log.Fatal(err)
|
// log.Fatal(err)
|
||||||
// }
|
// }
|
||||||
// trace.Start(ftrace)
|
// pprof.StartCPUProfile(f)
|
||||||
// defer trace.Stop()
|
// defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
|
// go tool trace cpu.trace
|
||||||
|
ftrace, err := os.Create("cpu.trace")
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
trace.Start(ftrace)
|
||||||
|
defer trace.Stop()
|
||||||
|
|
||||||
optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet)
|
optionParser := obioptions.GenerateOptionParser(obipairing.OptionSet)
|
||||||
|
|
||||||
optionParser(os.Args)
|
optionParser(os.Args)
|
||||||
pairs, _ := obipairing.IBatchPairedSequence()
|
pairs, _ := obipairing.IBatchPairedSequence()
|
||||||
paired := obipairing.IAssemblePESequencesBatch(pairs, 2, 50, 20, true)
|
paired := obipairing.IAssemblePESequencesBatch(pairs,
|
||||||
|
obipairing.GapPenality(),
|
||||||
|
obipairing.Delta(),
|
||||||
|
obipairing.MinOverlap(),
|
||||||
|
true,
|
||||||
|
obioptions.ParallelWorkers(),
|
||||||
|
)
|
||||||
obiconvert.WriteBioSequencesBatch(paired, true)
|
obiconvert.WriteBioSequencesBatch(paired, true)
|
||||||
}
|
}
|
||||||
|
@ -72,7 +72,7 @@ func _PairingScorePeAlign(baseA, qualA, baseB, qualB byte) int {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap float64,
|
||||||
scoreMatrix, pathMatrix *[]int) int {
|
scoreMatrix, pathMatrix *[]int) int {
|
||||||
|
|
||||||
la := len(seqA)
|
la := len(seqA)
|
||||||
@ -80,7 +80,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
// The actual gap score is the gap score times the mismatch between
|
// The actual gap score is the gap score times the mismatch between
|
||||||
// two bases with a score of 40
|
// two bases with a score of 40
|
||||||
gap = gap * _NucScorePartMatchMismatch[40][40]
|
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
|
||||||
|
|
||||||
needed := (la + 1) * (lb + 1)
|
needed := (la + 1) * (lb + 1)
|
||||||
|
|
||||||
@ -106,13 +106,13 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
for j := 0; j < lb; j++ {
|
for j := 0; j < lb; j++ {
|
||||||
|
|
||||||
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gap, 1)
|
_SetMatrices(scoreMatrix, pathMatrix, la, -1, j, (j+1)*gapPenalty, 1)
|
||||||
|
|
||||||
for i := 0; i < la1; i++ {
|
for i := 0; i < la1; i++ {
|
||||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
||||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
||||||
left += gap
|
left += gapPenalty
|
||||||
top += gap
|
top += gapPenalty
|
||||||
switch {
|
switch {
|
||||||
case diag > left && diag > top:
|
case diag > left && diag > top:
|
||||||
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
|
_SetMatrices(scoreMatrix, pathMatrix, la, i, j, diag, 0)
|
||||||
@ -127,7 +127,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
|
left, diag, top := _GetMatrixFrom(scoreMatrix, la, la1, j)
|
||||||
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j])
|
diag += _PairingScorePeAlign(seqA[la1], qualA[la1], seqB[j], qualB[j])
|
||||||
top += gap
|
top += gapPenalty
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case diag > left && diag > top:
|
case diag > left && diag > top:
|
||||||
@ -143,7 +143,7 @@ func _FillMatrixPeLeftAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
return _GetMatrix(scoreMatrix, la, la1, lb-1)
|
return _GetMatrix(scoreMatrix, la, la1, lb-1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap float64,
|
||||||
scoreMatrix, pathMatrix *[]int) int {
|
scoreMatrix, pathMatrix *[]int) int {
|
||||||
|
|
||||||
la := len(seqA)
|
la := len(seqA)
|
||||||
@ -151,7 +151,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
// The actual gap score is the gap score times the mismatch between
|
// The actual gap score is the gap score times the mismatch between
|
||||||
// two bases with a score of 40
|
// two bases with a score of 40
|
||||||
gap = gap * _NucScorePartMatchMismatch[40][40]
|
gapPenalty := int(gap * float64(_NucScorePartMatchMismatch[40][40]))
|
||||||
|
|
||||||
needed := (la + 1) * (lb + 1)
|
needed := (la + 1) * (lb + 1)
|
||||||
|
|
||||||
@ -170,7 +170,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
// Fills the first column with score 0
|
// Fills the first column with score 0
|
||||||
for i := 0; i < la; i++ {
|
for i := 0; i < la; i++ {
|
||||||
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gap, -1)
|
_SetMatrices(scoreMatrix, pathMatrix, la, i, -1, (i+1)*gapPenalty, -1)
|
||||||
}
|
}
|
||||||
|
|
||||||
lb1 := lb - 1
|
lb1 := lb - 1
|
||||||
@ -183,8 +183,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, j)
|
||||||
|
|
||||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[j], qualB[j])
|
||||||
left += gap
|
left += gapPenalty
|
||||||
top += gap
|
top += gapPenalty
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case diag > left && left > top:
|
case diag > left && left > top:
|
||||||
@ -205,7 +205,7 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
|
|
||||||
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
|
left, diag, top := _GetMatrixFrom(scoreMatrix, la, i, lb1)
|
||||||
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
|
diag += _PairingScorePeAlign(seqA[i], qualA[i], seqB[lb1], qualB[lb1])
|
||||||
left += gap
|
left += gapPenalty
|
||||||
|
|
||||||
switch {
|
switch {
|
||||||
case diag > left && diag > top:
|
case diag > left && diag > top:
|
||||||
@ -220,7 +220,8 @@ func _FillMatrixPeRightAlign(seqA, qualA, seqB, qualB []byte, gap int,
|
|||||||
return _GetMatrix(scoreMatrix, la, la-1, lb1)
|
return _GetMatrix(scoreMatrix, la, la-1, lb1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
|
func PELeftAlign(seqA, seqB obiseq.BioSequence, gap float64,
|
||||||
|
arena PEAlignArena) (int, []int) {
|
||||||
|
|
||||||
if !_InitializedDnaScore {
|
if !_InitializedDnaScore {
|
||||||
log.Println("Initializing the DNA Scoring matrix")
|
log.Println("Initializing the DNA Scoring matrix")
|
||||||
@ -243,7 +244,8 @@ func PELeftAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (in
|
|||||||
return score, arena.pointer.path
|
return score, arena.pointer.path
|
||||||
}
|
}
|
||||||
|
|
||||||
func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (int, []int) {
|
func PERightAlign(seqA, seqB obiseq.BioSequence, gap float64,
|
||||||
|
arena PEAlignArena) (int, []int) {
|
||||||
|
|
||||||
if !_InitializedDnaScore {
|
if !_InitializedDnaScore {
|
||||||
log.Println("Initializing the DNA Scoring matrix")
|
log.Println("Initializing the DNA Scoring matrix")
|
||||||
@ -267,7 +269,7 @@ func PERightAlign(seqA, seqB obiseq.BioSequence, gap int, arena PEAlignArena) (i
|
|||||||
}
|
}
|
||||||
|
|
||||||
func PEAlign(seqA, seqB obiseq.BioSequence,
|
func PEAlign(seqA, seqB obiseq.BioSequence,
|
||||||
gap, delta int,
|
gap float64, delta int,
|
||||||
arena PEAlignArena) (int, []int) {
|
arena PEAlignArena) (int, []int) {
|
||||||
var score, shift int
|
var score, shift int
|
||||||
var startA, startB int
|
var startA, startB int
|
||||||
|
@ -127,6 +127,10 @@ func (s BioSequence) Features() string {
|
|||||||
return string(s.sequence.feature)
|
return string(s.sequence.feature)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s BioSequence) HasAnnotation() bool {
|
||||||
|
return len(s.sequence.annotations) > 0
|
||||||
|
}
|
||||||
|
|
||||||
func (s BioSequence) Annotations() Annotation {
|
func (s BioSequence) Annotations() Annotation {
|
||||||
if s.sequence.annotations == nil {
|
if s.sequence.annotations == nil {
|
||||||
s.sequence.annotations = GetAnnotation()
|
s.sequence.annotations = GetAnnotation()
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
package obiseq
|
package obiseq
|
||||||
|
|
||||||
type BioSequenceSlice []BioSequence
|
type BioSequenceSlice []BioSequence
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ var _ForwardFiles = make([]string, 0, 10)
|
|||||||
var _ReverseFiles = make([]string, 0, 10)
|
var _ReverseFiles = make([]string, 0, 10)
|
||||||
var _Delta = 5
|
var _Delta = 5
|
||||||
var _MinOverlap = 20
|
var _MinOverlap = 20
|
||||||
var _GapPenality = 2
|
var _GapPenality = float64(2.0)
|
||||||
var _WithoutStats = false
|
var _WithoutStats = false
|
||||||
|
|
||||||
func PairingOptionSet(options *getoptions.GetOpt) {
|
func PairingOptionSet(options *getoptions.GetOpt) {
|
||||||
@ -28,7 +28,7 @@ func PairingOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.IntVar(&_MinOverlap, "min-overlap", 20,
|
options.IntVar(&_MinOverlap, "min-overlap", 20,
|
||||||
options.Alias("O"),
|
options.Alias("O"),
|
||||||
options.Description("Minimum ovelap between both the reads to consider the aligment (default 20)."))
|
options.Description("Minimum ovelap between both the reads to consider the aligment (default 20)."))
|
||||||
options.IntVar(&_GapPenality, "gap-penality", 2,
|
options.Float64Var(&_GapPenality, "gap-penality", 2,
|
||||||
options.Alias("G"),
|
options.Alias("G"),
|
||||||
options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."))
|
options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."))
|
||||||
options.BoolVar(&_WithoutStats, "without-stat", false,
|
options.BoolVar(&_WithoutStats, "without-stat", false,
|
||||||
@ -65,7 +65,7 @@ func MinOverlap() int {
|
|||||||
return _MinOverlap
|
return _MinOverlap
|
||||||
}
|
}
|
||||||
|
|
||||||
func GapPenality() int {
|
func GapPenality() float64 {
|
||||||
return _GapPenality
|
return _GapPenality
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -57,7 +57,7 @@ func JoinPairedSequence(seqA, seqB obiseq.BioSequence, inplace bool) obiseq.BioS
|
|||||||
// If the inplace parameter is set to true, the seqA and seqB are
|
// If the inplace parameter is set to true, the seqA and seqB are
|
||||||
// destroyed during the assembling process and cannot be reused later on.
|
// destroyed during the assembling process and cannot be reused later on.
|
||||||
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
||||||
gap, delta, overlapMin int, withStats bool,
|
gap float64, delta, overlapMin int, withStats bool,
|
||||||
inplace bool,
|
inplace bool,
|
||||||
arenaAlign obialign.PEAlignArena) obiseq.BioSequence {
|
arenaAlign obialign.PEAlignArena) obiseq.BioSequence {
|
||||||
|
|
||||||
@ -120,8 +120,42 @@ func AssemblePESequences(seqA, seqB obiseq.BioSequence,
|
|||||||
return cons
|
return cons
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IAssemblePESequencesBatch aligns paired reads.
|
||||||
|
//
|
||||||
|
// The function consumes an iterator over batches of paired sequences and
|
||||||
|
// aligns each pair of sequences if they overlap. If they do not, both
|
||||||
|
// sequences are pasted together and a stretch of ten dots is added at the
|
||||||
|
// junction of both the sequences.
|
||||||
|
//
|
||||||
|
// Parameters
|
||||||
|
//
|
||||||
|
// - iterator is an iterator of paired sequences as produced by the method
|
||||||
|
// IBioSequenceBatch.PairWith
|
||||||
|
//
|
||||||
|
// - gap the gap penalty is expressed as a multiplicative factor of the cost
|
||||||
|
// of a mismatch between two bases having a quality score of 40.
|
||||||
|
//
|
||||||
|
// - delta the extension in number of base pairs added on both sides of the
|
||||||
|
// overlap detected by the FAST algorithm before the optimal alignment.
|
||||||
|
//
|
||||||
|
// - minOverlap the minimal length of the overlap to accept the alignment of
|
||||||
|
// the paired reads as correct. If the actual length is below this limit,
|
||||||
|
// the alignment is discarded and both sequences are pasted.
|
||||||
|
//
|
||||||
|
// - withStats indicates (true value) if the algorithm adds annotation to each
|
||||||
|
// sequence on the quality of the aligned overlap.
|
||||||
|
//
|
||||||
|
// Two extra integer parameters can be added during the call of the function.
|
||||||
|
// The first one indicates how many parallel workers run for aligning the sequences.
|
||||||
|
// The second allows to specify the size of the channel buffer.
|
||||||
|
//
|
||||||
|
// Returns
|
||||||
|
//
|
||||||
|
// The function returns an iterator over batches of obiseq.BioSequence objects.
|
||||||
|
// Each pair of processed sequences produces one sequence in the result iterator.
|
||||||
|
//
|
||||||
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
||||||
gap, delta, overlapMin int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
|
gap float64, delta, minOverlap int, withStats bool, sizes ...int) obiseq.IBioSequenceBatch {
|
||||||
|
|
||||||
nworkers := runtime.NumCPU() * 3 / 2
|
nworkers := runtime.NumCPU() * 3 / 2
|
||||||
buffsize := iterator.BufferSize()
|
buffsize := iterator.BufferSize()
|
||||||
@ -158,17 +192,13 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
|||||||
f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) {
|
f := func(iterator obiseq.IPairedBioSequenceBatch, wid int) {
|
||||||
arena := obialign.MakePEAlignArena(150, 150)
|
arena := obialign.MakePEAlignArena(150, 150)
|
||||||
|
|
||||||
// log.Printf("\n==> %d Wait data to align\n", wid)
|
|
||||||
// start := time.Now()
|
|
||||||
for iterator.Next() {
|
for iterator.Next() {
|
||||||
// elapsed := time.Since(start)
|
|
||||||
// log.Printf("\n==>%d got data to align after %s\n", wid, elapsed)
|
|
||||||
batch := iterator.Get()
|
batch := iterator.Get()
|
||||||
cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
|
cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
|
||||||
processed := 0
|
processed := 0
|
||||||
for i, A := range batch.Forward() {
|
for i, A := range batch.Forward() {
|
||||||
B := batch.Reverse()[i]
|
B := batch.Reverse()[i]
|
||||||
cons[i] = AssemblePESequences(A, B, 2, 5, 20, true, true, arena)
|
cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, withStats, true, arena)
|
||||||
if i%59 == 0 {
|
if i%59 == 0 {
|
||||||
bar.Add(59)
|
bar.Add(59)
|
||||||
processed += 59
|
processed += 59
|
||||||
@ -179,8 +209,6 @@ func IAssemblePESequencesBatch(iterator obiseq.IPairedBioSequenceBatch,
|
|||||||
batch.Order(),
|
batch.Order(),
|
||||||
cons...,
|
cons...,
|
||||||
)
|
)
|
||||||
// log.Printf("\n==> %d Wait data to align\n", wid)
|
|
||||||
// start = time.Now()
|
|
||||||
}
|
}
|
||||||
newIter.Done()
|
newIter.Done()
|
||||||
}
|
}
|
||||||
|
@ -15,8 +15,15 @@ var _AllowedMismatch = 0
|
|||||||
var _MinimumLength = 0
|
var _MinimumLength = 0
|
||||||
var _MaximumLength = -1
|
var _MaximumLength = -1
|
||||||
|
|
||||||
// PCROptionSet adds to a command line option set every options
|
// PCROptionSet defines every option related to a simulated PCR.
|
||||||
// needed by the PCR algorithm.
|
//
|
||||||
|
// The function adds to a CLI every option proposed to the user
|
||||||
|
// to tune the parameters of the PCR simulation algorithm.
|
||||||
|
//
|
||||||
|
// Parameters
|
||||||
|
//
|
||||||
|
// - options : is a pointer to a getoptions.GetOpt instance normally
|
||||||
|
// produced by the
|
||||||
func PCROptionSet(options *getoptions.GetOpt) {
|
func PCROptionSet(options *getoptions.GetOpt) {
|
||||||
options.BoolVar(&_Circular, "circular", false,
|
options.BoolVar(&_Circular, "circular", false,
|
||||||
options.Alias("c"),
|
options.Alias("c"),
|
||||||
|
Reference in New Issue
Block a user