mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds option to tune the pairing of the sequences in obipairing and some stats to the results
Former-commit-id: a6cf9cb4d4ab20a433a2534fd7d11cd3ca8ebbaa
This commit is contained in:
@ -1,8 +1,8 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||
@ -34,7 +34,7 @@ func main() {
|
||||
pairs, err := obipairing.CLIPairedSequence()
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Cannot open file (%v)",err)
|
||||
log.Errorf("Cannot open file (%v)", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
@ -43,10 +43,12 @@ func main() {
|
||||
obipairing.CLIDelta(),
|
||||
obipairing.CLIMinOverlap(),
|
||||
obipairing.CLIMinIdentity(),
|
||||
obipairing.CLIFastMode(),
|
||||
obipairing.CLIFastRelativeScore(),
|
||||
obipairing.CLIWithStats(),
|
||||
obioptions.CLIParallelWorkers(),
|
||||
)
|
||||
|
||||
|
||||
obiconvert.CLIWriteBioSequences(paired, true)
|
||||
|
||||
obiiter.WaitForLastPipe()
|
||||
|
@ -47,6 +47,8 @@ func main() {
|
||||
obipairing.CLIDelta(),
|
||||
obipairing.CLIMinOverlap(),
|
||||
obipairing.CLIMinIdentity(),
|
||||
obipairing.CLIFastMode(),
|
||||
obipairing.CLIFastRelativeScore(),
|
||||
obipairing.CLIWithStats())
|
||||
|
||||
obiconvert.CLIWriteBioSequences(paired, true)
|
||||
|
@ -351,8 +351,8 @@ func PERightAlign(seqA, seqB *obiseq.BioSequence, gap float64,
|
||||
}
|
||||
|
||||
func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
gap float64, delta int,
|
||||
arena PEAlignArena) (int, []int, int, int) {
|
||||
gap float64, fastAlign bool, delta int, fastScoreRel bool,
|
||||
arena PEAlignArena) (int, []int, int, int, float64) {
|
||||
var score, shift int
|
||||
var startA, startB int
|
||||
var partLen, over int
|
||||
@ -365,107 +365,143 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
_InitDNAScoreMatrix()
|
||||
}
|
||||
|
||||
index := obikmer.Index4mer(seqA,
|
||||
&arena.pointer.fastIndex,
|
||||
&arena.pointer.fastBuffer)
|
||||
fastCount := -1
|
||||
fastScore := -1.0
|
||||
|
||||
shift, fastScore := obikmer.FastShiftFourMer(index, seqB, nil)
|
||||
if fastAlign {
|
||||
|
||||
if shift > 0 {
|
||||
over = seqA.Len() - shift
|
||||
} else {
|
||||
over = seqB.Len() + shift
|
||||
}
|
||||
index := obikmer.Index4mer(seqA,
|
||||
&arena.pointer.fastIndex,
|
||||
&arena.pointer.fastBuffer)
|
||||
|
||||
// At least one mismatch exists in the overlaping region
|
||||
if fastScore+3 < over {
|
||||
shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, seqA.Len(), seqB, fastScoreRel, nil)
|
||||
|
||||
if shift > 0 {
|
||||
startA = shift - delta
|
||||
if startA < 0 {
|
||||
startA = 0
|
||||
}
|
||||
extra5 = -startA
|
||||
startB = 0
|
||||
over = seqA.Len() - shift
|
||||
} else {
|
||||
over = seqB.Len() + shift
|
||||
}
|
||||
|
||||
rawSeqA = seqA.Sequence()[startA:]
|
||||
qualSeqA = seqA.Qualities()[startA:]
|
||||
partLen = len(rawSeqA)
|
||||
if partLen > seqB.Len() {
|
||||
partLen = seqB.Len()
|
||||
// At least one mismatch exists in the overlaping region
|
||||
if fastCount+3 < over {
|
||||
|
||||
if shift > 0 {
|
||||
startA = shift - delta
|
||||
if startA < 0 {
|
||||
startA = 0
|
||||
}
|
||||
extra5 = -startA
|
||||
startB = 0
|
||||
|
||||
rawSeqA = seqA.Sequence()[startA:]
|
||||
qualSeqA = seqA.Qualities()[startA:]
|
||||
partLen = len(rawSeqA)
|
||||
if partLen > seqB.Len() {
|
||||
partLen = seqB.Len()
|
||||
}
|
||||
rawSeqB = seqB.Sequence()[0:partLen]
|
||||
qualSeqB = seqB.Qualities()[0:partLen]
|
||||
extra3 = seqB.Len() - partLen
|
||||
score = _FillMatrixPeLeftAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
} else {
|
||||
|
||||
startA = 0
|
||||
startB = -shift - delta
|
||||
if startB < 0 {
|
||||
startB = 0
|
||||
}
|
||||
extra5 = startB
|
||||
rawSeqB = seqB.Sequence()[startB:]
|
||||
qualSeqB = seqB.Qualities()[startB:]
|
||||
partLen = len(rawSeqB)
|
||||
if partLen > seqA.Len() {
|
||||
partLen = seqA.Len()
|
||||
}
|
||||
rawSeqA = seqA.Sequence()[:partLen]
|
||||
qualSeqA = seqA.Qualities()[:partLen]
|
||||
extra3 = partLen - seqA.Len()
|
||||
|
||||
score = _FillMatrixPeRightAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
}
|
||||
rawSeqB = seqB.Sequence()[0:partLen]
|
||||
qualSeqB = seqB.Qualities()[0:partLen]
|
||||
extra3 = seqB.Len() - partLen
|
||||
score = _FillMatrixPeLeftAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
|
||||
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&arena.pointer.path)
|
||||
|
||||
} else {
|
||||
|
||||
startA = 0
|
||||
startB = -shift - delta
|
||||
if startB < 0 {
|
||||
startB = 0
|
||||
}
|
||||
extra5 = startB
|
||||
rawSeqB = seqB.Sequence()[startB:]
|
||||
qualSeqB = seqB.Qualities()[startB:]
|
||||
partLen = len(rawSeqB)
|
||||
if partLen > seqA.Len() {
|
||||
partLen = seqA.Len()
|
||||
}
|
||||
rawSeqA = seqA.Sequence()[:partLen]
|
||||
qualSeqA = seqA.Qualities()[:partLen]
|
||||
extra3 = partLen - seqA.Len()
|
||||
// Both overlaping regions are identicals
|
||||
|
||||
score = _FillMatrixPeRightAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
if shift > 0 {
|
||||
startA = shift
|
||||
startB = 0
|
||||
extra5 = -startA
|
||||
qualSeqA = seqA.Qualities()[startA:]
|
||||
partLen = len(qualSeqA)
|
||||
qualSeqB = seqB.Qualities()[0:partLen]
|
||||
extra3 = seqB.Len() - partLen
|
||||
score = 0
|
||||
} else {
|
||||
startA = 0
|
||||
startB = -shift
|
||||
extra5 = startB
|
||||
qualSeqB = seqB.Qualities()[startB:]
|
||||
partLen = len(qualSeqB)
|
||||
extra3 = partLen - seqA.Len()
|
||||
qualSeqA = seqA.Qualities()[:partLen]
|
||||
}
|
||||
score = 0
|
||||
for i, qualA := range qualSeqA {
|
||||
qualB := qualSeqB[i]
|
||||
score += _NucScorePartMatchMatch[qualA][qualB]
|
||||
}
|
||||
arena.pointer.path = arena.pointer.path[:0]
|
||||
arena.pointer.path = append(arena.pointer.path, 0, partLen)
|
||||
}
|
||||
|
||||
arena.pointer.path[0] += extra5
|
||||
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
|
||||
arena.pointer.path[len(arena.pointer.path)-2] += extra3
|
||||
} else {
|
||||
arena.pointer.path = append(arena.pointer.path, extra3, 0)
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// No Fast Heuristic
|
||||
//
|
||||
|
||||
rawSeqA = seqA.Sequence()
|
||||
qualSeqA = seqA.Qualities()
|
||||
rawSeqB = seqB.Sequence()
|
||||
qualSeqB = seqB.Qualities()
|
||||
|
||||
scoreR := _FillMatrixPeRightAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
|
||||
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&arena.pointer.path)
|
||||
|
||||
} else {
|
||||
scoreL := _FillMatrixPeLeftAlign(
|
||||
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
|
||||
// Both overlaping regions are identicals
|
||||
if scoreL > scoreR {
|
||||
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&arena.pointer.path)
|
||||
}
|
||||
|
||||
if shift > 0 {
|
||||
startA = shift
|
||||
startB = 0
|
||||
extra5 = -startA
|
||||
qualSeqA = seqA.Qualities()[startA:]
|
||||
partLen = len(qualSeqA)
|
||||
qualSeqB = seqB.Qualities()[0:partLen]
|
||||
extra3 = seqB.Len() - partLen
|
||||
score = 0
|
||||
} else {
|
||||
startA = 0
|
||||
startB = -shift
|
||||
extra5 = startB
|
||||
qualSeqB = seqB.Qualities()[startB:]
|
||||
partLen = len(qualSeqB)
|
||||
extra3 = partLen - seqA.Len()
|
||||
qualSeqA = seqA.Qualities()[:partLen]
|
||||
}
|
||||
score = 0
|
||||
for i, qualA := range qualSeqA {
|
||||
qualB := qualSeqB[i]
|
||||
score += _NucScorePartMatchMatch[qualA][qualB]
|
||||
}
|
||||
arena.pointer.path = arena.pointer.path[:0]
|
||||
arena.pointer.path = append(arena.pointer.path, 0, partLen)
|
||||
}
|
||||
|
||||
arena.pointer.path[0] += extra5
|
||||
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
|
||||
arena.pointer.path[len(arena.pointer.path)-2] += extra3
|
||||
} else {
|
||||
arena.pointer.path = append(arena.pointer.path, extra3, 0)
|
||||
}
|
||||
|
||||
return score, arena.pointer.path, fastScore, over
|
||||
return score, arena.pointer.path, fastCount, over, fastScore
|
||||
}
|
||||
|
@ -1,6 +1,8 @@
|
||||
package obikmer
|
||||
|
||||
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||
)
|
||||
|
||||
var __single_base_code__ = []byte{0,
|
||||
// A, B, C, D,
|
||||
@ -97,7 +99,7 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
|
||||
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
|
||||
// The returned values are two integer values. The shift between both the sequences and the count of
|
||||
// matching 4mer when this shift is applied between both the sequences.
|
||||
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) {
|
||||
func FastShiftFourMer(index [][]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) {
|
||||
|
||||
iternal_buffer := Encode4mer(seq, buffer)
|
||||
|
||||
@ -116,18 +118,31 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
|
||||
}
|
||||
|
||||
maxshift := 0
|
||||
maxcount := -1
|
||||
maxcount := 0
|
||||
maxscore := -1.0
|
||||
|
||||
for shift, count := range shifts {
|
||||
if count > maxcount {
|
||||
score := float64(count)
|
||||
if relscore {
|
||||
over := -shift
|
||||
if shift > 0 {
|
||||
over += lindex
|
||||
} else {
|
||||
over = seq.Len() - over
|
||||
}
|
||||
score = score / float64(over)
|
||||
}
|
||||
if score > maxscore {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
maxscore = score
|
||||
} else {
|
||||
if count == maxcount && shift < maxshift {
|
||||
if score == maxscore && shift < maxshift {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return maxshift, maxcount
|
||||
return maxshift, maxcount, maxscore
|
||||
}
|
||||
|
@ -13,6 +13,8 @@ var _MinOverlap = 20
|
||||
var _GapPenality = float64(2.0)
|
||||
var _WithoutStats = false
|
||||
var _MinIdentity = 0.9
|
||||
var _NoFastAlign = false
|
||||
var _FastScoreAbs = false
|
||||
|
||||
func PairingOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_ForwardFile, "forward-reads", "",
|
||||
@ -39,6 +41,10 @@ func PairingOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&_WithoutStats, "without-stat", _WithoutStats,
|
||||
options.Alias("S"),
|
||||
options.Description("Remove alignment statistics from the produced consensus sequences."))
|
||||
options.BoolVar(&_NoFastAlign, "exact-mode", _NoFastAlign,
|
||||
options.Description("Do not run fast alignment heuristic."))
|
||||
options.BoolVar(&_FastScoreAbs, "fast-absolute", _FastScoreAbs,
|
||||
options.Description("Compute absolute fast score (no action in exact mode)."))
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@ -82,3 +88,11 @@ func CLIGapPenality() float64 {
|
||||
func CLIWithStats() bool {
|
||||
return !_WithoutStats
|
||||
}
|
||||
|
||||
func CLIFastMode() bool {
|
||||
return !_NoFastAlign
|
||||
}
|
||||
|
||||
func CLIFastRelativeScore() bool {
|
||||
return !_FastScoreAbs
|
||||
}
|
||||
|
@ -99,16 +99,18 @@ func JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool) *obiseq.Bi
|
||||
// destroyed during the assembling process and cannot be reuse later on.
|
||||
// the gap and delta parametters.
|
||||
//
|
||||
// - fastModeRel: if set to true, the FAST score mode is set to relative score
|
||||
//
|
||||
// # Returns
|
||||
//
|
||||
// An obiseq.BioSequence corresponding to the assembling of the both
|
||||
// input sequence.
|
||||
func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
|
||||
gap float64, delta, minOverlap int, minIdentity float64, withStats bool,
|
||||
inplace bool,
|
||||
inplace bool, fastAlign, fastModeRel bool,
|
||||
arenaAlign obialign.PEAlignArena) *obiseq.BioSequence {
|
||||
|
||||
score, path, fastscore, over := obialign.PEAlign(seqA, seqB, gap, delta, arenaAlign)
|
||||
score, path, fastcount, over, fastscore := obialign.PEAlign(seqA, seqB, gap, fastAlign, delta, fastModeRel, arenaAlign)
|
||||
cons, match := obialign.BuildQualityConsensus(seqA, seqB, path, true)
|
||||
|
||||
left := path[0]
|
||||
@ -123,8 +125,12 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
|
||||
identity = 0
|
||||
}
|
||||
annot := cons.Annotations()
|
||||
annot["paring_fast_score"] = fastscore
|
||||
annot["paring_fast_overlap"] = over
|
||||
|
||||
if fastAlign {
|
||||
annot["paring_fast_count"] = fastcount
|
||||
annot["paring_fast_score"] = math.Round(fastscore*1000) / 1000
|
||||
annot["paring_fast_overlap"] = over
|
||||
}
|
||||
|
||||
if aliLength >= minOverlap && identity >= minIdentity {
|
||||
annot["mode"] = "alignment"
|
||||
@ -205,7 +211,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
|
||||
// each pair of processed sequences produces one sequence in the result iterator.
|
||||
func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
gap float64, delta, minOverlap int,
|
||||
minIdentity float64,
|
||||
minIdentity float64, fastAlign, fastModeRel,
|
||||
withStats bool, sizes ...int) obiiter.IBioSequence {
|
||||
|
||||
if !iterator.IsPaired() {
|
||||
@ -235,7 +241,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
cons := make(obiseq.BioSequenceSlice, len(batch.Slice()))
|
||||
for i, A := range batch.Slice() {
|
||||
B := A.PairedWith()
|
||||
cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, arena)
|
||||
cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena)
|
||||
}
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(
|
||||
batch.Order(),
|
||||
|
@ -14,7 +14,7 @@ import (
|
||||
|
||||
func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
||||
gap float64, delta, minOverlap int,
|
||||
minIdentity float64,
|
||||
minIdentity float64, fastAlign, fastScoreRel,
|
||||
withStats bool) obiiter.IBioSequence {
|
||||
|
||||
if !iterator.IsPaired() {
|
||||
@ -50,7 +50,8 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
||||
B := A.PairedWith()
|
||||
consensus := obipairing.AssemblePESequences(
|
||||
A.Copy(), B.ReverseComplement(false),
|
||||
gap, delta, minOverlap, minIdentity, withStats, true, arena,
|
||||
gap, delta, minOverlap, minIdentity, withStats, true,
|
||||
fastAlign, fastScoreRel, arena,
|
||||
)
|
||||
|
||||
consensus, err = ngsfilter.ExtractBarcode(consensus, true)
|
||||
|
Reference in New Issue
Block a user