Adds option to tune the pairing of the sequences in obipairing and some stats to the results

Former-commit-id: a6cf9cb4d4ab20a433a2534fd7d11cd3ca8ebbaa
This commit is contained in:
2023-11-24 12:29:37 +01:00
parent ec31ae86b9
commit b556e045e5
7 changed files with 178 additions and 102 deletions

View File

@ -1,8 +1,8 @@
package main package main
import ( import (
"os"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
@ -34,7 +34,7 @@ func main() {
pairs, err := obipairing.CLIPairedSequence() pairs, err := obipairing.CLIPairedSequence()
if err != nil { if err != nil {
log.Errorf("Cannot open file (%v)",err) log.Errorf("Cannot open file (%v)", err)
os.Exit(1) os.Exit(1)
} }
@ -43,6 +43,8 @@ func main() {
obipairing.CLIDelta(), obipairing.CLIDelta(),
obipairing.CLIMinOverlap(), obipairing.CLIMinOverlap(),
obipairing.CLIMinIdentity(), obipairing.CLIMinIdentity(),
obipairing.CLIFastMode(),
obipairing.CLIFastRelativeScore(),
obipairing.CLIWithStats(), obipairing.CLIWithStats(),
obioptions.CLIParallelWorkers(), obioptions.CLIParallelWorkers(),
) )

View File

@ -47,6 +47,8 @@ func main() {
obipairing.CLIDelta(), obipairing.CLIDelta(),
obipairing.CLIMinOverlap(), obipairing.CLIMinOverlap(),
obipairing.CLIMinIdentity(), obipairing.CLIMinIdentity(),
obipairing.CLIFastMode(),
obipairing.CLIFastRelativeScore(),
obipairing.CLIWithStats()) obipairing.CLIWithStats())
obiconvert.CLIWriteBioSequences(paired, true) obiconvert.CLIWriteBioSequences(paired, true)

View File

@ -351,8 +351,8 @@ func PERightAlign(seqA, seqB *obiseq.BioSequence, gap float64,
} }
func PEAlign(seqA, seqB *obiseq.BioSequence, func PEAlign(seqA, seqB *obiseq.BioSequence,
gap float64, delta int, gap float64, fastAlign bool, delta int, fastScoreRel bool,
arena PEAlignArena) (int, []int, int, int) { arena PEAlignArena) (int, []int, int, int, float64) {
var score, shift int var score, shift int
var startA, startB int var startA, startB int
var partLen, over int var partLen, over int
@ -365,107 +365,143 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
_InitDNAScoreMatrix() _InitDNAScoreMatrix()
} }
index := obikmer.Index4mer(seqA, fastCount := -1
&arena.pointer.fastIndex, fastScore := -1.0
&arena.pointer.fastBuffer)
shift, fastScore := obikmer.FastShiftFourMer(index, seqB, nil) if fastAlign {
if shift > 0 { index := obikmer.Index4mer(seqA,
over = seqA.Len() - shift &arena.pointer.fastIndex,
} else { &arena.pointer.fastBuffer)
over = seqB.Len() + shift
}
// At least one mismatch exists in the overlaping region shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, seqA.Len(), seqB, fastScoreRel, nil)
if fastScore+3 < over {
if shift > 0 { if shift > 0 {
startA = shift - delta over = seqA.Len() - shift
if startA < 0 { } else {
startA = 0 over = seqB.Len() + shift
} }
extra5 = -startA
startB = 0
rawSeqA = seqA.Sequence()[startA:] // At least one mismatch exists in the overlaping region
qualSeqA = seqA.Qualities()[startA:] if fastCount+3 < over {
partLen = len(rawSeqA)
if partLen > seqB.Len() { if shift > 0 {
partLen = seqB.Len() startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
rawSeqA = seqA.Sequence()[startA:]
qualSeqA = seqA.Qualities()[startA:]
partLen = len(rawSeqA)
if partLen > seqB.Len() {
partLen = seqB.Len()
}
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
} else {
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
if partLen > seqA.Len() {
partLen = seqA.Len()
}
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len()
score = _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
} }
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen] arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
extra3 = seqB.Len() - partLen len(rawSeqA), len(rawSeqB),
score = _FillMatrixPeLeftAlign( &arena.pointer.path)
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
} else { } else {
startA = 0 // Both overlaping regions are identicals
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
if partLen > seqA.Len() {
partLen = seqA.Len()
}
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len()
score = _FillMatrixPeRightAlign( if shift > 0 {
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, startA = shift
&arena.pointer.scoreMatrix, startB = 0
&arena.pointer.pathMatrix) extra5 = -startA
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen]
}
score = 0
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
score += _NucScorePartMatchMatch[qualA][qualB]
}
arena.pointer.path = arena.pointer.path[:0]
arena.pointer.path = append(arena.pointer.path, 0, partLen)
} }
arena.pointer.path[0] += extra5
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
} else {
//
// No Fast Heuristic
//
rawSeqA = seqA.Sequence()
qualSeqA = seqA.Qualities()
rawSeqB = seqB.Sequence()
qualSeqB = seqB.Qualities()
scoreR := _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix, arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB), len(rawSeqA), len(rawSeqB),
&arena.pointer.path) &arena.pointer.path)
} else { scoreL := _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
// Both overlaping regions are identicals if scoreL > scoreR {
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&arena.pointer.path)
}
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen]
}
score = 0
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
score += _NucScorePartMatchMatch[qualA][qualB]
}
arena.pointer.path = arena.pointer.path[:0]
arena.pointer.path = append(arena.pointer.path, 0, partLen)
} }
arena.pointer.path[0] += extra5 return score, arena.pointer.path, fastCount, over, fastScore
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
return score, arena.pointer.path, fastScore, over
} }

View File

@ -1,6 +1,8 @@
package obikmer package obikmer
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
var __single_base_code__ = []byte{0, var __single_base_code__ = []byte{0,
// A, B, C, D, // A, B, C, D,
@ -97,7 +99,7 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences. // FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
// The returned values are two integer values. The shift between both the sequences and the count of // The returned values are two integer values. The shift between both the sequences and the count of
// matching 4mer when this shift is applied between both the sequences. // matching 4mer when this shift is applied between both the sequences.
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) { func FastShiftFourMer(index [][]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) {
iternal_buffer := Encode4mer(seq, buffer) iternal_buffer := Encode4mer(seq, buffer)
@ -116,18 +118,31 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
} }
maxshift := 0 maxshift := 0
maxcount := -1 maxcount := 0
maxscore := -1.0
for shift, count := range shifts { for shift, count := range shifts {
if count > maxcount { score := float64(count)
if relscore {
over := -shift
if shift > 0 {
over += lindex
} else {
over = seq.Len() - over
}
score = score / float64(over)
}
if score > maxscore {
maxshift = shift maxshift = shift
maxcount = count maxcount = count
maxscore = score
} else { } else {
if count == maxcount && shift < maxshift { if score == maxscore && shift < maxshift {
maxshift = shift maxshift = shift
maxcount = count
} }
} }
} }
return maxshift, maxcount return maxshift, maxcount, maxscore
} }

View File

@ -13,6 +13,8 @@ var _MinOverlap = 20
var _GapPenality = float64(2.0) var _GapPenality = float64(2.0)
var _WithoutStats = false var _WithoutStats = false
var _MinIdentity = 0.9 var _MinIdentity = 0.9
var _NoFastAlign = false
var _FastScoreAbs = false
func PairingOptionSet(options *getoptions.GetOpt) { func PairingOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_ForwardFile, "forward-reads", "", options.StringVar(&_ForwardFile, "forward-reads", "",
@ -39,6 +41,10 @@ func PairingOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_WithoutStats, "without-stat", _WithoutStats, options.BoolVar(&_WithoutStats, "without-stat", _WithoutStats,
options.Alias("S"), options.Alias("S"),
options.Description("Remove alignment statistics from the produced consensus sequences.")) options.Description("Remove alignment statistics from the produced consensus sequences."))
options.BoolVar(&_NoFastAlign, "exact-mode", _NoFastAlign,
options.Description("Do not run fast alignment heuristic."))
options.BoolVar(&_FastScoreAbs, "fast-absolute", _FastScoreAbs,
options.Description("Compute absolute fast score (no action in exact mode)."))
} }
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
@ -82,3 +88,11 @@ func CLIGapPenality() float64 {
func CLIWithStats() bool { func CLIWithStats() bool {
return !_WithoutStats return !_WithoutStats
} }
func CLIFastMode() bool {
return !_NoFastAlign
}
func CLIFastRelativeScore() bool {
return !_FastScoreAbs
}

View File

@ -99,16 +99,18 @@ func JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool) *obiseq.Bi
// destroyed during the assembling process and cannot be reuse later on. // destroyed during the assembling process and cannot be reuse later on.
// the gap and delta parametters. // the gap and delta parametters.
// //
// - fastModeRel: if set to true, the FAST score mode is set to relative score
//
// # Returns // # Returns
// //
// An obiseq.BioSequence corresponding to the assembling of the both // An obiseq.BioSequence corresponding to the assembling of the both
// input sequence. // input sequence.
func AssemblePESequences(seqA, seqB *obiseq.BioSequence, func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
gap float64, delta, minOverlap int, minIdentity float64, withStats bool, gap float64, delta, minOverlap int, minIdentity float64, withStats bool,
inplace bool, inplace bool, fastAlign, fastModeRel bool,
arenaAlign obialign.PEAlignArena) *obiseq.BioSequence { arenaAlign obialign.PEAlignArena) *obiseq.BioSequence {
score, path, fastscore, over := obialign.PEAlign(seqA, seqB, gap, delta, arenaAlign) score, path, fastcount, over, fastscore := obialign.PEAlign(seqA, seqB, gap, fastAlign, delta, fastModeRel, arenaAlign)
cons, match := obialign.BuildQualityConsensus(seqA, seqB, path, true) cons, match := obialign.BuildQualityConsensus(seqA, seqB, path, true)
left := path[0] left := path[0]
@ -123,8 +125,12 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
identity = 0 identity = 0
} }
annot := cons.Annotations() annot := cons.Annotations()
annot["paring_fast_score"] = fastscore
annot["paring_fast_overlap"] = over if fastAlign {
annot["paring_fast_count"] = fastcount
annot["paring_fast_score"] = math.Round(fastscore*1000) / 1000
annot["paring_fast_overlap"] = over
}
if aliLength >= minOverlap && identity >= minIdentity { if aliLength >= minOverlap && identity >= minIdentity {
annot["mode"] = "alignment" annot["mode"] = "alignment"
@ -205,7 +211,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
// each pair of processed sequences produces one sequence in the result iterator. // each pair of processed sequences produces one sequence in the result iterator.
func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int, gap float64, delta, minOverlap int,
minIdentity float64, minIdentity float64, fastAlign, fastModeRel,
withStats bool, sizes ...int) obiiter.IBioSequence { withStats bool, sizes ...int) obiiter.IBioSequence {
if !iterator.IsPaired() { if !iterator.IsPaired() {
@ -235,7 +241,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
cons := make(obiseq.BioSequenceSlice, len(batch.Slice())) cons := make(obiseq.BioSequenceSlice, len(batch.Slice()))
for i, A := range batch.Slice() { for i, A := range batch.Slice() {
B := A.PairedWith() B := A.PairedWith()
cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, arena) cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena)
} }
newIter.Push(obiiter.MakeBioSequenceBatch( newIter.Push(obiiter.MakeBioSequenceBatch(
batch.Order(), batch.Order(),

View File

@ -14,7 +14,7 @@ import (
func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int, gap float64, delta, minOverlap int,
minIdentity float64, minIdentity float64, fastAlign, fastScoreRel,
withStats bool) obiiter.IBioSequence { withStats bool) obiiter.IBioSequence {
if !iterator.IsPaired() { if !iterator.IsPaired() {
@ -50,7 +50,8 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
B := A.PairedWith() B := A.PairedWith()
consensus := obipairing.AssemblePESequences( consensus := obipairing.AssemblePESequences(
A.Copy(), B.ReverseComplement(false), A.Copy(), B.ReverseComplement(false),
gap, delta, minOverlap, minIdentity, withStats, true, arena, gap, delta, minOverlap, minIdentity, withStats, true,
fastAlign, fastScoreRel, arena,
) )
consensus, err = ngsfilter.ExtractBarcode(consensus, true) consensus, err = ngsfilter.ExtractBarcode(consensus, true)