Adds option to tune the pairing of the sequences in obipairing and some stats to the results

Former-commit-id: a6cf9cb4d4ab20a433a2534fd7d11cd3ca8ebbaa
This commit is contained in:
2023-11-24 12:29:37 +01:00
parent ec31ae86b9
commit b556e045e5
7 changed files with 178 additions and 102 deletions

View File

@ -1,8 +1,8 @@
package main
import (
"os"
log "github.com/sirupsen/logrus"
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
@ -34,7 +34,7 @@ func main() {
pairs, err := obipairing.CLIPairedSequence()
if err != nil {
log.Errorf("Cannot open file (%v)",err)
log.Errorf("Cannot open file (%v)", err)
os.Exit(1)
}
@ -43,10 +43,12 @@ func main() {
obipairing.CLIDelta(),
obipairing.CLIMinOverlap(),
obipairing.CLIMinIdentity(),
obipairing.CLIFastMode(),
obipairing.CLIFastRelativeScore(),
obipairing.CLIWithStats(),
obioptions.CLIParallelWorkers(),
)
obiconvert.CLIWriteBioSequences(paired, true)
obiiter.WaitForLastPipe()

View File

@ -47,6 +47,8 @@ func main() {
obipairing.CLIDelta(),
obipairing.CLIMinOverlap(),
obipairing.CLIMinIdentity(),
obipairing.CLIFastMode(),
obipairing.CLIFastRelativeScore(),
obipairing.CLIWithStats())
obiconvert.CLIWriteBioSequences(paired, true)

View File

@ -351,8 +351,8 @@ func PERightAlign(seqA, seqB *obiseq.BioSequence, gap float64,
}
func PEAlign(seqA, seqB *obiseq.BioSequence,
gap float64, delta int,
arena PEAlignArena) (int, []int, int, int) {
gap float64, fastAlign bool, delta int, fastScoreRel bool,
arena PEAlignArena) (int, []int, int, int, float64) {
var score, shift int
var startA, startB int
var partLen, over int
@ -365,107 +365,143 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
_InitDNAScoreMatrix()
}
index := obikmer.Index4mer(seqA,
&arena.pointer.fastIndex,
&arena.pointer.fastBuffer)
fastCount := -1
fastScore := -1.0
shift, fastScore := obikmer.FastShiftFourMer(index, seqB, nil)
if fastAlign {
if shift > 0 {
over = seqA.Len() - shift
} else {
over = seqB.Len() + shift
}
index := obikmer.Index4mer(seqA,
&arena.pointer.fastIndex,
&arena.pointer.fastBuffer)
// At least one mismatch exists in the overlaping region
if fastScore+3 < over {
shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, seqA.Len(), seqB, fastScoreRel, nil)
if shift > 0 {
startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
over = seqA.Len() - shift
} else {
over = seqB.Len() + shift
}
rawSeqA = seqA.Sequence()[startA:]
qualSeqA = seqA.Qualities()[startA:]
partLen = len(rawSeqA)
if partLen > seqB.Len() {
partLen = seqB.Len()
// At least one mismatch exists in the overlaping region
if fastCount+3 < over {
if shift > 0 {
startA = shift - delta
if startA < 0 {
startA = 0
}
extra5 = -startA
startB = 0
rawSeqA = seqA.Sequence()[startA:]
qualSeqA = seqA.Qualities()[startA:]
partLen = len(rawSeqA)
if partLen > seqB.Len() {
partLen = seqB.Len()
}
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
} else {
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
if partLen > seqA.Len() {
partLen = seqA.Len()
}
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len()
score = _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
}
rawSeqB = seqB.Sequence()[0:partLen]
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&arena.pointer.path)
} else {
startA = 0
startB = -shift - delta
if startB < 0 {
startB = 0
}
extra5 = startB
rawSeqB = seqB.Sequence()[startB:]
qualSeqB = seqB.Qualities()[startB:]
partLen = len(rawSeqB)
if partLen > seqA.Len() {
partLen = seqA.Len()
}
rawSeqA = seqA.Sequence()[:partLen]
qualSeqA = seqA.Qualities()[:partLen]
extra3 = partLen - seqA.Len()
// Both overlaping regions are identicals
score = _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen]
}
score = 0
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
score += _NucScorePartMatchMatch[qualA][qualB]
}
arena.pointer.path = arena.pointer.path[:0]
arena.pointer.path = append(arena.pointer.path, 0, partLen)
}
arena.pointer.path[0] += extra5
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
} else {
//
// No Fast Heuristic
//
rawSeqA = seqA.Sequence()
qualSeqA = seqA.Qualities()
rawSeqB = seqB.Sequence()
qualSeqB = seqB.Qualities()
scoreR := _FillMatrixPeRightAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&arena.pointer.path)
} else {
scoreL := _FillMatrixPeLeftAlign(
rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap,
&arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix)
// Both overlaping regions are identicals
if scoreL > scoreR {
arena.pointer.path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB),
&arena.pointer.path)
}
if shift > 0 {
startA = shift
startB = 0
extra5 = -startA
qualSeqA = seqA.Qualities()[startA:]
partLen = len(qualSeqA)
qualSeqB = seqB.Qualities()[0:partLen]
extra3 = seqB.Len() - partLen
score = 0
} else {
startA = 0
startB = -shift
extra5 = startB
qualSeqB = seqB.Qualities()[startB:]
partLen = len(qualSeqB)
extra3 = partLen - seqA.Len()
qualSeqA = seqA.Qualities()[:partLen]
}
score = 0
for i, qualA := range qualSeqA {
qualB := qualSeqB[i]
score += _NucScorePartMatchMatch[qualA][qualB]
}
arena.pointer.path = arena.pointer.path[:0]
arena.pointer.path = append(arena.pointer.path, 0, partLen)
}
arena.pointer.path[0] += extra5
if arena.pointer.path[len(arena.pointer.path)-1] == 0 {
arena.pointer.path[len(arena.pointer.path)-2] += extra3
} else {
arena.pointer.path = append(arena.pointer.path, extra3, 0)
}
return score, arena.pointer.path, fastScore, over
return score, arena.pointer.path, fastCount, over, fastScore
}

View File

@ -1,6 +1,8 @@
package obikmer
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
var __single_base_code__ = []byte{0,
// A, B, C, D,
@ -97,7 +99,7 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int
// FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences.
// The returned values are two integer values. The shift between both the sequences and the count of
// matching 4mer when this shift is applied between both the sequences.
func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) {
func FastShiftFourMer(index [][]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) {
iternal_buffer := Encode4mer(seq, buffer)
@ -116,18 +118,31 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i
}
maxshift := 0
maxcount := -1
maxcount := 0
maxscore := -1.0
for shift, count := range shifts {
if count > maxcount {
score := float64(count)
if relscore {
over := -shift
if shift > 0 {
over += lindex
} else {
over = seq.Len() - over
}
score = score / float64(over)
}
if score > maxscore {
maxshift = shift
maxcount = count
maxscore = score
} else {
if count == maxcount && shift < maxshift {
if score == maxscore && shift < maxshift {
maxshift = shift
maxcount = count
}
}
}
return maxshift, maxcount
return maxshift, maxcount, maxscore
}

View File

@ -13,6 +13,8 @@ var _MinOverlap = 20
var _GapPenality = float64(2.0)
var _WithoutStats = false
var _MinIdentity = 0.9
var _NoFastAlign = false
var _FastScoreAbs = false
func PairingOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_ForwardFile, "forward-reads", "",
@ -39,6 +41,10 @@ func PairingOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_WithoutStats, "without-stat", _WithoutStats,
options.Alias("S"),
options.Description("Remove alignment statistics from the produced consensus sequences."))
options.BoolVar(&_NoFastAlign, "exact-mode", _NoFastAlign,
options.Description("Do not run fast alignment heuristic."))
options.BoolVar(&_FastScoreAbs, "fast-absolute", _FastScoreAbs,
options.Description("Compute absolute fast score (no action in exact mode)."))
}
func OptionSet(options *getoptions.GetOpt) {
@ -82,3 +88,11 @@ func CLIGapPenality() float64 {
func CLIWithStats() bool {
return !_WithoutStats
}
func CLIFastMode() bool {
return !_NoFastAlign
}
func CLIFastRelativeScore() bool {
return !_FastScoreAbs
}

View File

@ -99,16 +99,18 @@ func JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool) *obiseq.Bi
// destroyed during the assembling process and cannot be reuse later on.
// the gap and delta parametters.
//
// - fastModeRel: if set to true, the FAST score mode is set to relative score
//
// # Returns
//
// An obiseq.BioSequence corresponding to the assembling of the both
// input sequence.
func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
gap float64, delta, minOverlap int, minIdentity float64, withStats bool,
inplace bool,
inplace bool, fastAlign, fastModeRel bool,
arenaAlign obialign.PEAlignArena) *obiseq.BioSequence {
score, path, fastscore, over := obialign.PEAlign(seqA, seqB, gap, delta, arenaAlign)
score, path, fastcount, over, fastscore := obialign.PEAlign(seqA, seqB, gap, fastAlign, delta, fastModeRel, arenaAlign)
cons, match := obialign.BuildQualityConsensus(seqA, seqB, path, true)
left := path[0]
@ -123,8 +125,12 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
identity = 0
}
annot := cons.Annotations()
annot["paring_fast_score"] = fastscore
annot["paring_fast_overlap"] = over
if fastAlign {
annot["paring_fast_count"] = fastcount
annot["paring_fast_score"] = math.Round(fastscore*1000) / 1000
annot["paring_fast_overlap"] = over
}
if aliLength >= minOverlap && identity >= minIdentity {
annot["mode"] = "alignment"
@ -205,7 +211,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
// each pair of processed sequences produces one sequence in the result iterator.
func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int,
minIdentity float64,
minIdentity float64, fastAlign, fastModeRel,
withStats bool, sizes ...int) obiiter.IBioSequence {
if !iterator.IsPaired() {
@ -235,7 +241,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
cons := make(obiseq.BioSequenceSlice, len(batch.Slice()))
for i, A := range batch.Slice() {
B := A.PairedWith()
cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, arena)
cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena)
}
newIter.Push(obiiter.MakeBioSequenceBatch(
batch.Order(),

View File

@ -14,7 +14,7 @@ import (
func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int,
minIdentity float64,
minIdentity float64, fastAlign, fastScoreRel,
withStats bool) obiiter.IBioSequence {
if !iterator.IsPaired() {
@ -50,7 +50,8 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
B := A.PairedWith()
consensus := obipairing.AssemblePESequences(
A.Copy(), B.ReverseComplement(false),
gap, delta, minOverlap, minIdentity, withStats, true, arena,
gap, delta, minOverlap, minIdentity, withStats, true,
fastAlign, fastScoreRel, arena,
)
consensus, err = ngsfilter.ExtractBarcode(consensus, true)