From b556e045e50777f740bd67519da825bd53347911 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 24 Nov 2023 12:29:37 +0100 Subject: [PATCH] Adds option to tune the pairing of the sequences in obipairing and some stats to the results Former-commit-id: a6cf9cb4d4ab20a433a2534fd7d11cd3ca8ebbaa --- cmd/obitools/obipairing/main.go | 8 +- cmd/obitools/obitagpcr/main.go | 2 + pkg/obialign/pairedendalign.go | 206 +++++++++++++++++------------ pkg/obikmer/encodefourmer.go | 27 +++- pkg/obitools/obipairing/options.go | 14 ++ pkg/obitools/obipairing/pairing.go | 18 ++- pkg/obitools/obitagpcr/pcrtag.go | 5 +- 7 files changed, 178 insertions(+), 102 deletions(-) diff --git a/cmd/obitools/obipairing/main.go b/cmd/obitools/obipairing/main.go index c242fa1..9589510 100644 --- a/cmd/obitools/obipairing/main.go +++ b/cmd/obitools/obipairing/main.go @@ -1,8 +1,8 @@ package main import ( - "os" log "github.com/sirupsen/logrus" + "os" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" @@ -34,7 +34,7 @@ func main() { pairs, err := obipairing.CLIPairedSequence() if err != nil { - log.Errorf("Cannot open file (%v)",err) + log.Errorf("Cannot open file (%v)", err) os.Exit(1) } @@ -43,10 +43,12 @@ func main() { obipairing.CLIDelta(), obipairing.CLIMinOverlap(), obipairing.CLIMinIdentity(), + obipairing.CLIFastMode(), + obipairing.CLIFastRelativeScore(), obipairing.CLIWithStats(), obioptions.CLIParallelWorkers(), ) - + obiconvert.CLIWriteBioSequences(paired, true) obiiter.WaitForLastPipe() diff --git a/cmd/obitools/obitagpcr/main.go b/cmd/obitools/obitagpcr/main.go index 68b6248..d89d750 100644 --- a/cmd/obitools/obitagpcr/main.go +++ b/cmd/obitools/obitagpcr/main.go @@ -47,6 +47,8 @@ func main() { obipairing.CLIDelta(), obipairing.CLIMinOverlap(), obipairing.CLIMinIdentity(), + obipairing.CLIFastMode(), + obipairing.CLIFastRelativeScore(), obipairing.CLIWithStats()) obiconvert.CLIWriteBioSequences(paired, true) diff --git a/pkg/obialign/pairedendalign.go b/pkg/obialign/pairedendalign.go index a481ade..025afed 100644 --- a/pkg/obialign/pairedendalign.go +++ b/pkg/obialign/pairedendalign.go @@ -351,8 +351,8 @@ func PERightAlign(seqA, seqB *obiseq.BioSequence, gap float64, } func PEAlign(seqA, seqB *obiseq.BioSequence, - gap float64, delta int, - arena PEAlignArena) (int, []int, int, int) { + gap float64, fastAlign bool, delta int, fastScoreRel bool, + arena PEAlignArena) (int, []int, int, int, float64) { var score, shift int var startA, startB int var partLen, over int @@ -365,107 +365,143 @@ func PEAlign(seqA, seqB *obiseq.BioSequence, _InitDNAScoreMatrix() } - index := obikmer.Index4mer(seqA, - &arena.pointer.fastIndex, - &arena.pointer.fastBuffer) + fastCount := -1 + fastScore := -1.0 - shift, fastScore := obikmer.FastShiftFourMer(index, seqB, nil) + if fastAlign { - if shift > 0 { - over = seqA.Len() - shift - } else { - over = seqB.Len() + shift - } + index := obikmer.Index4mer(seqA, + &arena.pointer.fastIndex, + &arena.pointer.fastBuffer) - // At least one mismatch exists in the overlaping region - if fastScore+3 < over { + shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, seqA.Len(), seqB, fastScoreRel, nil) if shift > 0 { - startA = shift - delta - if startA < 0 { - startA = 0 - } - extra5 = -startA - startB = 0 + over = seqA.Len() - shift + } else { + over = seqB.Len() + shift + } - rawSeqA = seqA.Sequence()[startA:] - qualSeqA = seqA.Qualities()[startA:] - partLen = len(rawSeqA) - if partLen > seqB.Len() { - partLen = seqB.Len() + // At least one mismatch exists in the overlaping region + if fastCount+3 < over { + + if shift > 0 { + startA = shift - delta + if startA < 0 { + startA = 0 + } + extra5 = -startA + startB = 0 + + rawSeqA = seqA.Sequence()[startA:] + qualSeqA = seqA.Qualities()[startA:] + partLen = len(rawSeqA) + if partLen > seqB.Len() { + partLen = seqB.Len() + } + rawSeqB = seqB.Sequence()[0:partLen] + qualSeqB = seqB.Qualities()[0:partLen] + extra3 = seqB.Len() - partLen + score = _FillMatrixPeLeftAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) + } else { + + startA = 0 + startB = -shift - delta + if startB < 0 { + startB = 0 + } + extra5 = startB + rawSeqB = seqB.Sequence()[startB:] + qualSeqB = seqB.Qualities()[startB:] + partLen = len(rawSeqB) + if partLen > seqA.Len() { + partLen = seqA.Len() + } + rawSeqA = seqA.Sequence()[:partLen] + qualSeqA = seqA.Qualities()[:partLen] + extra3 = partLen - seqA.Len() + + score = _FillMatrixPeRightAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) } - rawSeqB = seqB.Sequence()[0:partLen] - qualSeqB = seqB.Qualities()[0:partLen] - extra3 = seqB.Len() - partLen - score = _FillMatrixPeLeftAlign( - rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, - &arena.pointer.scoreMatrix, - &arena.pointer.pathMatrix) + + arena.pointer.path = _Backtracking(arena.pointer.pathMatrix, + len(rawSeqA), len(rawSeqB), + &arena.pointer.path) + } else { - startA = 0 - startB = -shift - delta - if startB < 0 { - startB = 0 - } - extra5 = startB - rawSeqB = seqB.Sequence()[startB:] - qualSeqB = seqB.Qualities()[startB:] - partLen = len(rawSeqB) - if partLen > seqA.Len() { - partLen = seqA.Len() - } - rawSeqA = seqA.Sequence()[:partLen] - qualSeqA = seqA.Qualities()[:partLen] - extra3 = partLen - seqA.Len() + // Both overlaping regions are identicals - score = _FillMatrixPeRightAlign( - rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, - &arena.pointer.scoreMatrix, - &arena.pointer.pathMatrix) + if shift > 0 { + startA = shift + startB = 0 + extra5 = -startA + qualSeqA = seqA.Qualities()[startA:] + partLen = len(qualSeqA) + qualSeqB = seqB.Qualities()[0:partLen] + extra3 = seqB.Len() - partLen + score = 0 + } else { + startA = 0 + startB = -shift + extra5 = startB + qualSeqB = seqB.Qualities()[startB:] + partLen = len(qualSeqB) + extra3 = partLen - seqA.Len() + qualSeqA = seqA.Qualities()[:partLen] + } + score = 0 + for i, qualA := range qualSeqA { + qualB := qualSeqB[i] + score += _NucScorePartMatchMatch[qualA][qualB] + } + arena.pointer.path = arena.pointer.path[:0] + arena.pointer.path = append(arena.pointer.path, 0, partLen) } + arena.pointer.path[0] += extra5 + if arena.pointer.path[len(arena.pointer.path)-1] == 0 { + arena.pointer.path[len(arena.pointer.path)-2] += extra3 + } else { + arena.pointer.path = append(arena.pointer.path, extra3, 0) + } + } else { + // + // No Fast Heuristic + // + + rawSeqA = seqA.Sequence() + qualSeqA = seqA.Qualities() + rawSeqB = seqB.Sequence() + qualSeqB = seqB.Qualities() + + scoreR := _FillMatrixPeRightAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) + arena.pointer.path = _Backtracking(arena.pointer.pathMatrix, len(rawSeqA), len(rawSeqB), &arena.pointer.path) - } else { + scoreL := _FillMatrixPeLeftAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) - // Both overlaping regions are identicals + if scoreL > scoreR { + arena.pointer.path = _Backtracking(arena.pointer.pathMatrix, + len(rawSeqA), len(rawSeqB), + &arena.pointer.path) + } - if shift > 0 { - startA = shift - startB = 0 - extra5 = -startA - qualSeqA = seqA.Qualities()[startA:] - partLen = len(qualSeqA) - qualSeqB = seqB.Qualities()[0:partLen] - extra3 = seqB.Len() - partLen - score = 0 - } else { - startA = 0 - startB = -shift - extra5 = startB - qualSeqB = seqB.Qualities()[startB:] - partLen = len(qualSeqB) - extra3 = partLen - seqA.Len() - qualSeqA = seqA.Qualities()[:partLen] - } - score = 0 - for i, qualA := range qualSeqA { - qualB := qualSeqB[i] - score += _NucScorePartMatchMatch[qualA][qualB] - } - arena.pointer.path = arena.pointer.path[:0] - arena.pointer.path = append(arena.pointer.path, 0, partLen) } - arena.pointer.path[0] += extra5 - if arena.pointer.path[len(arena.pointer.path)-1] == 0 { - arena.pointer.path[len(arena.pointer.path)-2] += extra3 - } else { - arena.pointer.path = append(arena.pointer.path, extra3, 0) - } - - return score, arena.pointer.path, fastScore, over + return score, arena.pointer.path, fastCount, over, fastScore } diff --git a/pkg/obikmer/encodefourmer.go b/pkg/obikmer/encodefourmer.go index 6ddfffa..3074923 100644 --- a/pkg/obikmer/encodefourmer.go +++ b/pkg/obikmer/encodefourmer.go @@ -1,6 +1,8 @@ package obikmer -import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" +) var __single_base_code__ = []byte{0, // A, B, C, D, @@ -97,7 +99,7 @@ func Index4mer(seq *obiseq.BioSequence, index *[][]int, buffer *[]byte) [][]int // FastShiftFourMer runs a Fast algorithm (similar to the one used in FASTA) to compare two sequences. // The returned values are two integer values. The shift between both the sequences and the count of // matching 4mer when this shift is applied between both the sequences. -func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (int, int) { +func FastShiftFourMer(index [][]int, lindex int, seq *obiseq.BioSequence, relscore bool, buffer *[]byte) (int, int, float64) { iternal_buffer := Encode4mer(seq, buffer) @@ -116,18 +118,31 @@ func FastShiftFourMer(index [][]int, seq *obiseq.BioSequence, buffer *[]byte) (i } maxshift := 0 - maxcount := -1 + maxcount := 0 + maxscore := -1.0 for shift, count := range shifts { - if count > maxcount { + score := float64(count) + if relscore { + over := -shift + if shift > 0 { + over += lindex + } else { + over = seq.Len() - over + } + score = score / float64(over) + } + if score > maxscore { maxshift = shift maxcount = count + maxscore = score } else { - if count == maxcount && shift < maxshift { + if score == maxscore && shift < maxshift { maxshift = shift + maxcount = count } } } - return maxshift, maxcount + return maxshift, maxcount, maxscore } diff --git a/pkg/obitools/obipairing/options.go b/pkg/obitools/obipairing/options.go index 479d45f..41d7bf5 100644 --- a/pkg/obitools/obipairing/options.go +++ b/pkg/obitools/obipairing/options.go @@ -13,6 +13,8 @@ var _MinOverlap = 20 var _GapPenality = float64(2.0) var _WithoutStats = false var _MinIdentity = 0.9 +var _NoFastAlign = false +var _FastScoreAbs = false func PairingOptionSet(options *getoptions.GetOpt) { options.StringVar(&_ForwardFile, "forward-reads", "", @@ -39,6 +41,10 @@ func PairingOptionSet(options *getoptions.GetOpt) { options.BoolVar(&_WithoutStats, "without-stat", _WithoutStats, options.Alias("S"), options.Description("Remove alignment statistics from the produced consensus sequences.")) + options.BoolVar(&_NoFastAlign, "exact-mode", _NoFastAlign, + options.Description("Do not run fast alignment heuristic.")) + options.BoolVar(&_FastScoreAbs, "fast-absolute", _FastScoreAbs, + options.Description("Compute absolute fast score (no action in exact mode).")) } func OptionSet(options *getoptions.GetOpt) { @@ -82,3 +88,11 @@ func CLIGapPenality() float64 { func CLIWithStats() bool { return !_WithoutStats } + +func CLIFastMode() bool { + return !_NoFastAlign +} + +func CLIFastRelativeScore() bool { + return !_FastScoreAbs +} diff --git a/pkg/obitools/obipairing/pairing.go b/pkg/obitools/obipairing/pairing.go index 9d41fba..192536c 100644 --- a/pkg/obitools/obipairing/pairing.go +++ b/pkg/obitools/obipairing/pairing.go @@ -99,16 +99,18 @@ func JoinPairedSequence(seqA, seqB *obiseq.BioSequence, inplace bool) *obiseq.Bi // destroyed during the assembling process and cannot be reuse later on. // the gap and delta parametters. // +// - fastModeRel: if set to true, the FAST score mode is set to relative score +// // # Returns // // An obiseq.BioSequence corresponding to the assembling of the both // input sequence. func AssemblePESequences(seqA, seqB *obiseq.BioSequence, gap float64, delta, minOverlap int, minIdentity float64, withStats bool, - inplace bool, + inplace bool, fastAlign, fastModeRel bool, arenaAlign obialign.PEAlignArena) *obiseq.BioSequence { - score, path, fastscore, over := obialign.PEAlign(seqA, seqB, gap, delta, arenaAlign) + score, path, fastcount, over, fastscore := obialign.PEAlign(seqA, seqB, gap, fastAlign, delta, fastModeRel, arenaAlign) cons, match := obialign.BuildQualityConsensus(seqA, seqB, path, true) left := path[0] @@ -123,8 +125,12 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence, identity = 0 } annot := cons.Annotations() - annot["paring_fast_score"] = fastscore - annot["paring_fast_overlap"] = over + + if fastAlign { + annot["paring_fast_count"] = fastcount + annot["paring_fast_score"] = math.Round(fastscore*1000) / 1000 + annot["paring_fast_overlap"] = over + } if aliLength >= minOverlap && identity >= minIdentity { annot["mode"] = "alignment" @@ -205,7 +211,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence, // each pair of processed sequences produces one sequence in the result iterator. func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, gap float64, delta, minOverlap int, - minIdentity float64, + minIdentity float64, fastAlign, fastModeRel, withStats bool, sizes ...int) obiiter.IBioSequence { if !iterator.IsPaired() { @@ -235,7 +241,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence, cons := make(obiseq.BioSequenceSlice, len(batch.Slice())) for i, A := range batch.Slice() { B := A.PairedWith() - cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, arena) + cons[i] = AssemblePESequences(A, B.ReverseComplement(true), gap, delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena) } newIter.Push(obiiter.MakeBioSequenceBatch( batch.Order(), diff --git a/pkg/obitools/obitagpcr/pcrtag.go b/pkg/obitools/obitagpcr/pcrtag.go index 0ec0788..d3fb4ad 100644 --- a/pkg/obitools/obitagpcr/pcrtag.go +++ b/pkg/obitools/obitagpcr/pcrtag.go @@ -14,7 +14,7 @@ import ( func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, gap float64, delta, minOverlap int, - minIdentity float64, + minIdentity float64, fastAlign, fastScoreRel, withStats bool) obiiter.IBioSequence { if !iterator.IsPaired() { @@ -50,7 +50,8 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, B := A.PairedWith() consensus := obipairing.AssemblePESequences( A.Copy(), B.ReverseComplement(false), - gap, delta, minOverlap, minIdentity, withStats, true, arena, + gap, delta, minOverlap, minIdentity, withStats, true, + fastAlign, fastScoreRel, arena, ) consensus, err = ngsfilter.ExtractBarcode(consensus, true)