From e9dcacbf24f6d1348a643a4930a7c1c500bc47fe Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 19 Apr 2023 09:12:12 +0200 Subject: [PATCH] Correction of one base deletion in the consensus Former-commit-id: 6aabd8bfdb5263a2285718cb2eca27628c0e717c --- Release-notes.md | 4 +- pkg/obikmer/debruijn.go | 72 ++++++++++++++++++----- pkg/obitools/obiconsensus/obiconsensus.go | 2 +- 3 files changed, 60 insertions(+), 18 deletions(-) diff --git a/Release-notes.md b/Release-notes.md index 490670e..5929729 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -35,11 +35,13 @@ - Rename the `forward_mismatches` and `reverse_mismatches` from instanced by `obimutiplex` into `forward_error` and `reverse_error` to be coherent with the tags instanced by `obipcr` - + ### Corrected bugs - Correction of a bug in memory management and Slice recycling. - Correction of the **--fragmented** option help and logging information +- Correction of a bug in `obiconsensus` leading into the deletion of a base close to the + beginning of the consensus sequence. ## March 31th, 2023. Release 4.0.2 diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index a496f94..a739855 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -3,10 +3,12 @@ package obikmer import ( "bytes" "fmt" - "log" "math" + "math/bits" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "github.com/daichi-m/go18ds/sets/linkedhashset" + "github.com/daichi-m/go18ds/stacks/arraystack" ) type KmerIdx32 uint32 @@ -134,7 +136,7 @@ func (g *DeBruijnGraph) Previouses(index uint64) []uint64 { func (g *DeBruijnGraph) Nexts(index uint64) []uint64 { rep := make([]uint64, 0, 4) - index = index << 2 & g.kmermask + index = (index << 2) & g.kmermask if _, ok := g.graph[index]; ok { rep = append(rep, index) @@ -199,7 +201,6 @@ func (g *DeBruijnGraph) LongestPath() []uint64 { ok := true starts := g.Heads() - log.Println(starts) for _, idx := range starts { lp := make([]uint64, 0, 1000) ok = true @@ -211,7 +212,6 @@ func (g *DeBruijnGraph) LongestPath() []uint64 { idx, ok = g.MaxNext(idx) } - log.Printf("max : %d current %d", wmax, w) if w > wmax { path = lp wmax = w @@ -282,7 +282,7 @@ func (g *DeBruijnGraph) DecodePath(path []uint64) string { if len(path) > 0 { buf.WriteString(g.DecodeNode(path[0])) - for _, idx := range path[1:] { + for _, idx := range path { buf.WriteByte(decode[idx&3]) } } @@ -323,22 +323,13 @@ func (graph *DeBruijnGraph) append(sequence []byte, current uint64) { b := iupac[sequence[i]] if len(b) == 1 { current |= b[0] - - weight, ok := graph.graph[current] - if !ok { - weight = 0 - } - graph.graph[current] = weight + 1 + graph.graph[current] = uint(graph.Weight(current) + 1) } else { for j := 0; j < len(b); j++ { current &= ^uint64(3) current |= b[j] - weight, ok := graph.graph[current] - if !ok { - weight = 0 - } - graph.graph[current] = weight + 1 + graph.graph[current] = uint(graph.Weight(current) + 1) graph.append(sequence[(i+1):], current) } return @@ -430,3 +421,52 @@ func (graph *DeBruijnGraph) Gml() string { return buffer.String() } + +// fonction tri_topologique(G, V): +// T <- une liste vide pour stocker l'ordre topologique +// S <- une pile vide pour stocker les nœuds sans prédécesseurs +// pour chaque nœud v dans V: +// si Pred(v) est vide: +// empiler S avec v +// tant que S n'est pas vide: +// nœud <- dépiler S +// ajouter nœud à T +// pour chaque successeur s de nœud: +// supprimer l'arc (nœud, s) de G +// si Pred(s) est vide: +// empiler S avec s +// si G contient encore des arcs: +// renvoyer une erreur (le graphe contient au moins un cycle) +// sinon: +// renvoyer T (l'ordre topologique) + +// A topological sort of the graph. +func (g *DeBruijnGraph) PartialOrder() *linkedhashset.Set[uint64] { + S := arraystack.New[uint64]() + T := linkedhashset.New[uint64]() + + for v := range g.graph { + if len(g.Previouses(v)) == 0 { + S.Push(v) + } + } + + for !S.Empty() { + v, _ := S.Pop() + T.Add(v) + for _, w := range g.Nexts(v) { + if T.Contains(g.Previouses(w)...) { + S.Push(w) + } + } + } + return T +} + +// Calculating the hamming distance between two k-mers. +func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int { + ident := ^((kmer1 & kmer2) | (^kmer1 & ^kmer2)) + ident |= (ident >> 1) + ident &= 0x5555555555555555 & g.kmermask + return bits.OnesCount64(ident) +} diff --git a/pkg/obitools/obiconsensus/obiconsensus.go b/pkg/obitools/obiconsensus/obiconsensus.go index b1d0b1e..2c33df4 100644 --- a/pkg/obitools/obiconsensus/obiconsensus.go +++ b/pkg/obitools/obiconsensus/obiconsensus.go @@ -83,7 +83,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, } graph.FilterMin(threshold) - + log.Printf("Graph size : %d\n", graph.Len()) if save_graph {