correction of several small bugs

This commit is contained in:
Eric Coissac
2024-09-03 06:08:07 -03:00
parent 373464cb06
commit 65ae82622e
22 changed files with 770 additions and 79 deletions

View File

@ -1,30 +1,72 @@
package obialign
import log "github.com/sirupsen/logrus"
// buffIndex converts a pair of coordinates (i, j) into a linear index in a
// row-major matrix whose rows are of length width. The coordinates are
// (-1)-indexed, while the linear index is 0-indexed. The function first adds 1
// to both coordinates so that the (-1,-1) coordinate maps to position 0 in the
// matrix, and then computes the linear index by multiplying the first
// coordinate by the width and adding the second coordinate.
func buffIndex(i, j, width int) int {
return (i+1)*width + (j + 1)
}
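// A small worked example of the mapping (illustration only, not part of the
// commit), using width = 5, i.e. a 4-nucleotide pattern:
//
//   buffIndex(-1, -1, 5) == 0    virtual top-left corner of the matrix
//   buffIndex(-1,  3, 5) == 4    last cell of the virtual first row
//   buffIndex( 2, -1, 5) == 15   virtual first column, third sequence row
//   buffIndex( 2,  3, 5) == 19   (2+1)*5 + (3+1)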
// LocatePattern is a function to locate a pattern in a sequence.
//
// It uses a dynamic programming approach to build a matrix of scores.
// The score at each cell is the maximum of the score of the cell
// above it (representing a deletion), the score of the cell to its
// left (representing an insertion), and the score of the cell
// diagonally above it (representing a match).
//
// The score of a match is 0 if the two characters are the same,
// and -1 if they are different.
//
// The function returns the start and end positions of the best
// match, as well as the number of errors in the best match.
func LocatePattern(pattern, sequence []byte) (int, int, int) {
// Pattern spreads over the columns
// Sequence spreads over the rows
width := len(pattern) + 1
buffsize := (len(pattern) + 1) * (len(sequence) + 1)
buffer := make([]int, buffsize)
if len(pattern) >= len(sequence) {
log.Panicf("Pattern %s must be shorter than sequence %s", pattern, sequence)
}
// The path matrix keeps track of the best path through the matrix
// 0 : indicates the diagonal path
// 1 : indicates the up path
// -1 : indicates the left path
path := make([]int, buffsize)
// Initialize the first row of the matrix
for j := 0; j < len(pattern); j++ {
idx := buffIndex(-1, j, width)
buffer[idx] = -j - 1
path[idx] = -1
}
// Initialize the first column of the matrix
// The alignment is end-gap free, so the first column is set to 0
// to allow the primer to shift freely along the sequence
for i := -1; i < len(sequence); i++ {
idx := buffIndex(i, -1, width)
buffer[idx] = 0
path[idx] = +1
}
// Fills the matrix except the last column
// where gaps must be free too.
path[0] = 0
jmax := len(pattern) - 1
for i := 0; i < len(sequence); i++ {
for j := 0; j < jmax; j++ {
// Mismatch score = -1
// Match score = 0
match := -1
if _samenuc(pattern[j], sequence[i]) {
match = 0
@ -33,6 +75,8 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
idx := buffIndex(i, j, width)
diag := buffer[buffIndex(i-1, j-1, width)] + match
// Each gap cost -1
left := buffer[buffIndex(i, j-1, width)] - 1
up := buffer[buffIndex(i-1, j, width)] - 1
@ -51,9 +95,12 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}
}
// Fills the last column, where up gaps are free
for i := 0; i < len(sequence); i++ {
idx := buffIndex(i, jmax, width)
// Mismatch score = -1
// Match score = 0
match := -1
if _samenuc(pattern[jmax], sequence[i]) {
match = 0
@ -65,6 +112,7 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
score := max(diag, up, left)
buffer[idx] = score
switch {
case score == left:
path[idx] = -1
@ -75,11 +123,13 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}
}
// Backtracking of the alignment
i := len(sequence) - 1
j := jmax
end := -1
lali := 0
for i > -1 && j > 0 {
for j > 0 { // was: i > -1 && j > 0
lali++
switch path[buffIndex(i, j, width)] {
case 0:
@ -100,5 +150,9 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}
}
// log.Warnf("from : %d to: %d error: %d match: %v",
// i, end+1, -buffer[buffIndex(len(sequence)-1, len(pattern)-1, width)],
// string(sequence[i:(end+1)]))
return i, end + 1, -buffer[buffIndex(len(sequence)-1, len(pattern)-1, width)]
}
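A minimal usage sketch, not part of the commit; the obialign import path is inferred from the other obitools4 packages appearing elsewhere in this diff:

package main

import (
	"fmt"

	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
)

func main() {
	pattern := []byte("acgtac") // must be strictly shorter than the sequence
	sequence := []byte("ttttacgtacgttttttttt")
	// start and end delimit the best hit of the pattern in the sequence
	// (end excluded); nerr is the number of errors of that alignment.
	start, end, nerr := obialign.LocatePattern(pattern, sequence)
	fmt.Println(start, end, nerr)
}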

View File

@ -137,6 +137,28 @@ char *reverseSequence(char *str,char isPattern)
return str;
}
/* -------------------------------------------- */
/* lowercase sequence */
/* -------------------------------------------- */
#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
#define TO_LOWER(c) ((c) - 'A' + 'a')
char *LowerSequence(char *seq)
{
char *cseq;
for (cseq = seq ; *cseq ; cseq++)
if (IS_UPPER(*cseq))
*cseq = TO_LOWER(*cseq);
return seq;
}
#undef IS_UPPER
#undef TO_LOWER
char *ecoComplementPattern(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
@ -165,6 +187,7 @@ void UpperSequence(char *seq)
/* -------------------------------------------- */
/* encode sequence */
/* IS_UPPER is slightly faster than isupper */

View File

@ -9,6 +9,7 @@ import "C"
import (
"errors"
"runtime"
"strings"
"unsafe"
log "github.com/sirupsen/logrus"
@ -114,7 +115,7 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
C.free(unsafe.Pointer(errmsg))
return ApatPattern{nil}, errors.New(message)
}
spat := C.GoString(apc.cpat)
spat := strings.ToLower(C.GoString(apc.cpat))
ap := _ApatPattern{apc, spat}
runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
@ -366,6 +367,18 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
return
}
// FilterBestMatch filters the best non-overlapping matches of a given pattern in a sequence.
//
// It takes the following parameters:
// - pattern: the pattern to search for (ApatPattern).
// - sequence: the sequence to search in (ApatSequence).
// - begin: the starting index of the search (int).
// - length: the length of the search (int).
//
// It returns a slice of [3]int representing the locations of all non-overlapping matches in the sequence.
// The first two values of the [3]int indicate respectively the start and end positions of
// the match. Following the Go convention, the end position is not included in the
// match. The third value indicates the number of errors detected for this occurrence.
func (pattern ApatPattern) FilterBestMatch(sequence ApatSequence, begin, length int) (loc [][3]int) {
res := pattern.FindAllIndex(sequence, begin, length)
filtered := make([][3]int, 0, len(res))
@ -424,13 +437,15 @@ func (pattern ApatPattern) FilterBestMatch(sequence ApatSequence, begin, length
func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) (loc [][3]int) {
res := pattern.FilterBestMatch(sequence, begin, length)
j := 0
for _, m := range res {
// Recompute the start and end position of the match
// when the pattern allows for indels
if m[2] > 0 && pattern.pointer.pointer.hasIndel {
start := m[0] - m[2]
// log.Warnf("Locating indel on sequence %s[%s]", sequence.pointer.reference.Id(), pattern.String())
start := m[0] - m[2]*2
start = max(start, 0)
end := start + int(pattern.pointer.pointer.patlen) + 2*m[2]
end := start + int(pattern.pointer.pointer.patlen) + 4*m[2]
end = min(end, sequence.Len())
// 1 << 30 = 1,073,741,824 = 1 GB
// It's a virtual array mapping the sequence to the pattern
@ -439,18 +454,21 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int)
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]
begin, end, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg)
pb, pe, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg)
// olderr := m[2]
m[2] = score
m[0] = start + begin
m[1] = start + end
m[0] = start + pb
m[1] = start + pe
// log.Warnf("seq[%d@%d:%d] %d: %s %d - %s:%s:%s", i, m[0], m[1], olderr, sequence.pointer.reference.Id(), score,
// frg, (*cpattern)[0:int(pattern.pointer.pointer.patlen)], sequence.pointer.reference.Sequence()[m[0]:m[1]])
}
if int(pattern.pointer.pointer.maxerr) >= m[2] {
res[j] = m
j++
}
}
// log.Debugf("All matches : %v", res)
return res
return res[0:j]
}
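A sketch of how the [3]int locations documented above are typically consumed (illustration only; pattern and sequence stand for hypothetical, already-built ApatPattern and ApatSequence values):

// pattern: ApatPattern, sequence: ApatSequence (both hypothetical here).
for _, m := range pattern.AllMatches(sequence, 0, sequence.Len()) {
	start, end, nerr := m[0], m[1], m[2]
	// end is excluded, following the Go convention described above
	fmt.Printf("match [%d:%d) with %d error(s)\n", start, end, nerr)
}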

View File

@ -642,6 +642,8 @@ func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) {
}
ngsfilter.CheckPrimerUnicity()
for i := 0; i < len(params); i++ {
param := params[i][1]
if len(params[i]) < 3 {

View File

@ -3,6 +3,8 @@ package obiformats
import (
"bufio"
"bytes"
"encoding/csv"
"errors"
"io"
"path"
"regexp"
@ -39,6 +41,31 @@ type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioS
// - io.Reader: A modified reader with the read data.
// - error: Any error encountered during the process.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
csv := func(in []byte, limit uint32) bool {
in = dropLastLine(in, limit)
br := bytes.NewReader(in)
r := csv.NewReader(br)
r.Comma = ','
r.ReuseRecord = true
r.LazyQuotes = true
r.Comment = '#'
lines := 0
for {
_, err := r.Read()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
return false
}
lines++
}
return r.FieldsPerRecord > 1 && lines > 1
}
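// For illustration (not part of the commit), a buffer like the following
// satisfies the detector above: it parses with more than one field per record
// and more than one data line,
//
//   id,count
//   seq_1,12
//   seq_2,7
//
// whereas a FASTA header such as ">seq_1 count=12" contains no comma, yields a
// single field per record, and is therefore rejected.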
fastaDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^>[^ ]", raw)
return ok && err == nil
@ -70,12 +97,14 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv")
mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta")
mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq")
mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")
// Create a buffer to store the read data
buf := make([]byte, 1024*128)
@ -87,6 +116,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
// Detect the MIME type using the mimetype library
mimeType := mimetype.Detect(buf)
if mimeType == nil {
return nil, nil, err
}

250
pkg/obifp/uint128_test.go Normal file
View File

@ -0,0 +1,250 @@
package obifp
import (
"math"
"reflect"
"testing"
"github.com/stretchr/testify/assert"
)
func TestUint128_Add(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Add(v)
assert.Equal(t, Uint128{w1: 4, w0: 6}, w)
u = Uint128{w1: 0, w0: 0}
v = Uint128{w1: 0, w0: 0}
w = u.Add(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)
u = Uint128{w1: 0, w0: math.MaxUint64}
v = Uint128{w1: 0, w0: 1}
w = u.Add(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}
func TestUint128_Add64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
w := u.Add64(v)
assert.Equal(t, Uint128{w1: 1, w0: 5}, w)
u = Uint128{w1: 0, w0: 0}
v = uint64(0)
w = u.Add64(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)
u = Uint128{w1: 0, w0: math.MaxUint64}
v = uint64(1)
w = u.Add64(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}
func TestUint128_Sub(t *testing.T) {
u := Uint128{w1: 4, w0: 6}
v := Uint128{w1: 3, w0: 4}
w := u.Sub(v)
assert.Equal(t, Uint128{w1: 1, w0: 2}, w)
u = Uint128{w1: 0, w0: 0}
v = Uint128{w1: 0, w0: 0}
w = u.Sub(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)
u = Uint128{w1: 1, w0: 0}
v = Uint128{w1: 0, w0: 1}
w = u.Sub(v)
assert.Equal(t, Uint128{w1: 0, w0: math.MaxUint64}, w)
}
func TestUint128_Mul64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
w := u.Mul64(v)
if w.w1 != 3 || w.w0 != 6 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 3, w0: 6})
}
u = Uint128{w1: 0, w0: 0}
v = uint64(0)
w = u.Mul64(v)
if w.w1 != 0 || w.w0 != 0 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 0, w0: 0})
}
u = Uint128{w1: 0, w0: math.MaxUint64}
v = uint64(2)
w = u.Mul64(v)
if w.w1 != 1 || w.w0 != 18446744073709551614 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 1, w0: 18446744073709551614})
}
}
func TestUint128_Mul(t *testing.T) {
tests := []struct {
name string
u Uint128
v Uint128
expected Uint128
}{
{
name: "simple multiplication",
u: Uint128{w1: 1, w0: 2},
v: Uint128{w1: 3, w0: 4},
expected: Uint128{w1: 10, w0: 8},
},
{
name: "multiplication with overflow",
u: Uint128{w1: 0, w0: math.MaxUint64},
v: Uint128{w1: 0, w0: 2},
expected: Uint128{w1: 1, w0: 18446744073709551614},
},
{
name: "multiplication with zero",
u: Uint128{w1: 0, w0: 0},
v: Uint128{w1: 0, w0: 0},
expected: Uint128{w1: 0, w0: 0},
},
{
name: "multiplication with large numbers",
u: Uint128{w1: 100, w0: 200},
v: Uint128{w1: 300, w0: 400},
expected: Uint128{w1: 100000, w0: 80000},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := tt.u.Mul(tt.v)
if !reflect.DeepEqual(actual, tt.expected) {
t.Errorf("Mul(%v, %v) = %v, want %v", tt.u, tt.v, actual, tt.expected)
}
})
}
}
func TestUint128_QuoRem(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
q, r := u.QuoRem(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
assert.Equal(t, Uint128{w1: 0, w0: 0}, r)
}
func TestUint128_QuoRem64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
q, r := u.QuoRem64(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
assert.Equal(t, uint64(0), r)
}
func TestUint128_Div(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
q := u.Div(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
}
func TestUint128_Div64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
q := u.Div64(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
}
func TestUint128_Mod(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
r := u.Mod(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, r)
}
func TestUint128_Mod64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
r := u.Mod64(v)
assert.Equal(t, uint64(0), r)
}
func TestUint128_Cmp(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, -1, u.Cmp(v))
}
func TestUint128_Cmp64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
assert.Equal(t, -1, u.Cmp64(v))
}
func TestUint128_Equals(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 1, w0: 2}
assert.Equal(t, true, u.Equals(v))
}
func TestUint128_LessThan(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, true, u.LessThan(v))
}
func TestUint128_GreaterThan(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, false, u.GreaterThan(v))
}
func TestUint128_LessThanOrEqual(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, true, u.LessThanOrEqual(v))
}
func TestUint128_GreaterThanOrEqual(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, false, u.GreaterThanOrEqual(v))
}
func TestUint128_And(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.And(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}
func TestUint128_Or(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Or(v)
assert.Equal(t, Uint128{w1: 3, w0: 6}, w)
}
func TestUint128_Xor(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Xor(v)
assert.Equal(t, Uint128{w1: 2, w0: 6}, w)
}
func TestUint128_Not(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
w := u.Not()
assert.Equal(t, Uint128{w1: math.MaxUint64 - 1, w0: math.MaxUint64 - 2}, w)
}
func TestUint128_AsUint64(t *testing.T) {
u := Uint128{w1: 0, w0: 5}
v := u.AsUint64()
assert.Equal(t, uint64(5), v)
}

View File

@ -10,6 +10,7 @@ import (
"slices"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
log "github.com/sirupsen/logrus"
)
@ -312,10 +313,48 @@ func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
return path
}
func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
func (g *DeBruijnGraph) LongestConsensus(id string, min_cov float64) (*obiseq.BioSequence, error) {
if g.Len() == 0 {
return nil, fmt.Errorf("graph is empty")
}
//path := g.LongestPath(max_length)
path := g.HaviestPath()
s := g.DecodePath(path)
spath := path
if min_cov > 0 {
wp := make([]uint, len(path))
for i, n := range path {
wp[i] = g.graph[n]
}
mp := uint(float64(obistats.Mode(wp))*min_cov + 0.5)
from := 0
for i, n := range path {
if g.graph[n] < mp {
from = i + 1
} else {
break
}
}
to := len(path)
for i := len(path) - 1; i >= 0; i-- {
n := path[i]
if g.graph[n] < mp {
to = i
} else {
break
}
}
spath = path[from:to]
}
s := g.DecodePath(spath)
if len(s) > 0 {
seq := obiseq.NewBioSequence(
@ -387,6 +426,48 @@ func (g *DeBruijnGraph) Weight(index uint64) int {
return int(val)
}
// WeightMode returns the mode weight of the nodes in the DeBruijnGraph.
//
// It builds a histogram of the node weights and returns the most frequent
// weight, ignoring the weight value 1.
//
// Returns:
// - int: the mode weight value.
func (g *DeBruijnGraph) WeightMode() int {
dist := make(map[uint]int)
for _, w := range g.graph {
dist[w]++
}
maxi := 0
wmax := uint(0)
for k, v := range dist {
if v > maxi && k > 1 {
maxi = v
wmax = k
}
}
return int(wmax)
}
// WeightMean returns the mean weight of the nodes in the DeBruijnGraph.
//
// Returns:
// - float64: the mean weight of the nodes in the graph.
func (g *DeBruijnGraph) WeightMean() float64 {
sum := 0
for _, w := range g.graph {
i := int(w)
sum += i
}
return float64(sum) / float64(len(g.graph))
}
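As a small standalone illustration (not part of the commit) of the two summaries defined above:

// weights          : 1 1 1 2 2 3 3 3
// histogram (dist) : {1:3, 2:2, 3:3}
// WeightMode()     : 3    (weight 1 is skipped by the k > 1 test)
// WeightMean()     : 16 / 8 = 2.0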
// append appends a sequence of nucleotides to the DeBruijnGraph.
//
// Parameters:
@ -637,7 +718,7 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
}
if slices.Contains(heaviestPath, currentNode) {
log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath))
log.Panicf("Cycle detected %v -> %v (%v) len(%v), graph: %v", heaviestPath, currentNode, startNodes, len(heaviestPath), g.Len())
return nil
}

View File

@ -20,6 +20,8 @@ import (
func NewInterpreter() *lua.LState {
lua := lua.NewState()
registerMutexType(lua)
RegisterObilib(lua)
RegisterObiContext(lua)
@ -117,7 +119,7 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
case *obiseq.BioSequenceSlice:
return *val, err
default:
return nil, fmt.Errorf("worker function doesn't return the correct type")
return nil, fmt.Errorf("worker function doesn't return the correct type %T", val)
}
}

View File

@ -36,6 +36,8 @@ func obicontextGetSet(interpreter *lua.LState) int {
__lua_obicontext.Store(key, float64(val))
case lua.LString:
__lua_obicontext.Store(key, string(val))
case *lua.LUserData:
__lua_obicontext.Store(key, val.Value)
case *lua.LTable:
__lua_obicontext.Store(key, Table2Interface(interpreter, val))
default:

View File

@ -1,6 +1,8 @@
package obilua
import (
"sync"
log "github.com/sirupsen/logrus"
lua "github.com/yuin/gopher-lua"
@ -46,6 +48,8 @@ func pushInterfaceToLua(L *lua.LState, val interface{}) {
pushSliceBoolToLua(L, v)
case nil:
L.Push(lua.LNil)
case *sync.Mutex:
pushMutexToLua(L, v)
default:
log.Fatalf("Cannot deal with value (%T) : %v", val, val)
}

63
pkg/obilua/mutex.go Normal file
View File

@ -0,0 +1,63 @@
package obilua
import (
lua "github.com/yuin/gopher-lua"
"sync"
)
const luaMutexTypeName = "Mutex"
func registerMutexType(luaState *lua.LState) {
mutexType := luaState.NewTypeMetatable(luaMutexTypeName)
luaState.SetGlobal(luaMutexTypeName, mutexType)
luaState.SetField(mutexType, "new", luaState.NewFunction(newMutex))
luaState.SetField(mutexType, "__index",
luaState.SetFuncs(luaState.NewTable(),
mutexMethods))
}
func mutex2Lua(interpreter *lua.LState, mutex *sync.Mutex) lua.LValue {
ud := interpreter.NewUserData()
ud.Value = mutex
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaMutexTypeName))
return ud
}
func pushMutexToLua(luaState *lua.LState, mutex *sync.Mutex) {
luaState.Push(mutex2Lua(luaState, mutex))
}
func newMutex(luaState *lua.LState) int {
m := &sync.Mutex{}
pushMutexToLua(luaState, m)
return 1
}
var mutexMethods = map[string]lua.LGFunction{
"lock": mutexLock,
"unlock": mutexUnlock,
}
func checkMutex(L *lua.LState) *sync.Mutex {
ud := L.CheckUserData(1)
if mutex, ok := ud.Value.(*sync.Mutex); ok {
return mutex
}
L.ArgError(1, "Mutex expected")
return nil
}
func mutexLock(L *lua.LState) int {
mutex := checkMutex(L)
mutex.Lock()
return 0
}
func mutexUnlock(L *lua.LState) int {
mutex := checkMutex(L)
mutex.Unlock()
return 0
}
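A minimal sketch of driving the new Mutex type from a Lua chunk hosted in Go (illustration only; it assumes the state comes from NewInterpreter, which calls registerMutexType as shown elsewhere in this diff, and that logrus is available as log):

L := NewInterpreter() // obilua.NewInterpreter() when used from outside the package
defer L.Close()
if err := L.DoString(`
    local m = Mutex.new()
    m:lock()
    -- protected section shared between Lua workers
    m:unlock()
`); err != nil {
	log.Fatal(err)
}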

View File

@ -2,9 +2,10 @@ package obingslibrary
import (
"fmt"
"log"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"

View File

@ -3,7 +3,7 @@ package obingslibrary
import (
"fmt"
"math"
"sort"
"slices"
log "github.com/sirupsen/logrus"
@ -87,6 +87,8 @@ func lookForTag(seq string, delimiter byte) string {
i := len(seq) - 1
// log.Warnf("Provided fragment : %s", string(seq))
for i >= 0 && seq[i] != delimiter {
i--
}
@ -107,6 +109,7 @@ func lookForTag(seq string, delimiter byte) string {
return ""
}
// log.Warnf("extracted : %s", string(seq[begin:end]))
return seq[begin:end]
}
@ -172,11 +175,12 @@ func (marker *Marker) beginDelimitedTagExtractor(
begin int,
forward bool) string {
taglength := marker.Forward_spacer + marker.Forward_tag_length
// log.Warn("beginDelimitedTagExtractor")
taglength := 2*marker.Forward_spacer + marker.Forward_tag_length
delimiter := marker.Forward_tag_delimiter
if !forward {
taglength = marker.Reverse_spacer + marker.Reverse_tag_length
taglength = 2*marker.Reverse_spacer + marker.Reverse_tag_length
delimiter = marker.Reverse_tag_delimiter
}
@ -240,6 +244,7 @@ func (marker *Marker) endDelimitedTagExtractor(
sequence *obiseq.BioSequence,
end int,
forward bool) string {
// log.Warn("endDelimitedTagExtractor")
taglength := marker.Reverse_spacer + marker.Reverse_tag_length
delimiter := marker.Reverse_tag_delimiter
@ -335,14 +340,17 @@ func (marker *Marker) beginTagExtractor(
sequence *obiseq.BioSequence,
begin int,
forward bool) string {
// log.Warnf("Forward : %v -> %d %c", forward, marker.Forward_spacer, marker.Forward_tag_delimiter)
// log.Warnf("Forward : %v -> %d %c", forward, marker.Reverse_spacer, marker.Reverse_tag_delimiter)
if forward {
if marker.Forward_tag_delimiter == 0 {
return marker.beginFixedTagExtractor(sequence, begin, forward)
} else {
if marker.Forward_tag_indels == 0 {
// log.Warnf("Delimited tag for forward primers %s", marker.forward.String())
return marker.beginDelimitedTagExtractor(sequence, begin, forward)
} else {
// log.Warn("Rescue tag for forward primers")
// log.Warnf("Rescue tag for forward primers %s", marker.forward.String())
return marker.beginRescueTagExtractor(sequence, begin, forward)
}
}
@ -351,9 +359,10 @@ func (marker *Marker) beginTagExtractor(
return marker.beginFixedTagExtractor(sequence, begin, forward)
} else {
if marker.Reverse_tag_indels == 0 {
// log.Warnf("Delimited tag for reverse/complement primers %s", marker.creverse.String())
return marker.beginDelimitedTagExtractor(sequence, begin, forward)
} else {
// log.Warn("Rescue tag for reverse/complement primers")
// log.Warnf("Rescue tag for reverse/complement primers %s", marker.creverse.String())
return marker.beginRescueTagExtractor(sequence, begin, forward)
}
}
@ -369,9 +378,10 @@ func (marker *Marker) endTagExtractor(
return marker.endFixedTagExtractor(sequence, end, forward)
} else {
if marker.Reverse_tag_indels == 0 {
// log.Warnf("Delimited tag for reverse primers %s", marker.reverse.String())
return marker.endDelimitedTagExtractor(sequence, end, forward)
} else {
// log.Warn("Rescue tag for reverse primers")
// log.Warnf("Rescue tag for reverse primers %s", marker.reverse.String())
return marker.endRescueTagExtractor(sequence, end, forward)
}
}
@ -380,9 +390,10 @@ func (marker *Marker) endTagExtractor(
return marker.endFixedTagExtractor(sequence, end, forward)
} else {
if marker.Forward_tag_indels == 0 {
// log.Warnf("Delimited tag for forward/complement primers %s", marker.cforward.String())
return marker.endDelimitedTagExtractor(sequence, end, forward)
} else {
// log.Warn("Rescue tag for forward/complement primers")
// log.Warnf("Rescue tag for forward/complement primers %s", marker.cforward.String())
return marker.endRescueTagExtractor(sequence, end, forward)
}
}
@ -609,9 +620,7 @@ func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (ob
}
if len(matches) > 0 {
sort.Slice(matches, func(i, j int) bool {
return matches[i].Begin < matches[j].Begin
})
slices.SortFunc(matches, func(a, b PrimerMatch) int { return a.Begin - b.Begin })
state := 0
var from PrimerMatch

View File

@ -36,8 +36,9 @@ func MakeNGSLibrary() NGSLibrary {
}
func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) {
pair := PrimerPair{strings.ToLower(forward),
strings.ToLower(reverse)}
forward = strings.ToLower(forward)
reverse = strings.ToLower(reverse)
pair := PrimerPair{forward, reverse}
marker, ok := (library.Markers)[pair]
if ok {

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This means that the version
// corresponds to the last commit, and not to the one in which the file will be
// committed
var _Commit = "31bfc88"
var _Commit = "373464c"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -72,8 +72,8 @@ func (s *BioSequence) HasAttribute(key string) bool {
ok := s.annotations != nil
if ok {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
_, ok = s.annotations[key]
}
@ -112,8 +112,8 @@ func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
ok := s.annotations != nil
if ok {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
val, ok = s.annotations[key]
}
@ -144,8 +144,8 @@ func (s *BioSequence) SetAttribute(key string, value interface{}) {
annot := s.Annotations()
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
annot[key] = value
}
@ -205,8 +205,8 @@ func (s *BioSequence) GetFloatAttribute(key string) (float64, bool) {
// No return value.
func (s *BioSequence) DeleteAttribute(key string) {
if s.annotations != nil {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
delete(s.annotations, key)
}
}

View File

@ -65,7 +65,7 @@ type BioSequence struct {
feature []byte
paired *BioSequence // A pointer to the paired sequence
annotations Annotation
annot_lock sync.Mutex
annot_lock *sync.Mutex
}
// NewEmptyBioSequence creates a new BioSequence object with an empty sequence.
@ -90,7 +90,7 @@ func NewEmptyBioSequence(preallocate int) *BioSequence {
feature: nil,
paired: nil,
annotations: nil,
annot_lock: sync.Mutex{},
annot_lock: &sync.Mutex{},
}
}

View File

@ -187,7 +187,7 @@ func TestCopy(t *testing.T) {
"annotation1": "value1",
"annotation2": "value2",
},
annot_lock: sync.Mutex{},
annot_lock: &sync.Mutex{},
}
newSeq := seq.Copy()

View File

@ -91,8 +91,7 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
// - StatsOnValues
func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues {
mkey := StatsOnSlotName(desc.Name)
annotations := sequence.Annotations()
istat, ok := annotations[mkey]
istat, ok := sequence.GetAttribute(mkey)
var stats StatsOnValues
var newstat bool
@ -117,39 +116,40 @@ func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOn
}
}
default:
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
stats = make(StatsOnValues)
sequence.SetAttribute(mkey, stats)
newstat = true
}
} else {
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
stats = make(StatsOnValues)
sequence.SetAttribute(mkey, stats)
newstat = true
}
if newstat && sequence.StatsPlusOne(desc, sequence, na) {
delete(sequence.Annotations(), desc.Key)
if newstat {
sequence.StatsPlusOne(desc, sequence, na)
}
return stats
}
// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
// StatsPlusOne updates the statistics on the given attribute (desc) on the receiver BioSequence
// with the value of the attribute on the toAdd BioSequence.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - toAdd: the BioSequence to add to the stats
// - desc: StatsOnDescription of the attribute to be updated
// - toAdd: the BioSequence containing the attribute to be updated
// - na: the value to be used if the attribute is not present
//
// Return type:
// - bool
// - bool: true if the update was successful, false otherwise
func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool {
sval := na
annotations := sequence.Annotations()
stats := sequence.StatsOn(desc, na)
retval := false
if toAdd.HasAnnotation() {
value, ok := toAdd.Annotations()[desc.Key]
value, ok := toAdd.GetAttribute(desc.Key)
if ok {
@ -178,8 +178,9 @@ func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSeq
if !ok {
old = 0
}
stats[sval] = old + desc.Weight(toAdd)
annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary
sequence.SetAttribute(StatsOnSlotName(desc.Name), stats) // TODO: check if this is necessary
return retval
}
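A sketch of the intended usage (illustration only; merged and toAdd are hypothetical *obiseq.BioSequence values, and MakeStatsOnDescription is used as in the obiconsensus changes later in this diff):

desc := obiseq.MakeStatsOnDescription("sample")
merged.StatsOn(desc, "NA")             // creates the per-sample stats slot if needed
merged.StatsPlusOne(desc, toAdd, "NA") // adds toAdd's weight under its "sample" value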
@ -263,7 +264,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
}
}
annotations["count"] = count
sequence.SetCount(count)
return sequence
}
@ -282,7 +283,7 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions)
seq.SetQualities(nil)
if len(sequences) == 1 {
seq.Annotations()["count"] = seq.Count()
seq.SetCount(seq.Count())
for _, desc := range statsOn {
seq.StatsOn(desc, na)
}
@ -296,5 +297,4 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions)
sequences.Recycle(false)
return seq
}

View File

@ -1,11 +1,9 @@
package obistats
func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
func Max[T int64 | int32 | int16 | int8 | int | float32 | float64](data []T) T {
m := data[0]
for _,v := range data {
for _, v := range data {
if v > m {
m = v
}
@ -14,14 +12,34 @@ func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T
return m
}
func Min[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
func Min[T int64 | int32 | int16 | int8 | int | uint64 | uint32 | uint16 | uint8 | uint | float32 | float64](data []T) T {
m := data[0]
for _,v := range data {
for _, v := range data {
if v < m {
m = v
}
}
return m
}
}
func Mode[T int64 | int32 | int16 | int8 | int | uint64 | uint32 | uint16 | uint8 | uint](data []T) T {
ds := make(map[T]int)
for _, v := range data {
ds[v]++
}
md := T(0)
maxocc := 0
for v, occ := range ds {
if occ > maxocc {
maxocc = occ
md = v
}
}
return md
}
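Illustration only (not part of the commit): the generic helpers accept any of the listed integer or float element types.

counts := []int{4, 7, 7, 2, 7, 4}
fmt.Println(obistats.Mode(counts)) // 7, the most frequent value
fmt.Println(obistats.Min(counts))  // 2
fmt.Println(obistats.Max(counts))  // 7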

View File

@ -25,8 +25,19 @@ import (
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int,
filter_out float64,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
if seqs.Len() == 0 {
return nil, fmt.Errorf("no sequence provided")
}
if seqs.Len() == 1 {
seq := seqs[0].Copy()
seq.SetAttribute("obiconsensus_consensus", false)
return seq, nil
}
if save_graph {
if dirname == "" {
dirname = "."
@ -104,7 +115,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
log.Debugf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
seq, err := graph.LongestConsensus(consensus_id)
seq, err := graph.LongestConsensus(consensus_id, filter_out)
sumCount := 0
@ -112,7 +123,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
for _, s := range seqs {
sumCount += s.Count()
}
seq.SetAttribute("obiconsensus_consensus", true)
seq.SetAttribute("obiconsensus_weight", sumCount)
seq.SetAttribute("obiconsensus_seq_length", seq.Len())
seq.SetAttribute("obiconsensus_kmer_size", kmer_size)
@ -136,6 +147,10 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func
stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")
if stats == nil {
log.Panicf("Sample %s not found in sequence %d", sample, i)
}
if value, ok := stats[sample]; ok {
return float64(value)
}
@ -292,16 +307,16 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
pack[degree] = v
clean, err = BuildConsensus(pack,
fmt.Sprintf("%s_consensus", v.Id()),
kmer_size,
kmer_size, CLILowCoverage(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
if err != nil {
log.Warning(err)
clean = (*graph.Vertices)[i]
clean = (*graph.Vertices)[i].Copy()
clean.SetAttribute("obiconsensus_consensus", false)
} else {
clean.SetAttribute("obiconsensus_consensus", true)
}
pack.Recycle(false)
} else {
@ -318,8 +333,9 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
annotations := v.Annotations()
staton := obiseq.StatsOnSlotName(sample_key)
for k, v := range annotations {
if !clean.HasAttribute(k) {
if !clean.HasAttribute(k) && k != staton {
clean.SetAttribute(k, v)
}
}
@ -334,6 +350,83 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
return denoised
}
func MinionClusterDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
sample_key string, kmer_size int) obiseq.BioSequenceSlice {
denoised := obiseq.MakeBioSequenceSlice()
seqs := (*obiseq.BioSequenceSlice)(graph.Vertices)
weight := SampleWeight(seqs, graph.Name, sample_key)
seqWeights := make([]float64, len(*seqs))
// Compute the weight of each vertex as its own weight plus the sum of the weights of its neighbors
log.Info("")
log.Infof("Sample %s: Computing weights", graph.Name)
for i := range *seqs {
w := weight(i)
for _, j := range graph.Neighbors(i) {
w += weight(j)
}
seqWeights[i] = w
}
log.Infof("Sample %s: Done computing weights", graph.Name)
log.Infof("Sample %s: Clustering", graph.Name)
// Look for vertices that do not have a neighbor with a higher weight
for i := range *seqs {
v := (*seqs)[i]
head := true
neighbors := graph.Neighbors(i)
for _, j := range neighbors {
if seqWeights[i] < seqWeights[j] {
head = false
continue
}
}
if head {
pack := obiseq.MakeBioSequenceSlice(len(neighbors) + 1)
for k, j := range neighbors {
pack[k] = (*seqs)[j]
}
pack[len(neighbors)] = v
clean, err := BuildConsensus(pack,
fmt.Sprintf("%s_consensus", v.Id()),
kmer_size, CLILowCoverage(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory())
if err != nil {
log.Warning(err)
clean = (*graph.Vertices)[i].Copy()
clean.SetAttribute("obiconsensus_consensus", false)
}
pack.Recycle(false)
clean.SetAttribute(sample_key, graph.Name)
annotations := v.Annotations()
clean.SetCount(int(weight(i)))
staton := obiseq.StatsOnSlotName(sample_key)
for k, v := range annotations {
if !clean.HasAttribute(k) && k != staton {
clean.SetAttribute(k, v)
}
}
denoised = append(denoised, clean)
}
}
log.Infof("Sample %s: Done clustering", graph.Name)
return denoised
}
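A tiny worked example of the head selection above (illustration only, with hypothetical weights and edges):

// raw weights {10, 3, 5}, edges (0,1) and (1,2)
//
// vertex  raw weight  neighbors  aggregated weight   head?
//   0        10         {1}       10 + 3     = 13    no  (13 < 18)
//   1         3         {0, 2}    3 + 10 + 5 = 18    yes
//   2         5         {1}       5 + 3      = 8     no  (8 < 18)
//
// Only vertex 1 seeds a consensus, built from the pack made of vertex 1 and
// its neighbors 0 and 2.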
func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
dirname := CLIGraphFilesDirectory()
newIter := obiiter.MakeIBioSequence()
@ -395,9 +488,17 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
false, 1, 0, 3)
}
denoised := MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
var denoised obiseq.BioSequenceSlice
if CLICluterDenoise() {
denoised = MinionClusterDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
} else {
denoised = MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
}
newIter.Push(obiiter.MakeBioSequenceBatch(source, sample_order, denoised))
@ -411,9 +512,14 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
newIter.WaitAndClose()
}()
obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
res := newIter
if CLIUnique() {
obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
res = obiuniq.CLIUnique(newIter)
}
return res.Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
}

View File

@ -8,8 +8,6 @@ import (
var _distStepMax = 1
var _sampleAttribute = "sample"
var _ratioMax = 1.0
var _clusterMode = false
var _onlyHead = false
@ -20,6 +18,10 @@ var _NoSingleton = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
var _lowCoverage = 0.0
var _unique = false
// ObiminionOptionSet sets the options for obiminion.
//
// options: The options for configuring obiminion.
@ -50,6 +52,19 @@ func ObiminionOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
options.Description("If set, sequences occurring a single time in the data set are discarded."))
options.BoolVar(&_clusterMode, "cluster", _clusterMode,
options.Alias("C"),
options.Description("Switch obiconsensus into its clustering mode."),
)
options.BoolVar(&_unique, "unique", _unique,
options.Alias("U"),
options.Description("If set, sequences are dereplicated on the output (obiuniq)."),
)
options.Float64Var(&_lowCoverage, "low-coverage", _lowCoverage,
options.Description("If the coverage of a sample is lower than this value, it will be discarded."),
)
}
// OptionSet sets up the options for the obiminion package.
@ -129,4 +144,16 @@ func CLIKmerSize() int {
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
func CLINoSingleton() bool {
return _NoSingleton
}
}
func CLICluterDenoise() bool {
return _clusterMode
}
func CLIUnique() bool {
return _unique
}
func CLILowCoverage() float64 {
return _lowCoverage
}