Mirror of https://github.com/metabarcoding/obitools4.git
Synced 2025-06-29 16:20:46 +00:00

Commit: correction of several small bugs
@@ -1,30 +1,72 @@
package obialign

import log "github.com/sirupsen/logrus"

// buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix
// of size width x width. The coordinates are (-1)-indexed, and the linear index
// is 0-indexed as well. The function first adds 1 to both coordinates to make
// sure the (-1,-1) coordinate is at position 0 in the matrix, and then computes
// the linear index by multiplying the first coordinate by the width and adding
// the second coordinate.
func buffIndex(i, j, width int) int {
return (i+1)*width + (j + 1)
}
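A minimal sketch of the (-1)-indexed mapping (not part of the original file; width = len(pattern)+1 = 4 is assumed for illustration):

// buffIndex(-1, -1, 4) == 0  -> the extra initialisation cell sits at the start of the buffer
// buffIndex(-1,  2, 4) == 3  -> last cell of the initialisation row
// buffIndex( 0, -1, 4) == 4  -> first cell of the row for sequence position 0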

// LocatePattern is a function to locate a pattern in a sequence.
//
// It uses a dynamic programming approach to build a matrix of scores.
// The score at each cell is the maximum of the score of the cell
// above it (representing a deletion), the score of the cell to its
// left (representing an insertion), and the score of the cell
// diagonally above it (representing a match).
//
// The score of a match is 0 if the two characters are the same,
// and -1 if they are different.
//
// The function returns the start and end positions of the best
// match, as well as the number of errors in the best match.
func LocatePattern(pattern, sequence []byte) (int, int, int) {
// Pattern spreads over the columns
// Sequence spreads over the rows
width := len(pattern) + 1
buffsize := (len(pattern) + 1) * (len(sequence) + 1)
buffer := make([]int, buffsize)

if len(pattern) >= len(sequence) {
log.Panicf("Pattern %s must be shorter than sequence %s", pattern, sequence)
}

// The path matrix keeps track of the best path through the matrix
// 0 : indicates the diagonal path
// 1 : indicates the up path
// -1 : indicates the left path
path := make([]int, buffsize)

// Initialize the first row of the matrix
for j := 0; j < len(pattern); j++ {
idx := buffIndex(-1, j, width)
buffer[idx] = -j - 1
path[idx] = -1
}

// Initialize the first column of the matrix
// Alignment is end-gap free, so the first column = 0
// to allow the primer to shift freely along the sequence
for i := -1; i < len(sequence); i++ {
idx := buffIndex(i, -1, width)
buffer[idx] = 0
path[idx] = +1
}

// Fill the matrix except the last column,
// where gaps must be free too.
path[0] = 0
jmax := len(pattern) - 1
for i := 0; i < len(sequence); i++ {
for j := 0; j < jmax; j++ {

// Mismatch score = -1
// Match score = 0
match := -1
if _samenuc(pattern[j], sequence[i]) {
match = 0
@@ -33,6 +75,8 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
idx := buffIndex(i, j, width)

diag := buffer[buffIndex(i-1, j-1, width)] + match

// Each gap costs -1
left := buffer[buffIndex(i, j-1, width)] - 1
up := buffer[buffIndex(i-1, j, width)] - 1

@@ -51,9 +95,12 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}
}

// Fill the last column considering the free up gap
for i := 0; i < len(sequence); i++ {
idx := buffIndex(i, jmax, width)

// Mismatch score = -1
// Match score = 0
match := -1
if _samenuc(pattern[jmax], sequence[i]) {
match = 0
@@ -65,6 +112,7 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {

score := max(diag, up, left)
buffer[idx] = score

switch {
case score == left:
path[idx] = -1
@@ -75,11 +123,13 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}
}

// Backtracking of the alignment

i := len(sequence) - 1
j := jmax
end := -1
lali := 0
for i > -1 && j > 0 {
for j > 0 { // was: i > -1 && j > 0
lali++
switch path[buffIndex(i, j, width)] {
case 0:
@@ -100,5 +150,9 @@ func LocatePattern(pattern, sequence []byte) (int, int, int) {
}

}

// log.Warnf("from : %d to: %d error: %d match: %v",
// i, end+1, -buffer[buffIndex(len(sequence)-1, len(pattern)-1, width)],
// string(sequence[i:(end+1)]))
return i, end + 1, -buffer[buffIndex(len(sequence)-1, len(pattern)-1, width)]
}
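A short usage sketch of LocatePattern (not part of the commit; the helper, primer and read values are made up for illustration):

func exampleLocatePattern() {
primer := []byte("acgtacgt")                  // hypothetical pattern
read := []byte("ttttacgtacctacgtgggg")        // hypothetical sequence, longer than the pattern
from, to, errs := LocatePattern(primer, read) // the end position follows the Go half-open convention
log.Infof("best match: read[%d:%d], %d differences", from, to, errs)
}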
@@ -137,6 +137,28 @@ char *reverseSequence(char *str,char isPattern)
return str;
}

/* -------------------------------------------- */
/* lowercase sequence                            */
/* -------------------------------------------- */

#define IS_UPPER(c) (((c) >= 'A') && ((c) <= 'Z'))
#define TO_LOWER(c) ((c) - 'A' + 'a')

char *LowerSequence(char *seq)
{
char *cseq;

for (cseq = seq ; *cseq ; cseq++)
if (IS_UPPER(*cseq))
*cseq = TO_LOWER(*cseq);

return seq;
}

#undef IS_UPPER
#undef TO_LOWER


char *ecoComplementPattern(char *nucAcSeq)
{
return reverseSequence(LXBioSeqComplement(nucAcSeq),1);
@@ -165,6 +187,7 @@ void UpperSequence(char *seq)



/* -------------------------------------------- */
/* encode sequence                               */
/* IS_UPPER is slightly faster than isupper      */
@@ -9,6 +9,7 @@ import "C"
import (
"errors"
"runtime"
"strings"
"unsafe"

log "github.com/sirupsen/logrus"
@@ -114,7 +115,7 @@ func (pattern ApatPattern) ReverseComplement() (ApatPattern, error) {
C.free(unsafe.Pointer(errmsg))
return ApatPattern{nil}, errors.New(message)
}
spat := C.GoString(apc.cpat)
spat := strings.ToLower(C.GoString(apc.cpat))
ap := _ApatPattern{apc, spat}

runtime.SetFinalizer(&ap, func(p *_ApatPattern) {
@@ -366,6 +367,18 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (
return
}

// FilterBestMatch filters the best non-overlapping matches of a given pattern in a sequence.
//
// It takes the following parameters:
// - pattern: the pattern to search for (ApatPattern).
// - sequence: the sequence to search in (ApatSequence).
// - begin: the starting index of the search (int).
// - length: the length of the search (int).
//
// It returns a slice of [3]int representing the locations of all non-overlapping matches in the sequence.
// The first two values of the [3]int indicate respectively the start and the end position of
// the match. Following the Go convention, the end position is not included in the
// match. The third value indicates the number of errors detected for this occurrence.
func (pattern ApatPattern) FilterBestMatch(sequence ApatSequence, begin, length int) (loc [][3]int) {
res := pattern.FindAllIndex(sequence, begin, length)
filtered := make([][3]int, 0, len(res))
@@ -424,13 +437,15 @@ func (pattern ApatPattern) FilterBestMatch(sequence ApatSequence, begin, length
func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) (loc [][3]int) {
res := pattern.FilterBestMatch(sequence, begin, length)

j := 0
for _, m := range res {
// Recompute the start and end position of the match
// when the pattern allows for indels
if m[2] > 0 && pattern.pointer.pointer.hasIndel {
start := m[0] - m[2]
// log.Warnf("Locating indel on sequence %s[%s]", sequence.pointer.reference.Id(), pattern.String())
start := m[0] - m[2]*2
start = max(start, 0)
end := start + int(pattern.pointer.pointer.patlen) + 2*m[2]
end := start + int(pattern.pointer.pointer.patlen) + 4*m[2]
end = min(end, sequence.Len())
// 1 << 30 = 1,073,741,824 = 1Gb
// It's a virtual array mapping the sequence to the pattern
@@ -439,18 +454,21 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int)
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat))
frg := sequence.pointer.reference.Sequence()[start:end]

begin, end, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg)
pb, pe, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg)

// olderr := m[2]
m[2] = score
m[0] = start + begin
m[1] = start + end
m[0] = start + pb
m[1] = start + pe

// log.Warnf("seq[%d@%d:%d] %d: %s %d - %s:%s:%s", i, m[0], m[1], olderr, sequence.pointer.reference.Id(), score,
// frg, (*cpattern)[0:int(pattern.pointer.pointer.patlen)], sequence.pointer.reference.Sequence()[m[0]:m[1]])
}

if int(pattern.pointer.pointer.maxerr) >= m[2] {
res[j] = m
j++
}
}

// log.Debugf("All matches : %v", res)

return res
return res[0:j]
}
@@ -642,6 +642,8 @@ func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) {

}

ngsfilter.CheckPrimerUnicity()

for i := 0; i < len(params); i++ {
param := params[i][1]
if len(params[i]) < 3 {
@@ -3,6 +3,8 @@ package obiformats
import (
"bufio"
"bytes"
"encoding/csv"
"errors"
"io"
"path"
"regexp"
@@ -39,6 +41,31 @@ type SequenceReader func(reader io.Reader, options ...WithOption) (obiiter.IBioS
// - io.Reader: A modified reader with the read data.
// - error: Any error encountered during the process.
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
csv := func(in []byte, limit uint32) bool {
in = dropLastLine(in, limit)

br := bytes.NewReader(in)
r := csv.NewReader(br)
r.Comma = ','
r.ReuseRecord = true
r.LazyQuotes = true
r.Comment = '#'

lines := 0
for {
_, err := r.Read()
if errors.Is(err, io.EOF) {
break
}
if err != nil {
return false
}
lines++
}

return r.FieldsPerRecord > 1 && lines > 1
}

fastaDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^>[^ ]", raw)
return ok && err == nil
@@ -70,12 +97,14 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("text/plain").Extend(csv, "text/csv", ".csv")

mimetype.Lookup("application/octet-stream").Extend(fastaDetector, "text/fasta", ".fasta")
mimetype.Lookup("application/octet-stream").Extend(fastqDetector, "text/fastq", ".fastq")
mimetype.Lookup("application/octet-stream").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
mimetype.Lookup("application/octet-stream").Extend(genbankDetector, "text/genbank", ".seq")
mimetype.Lookup("application/octet-stream").Extend(emblDetector, "text/embl", ".dat")
mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv")

// Create a buffer to store the read data
buf := make([]byte, 1024*128)
@@ -87,6 +116,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {

// Detect the MIME type using the mimetype library
mimeType := mimetype.Detect(buf)

if mimeType == nil {
return nil, nil, err
}
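A short calling sketch (not part of the commit; the wrapper function is hypothetical) showing how the guesser is meant to be used: it returns the detected MIME type together with a reader that replays the bytes consumed during detection:

func exampleGuessFormat(stream io.Reader) (string, io.Reader, error) {
mime, reader, err := OBIMimeTypeGuesser(stream)
if err != nil {
return "", nil, err
}
// mime.String() is, for instance, "text/fasta" or "text/csv" for the detectors registered above
return mime.String(), reader, nil
}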
pkg/obifp/uint128_test.go (new file, 250 lines)
@@ -0,0 +1,250 @@
package obifp

import (
"math"
"reflect"

"testing"

"github.com/stretchr/testify/assert"
)

func TestUint128_Add(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Add(v)
assert.Equal(t, Uint128{w1: 4, w0: 6}, w)

u = Uint128{w1: 0, w0: 0}
v = Uint128{w1: 0, w0: 0}
w = u.Add(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)

u = Uint128{w1: 0, w0: math.MaxUint64}
v = Uint128{w1: 0, w0: 1}
w = u.Add(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}
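The carry case exercised above is what a math/bits based implementation would produce; a sketch only (an assumption about the implementation, the actual obifp code may differ, and math/bits is assumed imported):

func addSketch(u, v Uint128) Uint128 {
lo, carry := bits.Add64(u.w0, v.w0, 0) // add the low words first, keeping the carry
hi, _ := bits.Add64(u.w1, v.w1, carry) // any carry out of the high word wraps, as for native uints
return Uint128{w1: hi, w0: lo}
}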

func TestUint128_Add64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
w := u.Add64(v)
assert.Equal(t, Uint128{w1: 1, w0: 5}, w)

u = Uint128{w1: 0, w0: 0}
v = uint64(0)
w = u.Add64(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)

u = Uint128{w1: 0, w0: math.MaxUint64}
v = uint64(1)
w = u.Add64(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}

func TestUint128_Sub(t *testing.T) {
u := Uint128{w1: 4, w0: 6}
v := Uint128{w1: 3, w0: 4}
w := u.Sub(v)
assert.Equal(t, Uint128{w1: 1, w0: 2}, w)

u = Uint128{w1: 0, w0: 0}
v = Uint128{w1: 0, w0: 0}
w = u.Sub(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, w)

u = Uint128{w1: 1, w0: 0}
v = Uint128{w1: 0, w0: 1}
w = u.Sub(v)
assert.Equal(t, Uint128{w1: 0, w0: math.MaxUint64}, w)
}

func TestUint128_Mul64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
w := u.Mul64(v)

if w.w1 != 3 || w.w0 != 6 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 3, w0: 6})
}

u = Uint128{w1: 0, w0: 0}
v = uint64(0)
w = u.Mul64(v)

if w.w1 != 0 || w.w0 != 0 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 0, w0: 0})
}

u = Uint128{w1: 0, w0: math.MaxUint64}
v = uint64(2)
w = u.Mul64(v)

if w.w1 != 1 || w.w0 != 18446744073709551614 {
t.Errorf("Mul64(%v, %v) = %v, want %v", u, v, w, Uint128{w1: 1, w0: 18446744073709551614})
}

}
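Similarly, the overflow expectation above ({w1: 0, w0: MaxUint64} * 2 == {w1: 1, w0: 18446744073709551614}) matches a bits.Mul64 based sketch (again an assumption, not the actual obifp code):

func mul64Sketch(u Uint128, v uint64) Uint128 {
hi, lo := bits.Mul64(u.w0, v)           // full 128-bit product of the low word
return Uint128{w1: u.w1*v + hi, w0: lo} // the high-word contribution wraps modulo 2^64
}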
func TestUint128_Mul(t *testing.T) {
tests := []struct {
name string
u Uint128
v Uint128
expected Uint128
}{
{
name: "simple multiplication",
u: Uint128{w1: 1, w0: 2},
v: Uint128{w1: 3, w0: 4},
expected: Uint128{w1: 10, w0: 8},
},
{
name: "multiplication with overflow",
u: Uint128{w1: 0, w0: math.MaxUint64},
v: Uint128{w1: 0, w0: 2},
expected: Uint128{w1: 1, w0: 18446744073709551614},
},
{
name: "multiplication with zero",
u: Uint128{w1: 0, w0: 0},
v: Uint128{w1: 0, w0: 0},
expected: Uint128{w1: 0, w0: 0},
},
{
name: "multiplication with large numbers",
u: Uint128{w1: 100, w0: 200},
v: Uint128{w1: 300, w0: 400},
expected: Uint128{w1: 100000, w0: 80000},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := tt.u.Mul(tt.v)
if !reflect.DeepEqual(actual, tt.expected) {
t.Errorf("Mul(%v, %v) = %v, want %v", tt.u, tt.v, actual, tt.expected)
}
})
}
}

func TestUint128_QuoRem(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
q, r := u.QuoRem(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
assert.Equal(t, Uint128{w1: 0, w0: 0}, r)
}

func TestUint128_QuoRem64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
q, r := u.QuoRem64(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
assert.Equal(t, uint64(0), r)
}

func TestUint128_Div(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
q := u.Div(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
}

func TestUint128_Div64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
q := u.Div64(v)
assert.Equal(t, Uint128{w1: 0, w0: 2}, q)
}

func TestUint128_Mod(t *testing.T) {
u := Uint128{w1: 3, w0: 8}
v := Uint128{w1: 0, w0: 4}
r := u.Mod(v)
assert.Equal(t, Uint128{w1: 0, w0: 0}, r)
}

func TestUint128_Mod64(t *testing.T) {
u := Uint128{w1: 0, w0: 6}
v := uint64(3)
r := u.Mod64(v)
assert.Equal(t, uint64(0), r)
}

func TestUint128_Cmp(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, -1, u.Cmp(v))
}

func TestUint128_Cmp64(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := uint64(3)
assert.Equal(t, -1, u.Cmp64(v))
}

func TestUint128_Equals(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 1, w0: 2}
assert.Equal(t, true, u.Equals(v))
}

func TestUint128_LessThan(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, true, u.LessThan(v))
}

func TestUint128_GreaterThan(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, false, u.GreaterThan(v))
}

func TestUint128_LessThanOrEqual(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, true, u.LessThanOrEqual(v))
}

func TestUint128_GreaterThanOrEqual(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
assert.Equal(t, false, u.GreaterThanOrEqual(v))
}

func TestUint128_And(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.And(v)
assert.Equal(t, Uint128{w1: 1, w0: 0}, w)
}

func TestUint128_Or(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Or(v)
assert.Equal(t, Uint128{w1: 3, w0: 6}, w)
}

func TestUint128_Xor(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
v := Uint128{w1: 3, w0: 4}
w := u.Xor(v)
assert.Equal(t, Uint128{w1: 2, w0: 6}, w)
}

func TestUint128_Not(t *testing.T) {
u := Uint128{w1: 1, w0: 2}
w := u.Not()
assert.Equal(t, Uint128{w1: math.MaxUint64 - 1, w0: math.MaxUint64 - 2}, w)
}

func TestUint128_AsUint64(t *testing.T) {
u := Uint128{w1: 0, w0: 5}
v := u.AsUint64()
assert.Equal(t, uint64(5), v)
}
@@ -10,6 +10,7 @@ import (
"slices"

"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
log "github.com/sirupsen/logrus"
)

@@ -312,10 +313,48 @@ func (g *DeBruijnGraph) LongestPath(max_length int) []uint64 {
return path
}

func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) {
func (g *DeBruijnGraph) LongestConsensus(id string, min_cov float64) (*obiseq.BioSequence, error) {
if g.Len() == 0 {
return nil, fmt.Errorf("graph is empty")
}
//path := g.LongestPath(max_length)
path := g.HaviestPath()
s := g.DecodePath(path)

spath := path

if min_cov > 0 {
wp := make([]uint, len(path))

for i, n := range path {
wp[i] = g.graph[n]
}

mp := uint(float64(obistats.Mode(wp))*min_cov + 0.5)

from := 0
for i, n := range path {
if g.graph[n] < mp {
from = i + 1
} else {
break
}
}

to := len(path)

for i := len(path) - 1; i >= 0; i-- {
n := path[i]
if g.graph[n] < mp {
to = i
} else {
break
}
}

spath = path[from:to]
}

s := g.DecodePath(spath)

if len(s) > 0 {
seq := obiseq.NewBioSequence(
@@ -387,6 +426,48 @@ func (g *DeBruijnGraph) Weight(index uint64) int {
return int(val)
}

// WeightMode returns the mode weight of the nodes in the DeBruijnGraph.
//
// It tallies how often each weight occurs in the graph map and keeps the most
// frequent one, ignoring weights of 1.
// Finally, it returns the mode weight as an integer.
//
// Returns:
// - int: the mode weight value.
func (g *DeBruijnGraph) WeightMode() int {
dist := make(map[uint]int)

for _, w := range g.graph {
dist[w]++
}

maxi := 0
wmax := uint(0)

for k, v := range dist {
if v > maxi && k > 1 {
maxi = v
wmax = k
}
}

return int(wmax)
}

// WeightMean returns the mean weight of the nodes in the DeBruijnGraph.
//
// Returns:
// - float64: the mean weight of the nodes in the graph.
func (g *DeBruijnGraph) WeightMean() float64 {
sum := 0

for _, w := range g.graph {
i := int(w)
sum += i
}

return float64(sum) / float64(len(g.graph))
}

// append appends a sequence of nucleotides to the DeBruijnGraph.
//
// Parameters:
@@ -637,7 +718,7 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
}

if slices.Contains(heaviestPath, currentNode) {
log.Fatalf("Cycle detected %v -> %v (%v) len(%v)", heaviestPath, currentNode, startNodes, len(heaviestPath))
log.Panicf("Cycle detected %v -> %v (%v) len(%v), graph: %v", heaviestPath, currentNode, startNodes, len(heaviestPath), g.Len())
return nil
}

@@ -20,6 +20,8 @@ import (
func NewInterpreter() *lua.LState {
lua := lua.NewState()

registerMutexType(lua)

RegisterObilib(lua)
RegisterObiContext(lua)

@@ -117,7 +119,7 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
case *obiseq.BioSequenceSlice:
return *val, err
default:
return nil, fmt.Errorf("worker function doesn't return the correct type")
return nil, fmt.Errorf("worker function doesn't return the correct type %T", val)
}
}

@@ -36,6 +36,8 @@ func obicontextGetSet(interpreter *lua.LState) int {
__lua_obicontext.Store(key, float64(val))
case lua.LString:
__lua_obicontext.Store(key, string(val))
case *lua.LUserData:
__lua_obicontext.Store(key, val.Value)
case *lua.LTable:
__lua_obicontext.Store(key, Table2Interface(interpreter, val))
default:
@@ -1,6 +1,8 @@
package obilua

import (
"sync"

log "github.com/sirupsen/logrus"

lua "github.com/yuin/gopher-lua"
@@ -46,6 +48,8 @@ func pushInterfaceToLua(L *lua.LState, val interface{}) {
pushSliceBoolToLua(L, v)
case nil:
L.Push(lua.LNil)
case *sync.Mutex:
pushMutexToLua(L, v)
default:
log.Fatalf("Cannot deal with value (%T) : %v", val, val)
}
pkg/obilua/mutex.go (new file, 63 lines)
@@ -0,0 +1,63 @@
package obilua

import (
lua "github.com/yuin/gopher-lua"

"sync"
)

const luaMutexTypeName = "Mutex"

func registerMutexType(luaState *lua.LState) {
mutexType := luaState.NewTypeMetatable(luaMutexTypeName)
luaState.SetGlobal(luaMutexTypeName, mutexType)

luaState.SetField(mutexType, "new", luaState.NewFunction(newMutex))

luaState.SetField(mutexType, "__index",
luaState.SetFuncs(luaState.NewTable(),
mutexMethods))
}

func mutex2Lua(interpreter *lua.LState, mutex *sync.Mutex) lua.LValue {
ud := interpreter.NewUserData()
ud.Value = mutex
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaMutexTypeName))

return ud
}

func pushMutexToLua(luaState *lua.LState, mutex *sync.Mutex) {
luaState.Push(mutex2Lua(luaState, mutex))
}
func newMutex(luaState *lua.LState) int {
m := &sync.Mutex{}
pushMutexToLua(luaState, m)
return 1
}

var mutexMethods = map[string]lua.LGFunction{
"lock": mutexLock,
"unlock": mutexUnlock,
}

func checkMutex(L *lua.LState) *sync.Mutex {
ud := L.CheckUserData(1)
if mutex, ok := ud.Value.(*sync.Mutex); ok {
return mutex
}
L.ArgError(1, "Mutex expected")
return nil
}

func mutexLock(L *lua.LState) int {
mutex := checkMutex(L)
mutex.Lock()
return 0
}

func mutexUnlock(L *lua.LState) int {
mutex := checkMutex(L)
mutex.Unlock()
return 0
}
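A usage sketch (not part of the commit; the helper below is hypothetical): once registerMutexType has run inside NewInterpreter, a Lua chunk can create and use a mutex through the registered constructor and methods.

func exampleMutexFromLua() error {
L := NewInterpreter() // registers the Mutex type among others
defer L.Close()
return L.DoString(`
local m = Mutex.new()
m:lock()
-- work protected by the mutex
m:unlock()
`)
}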
@@ -2,9 +2,10 @@ package obingslibrary

import (
"fmt"
"log"
"strings"

log "github.com/sirupsen/logrus"

"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
@@ -3,7 +3,7 @@ package obingslibrary
import (
"fmt"
"math"
"sort"
"slices"

log "github.com/sirupsen/logrus"

@@ -87,6 +87,8 @@ func lookForTag(seq string, delimiter byte) string {

i := len(seq) - 1

// log.Warnf("Provided fragment : %s", string(seq))

for i >= 0 && seq[i] != delimiter {
i--
}
@@ -107,6 +109,7 @@ func lookForTag(seq string, delimiter byte) string {
return ""
}

// log.Warnf("extracted : %s", string(seq[begin:end]))
return seq[begin:end]
}

@@ -172,11 +175,12 @@ func (marker *Marker) beginDelimitedTagExtractor(
begin int,
forward bool) string {

taglength := marker.Forward_spacer + marker.Forward_tag_length
// log.Warn("beginDelimitedTagExtractor")
taglength := 2*marker.Forward_spacer + marker.Forward_tag_length
delimiter := marker.Forward_tag_delimiter

if !forward {
taglength = marker.Reverse_spacer + marker.Reverse_tag_length
taglength = 2*marker.Reverse_spacer + marker.Reverse_tag_length
delimiter = marker.Reverse_tag_delimiter
}

@@ -240,6 +244,7 @@ func (marker *Marker) endDelimitedTagExtractor(
sequence *obiseq.BioSequence,
end int,
forward bool) string {
// log.Warn("endDelimitedTagExtractor")

taglength := marker.Reverse_spacer + marker.Reverse_tag_length
delimiter := marker.Reverse_tag_delimiter
@@ -335,14 +340,17 @@ func (marker *Marker) beginTagExtractor(
sequence *obiseq.BioSequence,
begin int,
forward bool) string {
// log.Warnf("Forward : %v -> %d %c", forward, marker.Forward_spacer, marker.Forward_tag_delimiter)
// log.Warnf("Forward : %v -> %d %c", forward, marker.Reverse_spacer, marker.Reverse_tag_delimiter)
if forward {
if marker.Forward_tag_delimiter == 0 {
return marker.beginFixedTagExtractor(sequence, begin, forward)
} else {
if marker.Forward_tag_indels == 0 {
// log.Warnf("Delimited tag for forward primers %s", marker.forward.String())
return marker.beginDelimitedTagExtractor(sequence, begin, forward)
} else {
// log.Warn("Rescue tag for forward primers")
// log.Warnf("Rescue tag for forward primers %s", marker.forward.String())
return marker.beginRescueTagExtractor(sequence, begin, forward)
}
}
@@ -351,9 +359,10 @@ func (marker *Marker) beginTagExtractor(
return marker.beginFixedTagExtractor(sequence, begin, forward)
} else {
if marker.Reverse_tag_indels == 0 {
// log.Warnf("Delimited tag for reverse/complement primers %s", marker.creverse.String())
return marker.beginDelimitedTagExtractor(sequence, begin, forward)
} else {
// log.Warn("Rescue tag for reverse/complement primers")
// log.Warnf("Rescue tag for reverse/complement primers %s", marker.creverse.String())
return marker.beginRescueTagExtractor(sequence, begin, forward)
}
}
@@ -369,9 +378,10 @@ func (marker *Marker) endTagExtractor(
return marker.endFixedTagExtractor(sequence, end, forward)
} else {
if marker.Reverse_tag_indels == 0 {
// log.Warnf("Delimited tag for reverse primers %s", marker.reverse.String())
return marker.endDelimitedTagExtractor(sequence, end, forward)
} else {
// log.Warn("Rescue tag for reverse primers")
// log.Warnf("Rescue tag for reverse primers %s", marker.reverse.String())
return marker.endRescueTagExtractor(sequence, end, forward)
}
}
@@ -380,9 +390,10 @@ func (marker *Marker) endTagExtractor(
return marker.endFixedTagExtractor(sequence, end, forward)
} else {
if marker.Forward_tag_indels == 0 {
// log.Warnf("Delimited tag for forward/complement primers %s", marker.cforward.String())
return marker.endDelimitedTagExtractor(sequence, end, forward)
} else {
// log.Warn("Rescue tag for forward/complement primers")
// log.Warnf("Rescue tag for forward/complement primers %s", marker.cforward.String())
return marker.endRescueTagExtractor(sequence, end, forward)
}
}
@@ -609,9 +620,7 @@ func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (ob
}

if len(matches) > 0 {
sort.Slice(matches, func(i, j int) bool {
return matches[i].Begin < matches[j].Begin
})
slices.SortFunc(matches, func(a, b PrimerMatch) int { return a.Begin - b.Begin })

state := 0
var from PrimerMatch
@@ -36,8 +36,9 @@ func MakeNGSLibrary() NGSLibrary {
}

func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) {
pair := PrimerPair{strings.ToLower(forward),
strings.ToLower(reverse)}
forward = strings.ToLower(forward)
reverse = strings.ToLower(reverse)
pair := PrimerPair{forward, reverse}
marker, ok := (library.Markers)[pair]

if ok {
@@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. This implies that the version
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "31bfc88"
var _Commit = "373464c"
var _Version = "Release 4.2.0"

// Version returns the version of the obitools package.
@@ -72,8 +72,8 @@ func (s *BioSequence) HasAttribute(key string) bool {
ok := s.annotations != nil

if ok {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
_, ok = s.annotations[key]
}

@@ -112,8 +112,8 @@ func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
ok := s.annotations != nil

if ok {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
val, ok = s.annotations[key]
}

@@ -144,8 +144,8 @@ func (s *BioSequence) SetAttribute(key string, value interface{}) {

annot := s.Annotations()

defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
annot[key] = value
}

@@ -205,8 +205,8 @@ func (s *BioSequence) GetFloatAttribute(key string) (float64, bool) {
// No return value.
func (s *BioSequence) DeleteAttribute(key string) {
if s.annotations != nil {
defer s.AnnotationsUnlock()
s.AnnotationsLock()
defer s.AnnotationsUnlock()
delete(s.annotations, key)
}
}
@@ -65,7 +65,7 @@ type BioSequence struct {
feature []byte
paired *BioSequence // A pointer to the paired sequence
annotations Annotation
annot_lock sync.Mutex
annot_lock *sync.Mutex
}

// NewEmptyBioSequence creates a new BioSequence object with an empty sequence.
@@ -90,7 +90,7 @@ func NewEmptyBioSequence(preallocate int) *BioSequence {
feature: nil,
paired: nil,
annotations: nil,
annot_lock: sync.Mutex{},
annot_lock: &sync.Mutex{},
}
}

@@ -187,7 +187,7 @@ func TestCopy(t *testing.T) {
"annotation1": "value1",
"annotation2": "value2",
},
annot_lock: sync.Mutex{},
annot_lock: &sync.Mutex{},
}

newSeq := seq.Copy()
@@ -91,8 +91,7 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
// - StatsOnValues
func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOnValues {
mkey := StatsOnSlotName(desc.Name)
annotations := sequence.Annotations()
istat, ok := annotations[mkey]
istat, ok := sequence.GetAttribute(mkey)

var stats StatsOnValues
var newstat bool
@@ -117,39 +116,40 @@ func (sequence *BioSequence) StatsOn(desc StatsOnDescription, na string) StatsOn
}
}
default:
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
stats = make(StatsOnValues)
sequence.SetAttribute(mkey, stats)
newstat = true
}
} else {
stats = make(StatsOnValues, 10)
annotations[mkey] = stats
stats = make(StatsOnValues)
sequence.SetAttribute(mkey, stats)
newstat = true
}

if newstat && sequence.StatsPlusOne(desc, sequence, na) {
delete(sequence.Annotations(), desc.Key)
if newstat {
sequence.StatsPlusOne(desc, sequence, na)
}

return stats
}

// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
// StatsPlusOne updates the statistics on the given attribute (desc) on the receiver BioSequence
// with the value of the attribute on the toAdd BioSequence.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - toAdd: the BioSequence to add to the stats
// - desc: StatsOnDescription of the attribute to be updated
// - toAdd: the BioSequence containing the attribute to be updated
// - na: the value to be used if the attribute is not present
//
// Return type:
// - bool
// - bool: true if the update was successful, false otherwise
func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSequence, na string) bool {
sval := na
annotations := sequence.Annotations()
stats := sequence.StatsOn(desc, na)
retval := false

if toAdd.HasAnnotation() {
value, ok := toAdd.Annotations()[desc.Key]
value, ok := toAdd.GetAttribute(desc.Key)

if ok {

@@ -178,8 +178,9 @@ func (sequence *BioSequence) StatsPlusOne(desc StatsOnDescription, toAdd *BioSeq
if !ok {
old = 0
}

stats[sval] = old + desc.Weight(toAdd)
annotations[StatsOnSlotName(desc.Name)] = stats // TODO: check if this is necessary
sequence.SetAttribute(StatsOnSlotName(desc.Name), stats) // TODO: check if this is necessary
return retval
}

@@ -263,7 +264,7 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
}
}

annotations["count"] = count
sequence.SetCount(count)
return sequence
}

@@ -282,7 +283,7 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions)
seq.SetQualities(nil)

if len(sequences) == 1 {
seq.Annotations()["count"] = seq.Count()
seq.SetCount(seq.Count())
for _, desc := range statsOn {
seq.StatsOn(desc, na)
}
@@ -296,5 +297,4 @@ func (sequences BioSequenceSlice) Merge(na string, statsOn StatsOnDescriptions)

sequences.Recycle(false)
return seq

}
@@ -1,11 +1,9 @@
package obistats



func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
func Max[T int64 | int32 | int16 | int8 | int | float32 | float64](data []T) T {
m := data[0]

for _,v := range data {
for _, v := range data {
if v > m {
m = v
}
@@ -14,14 +12,34 @@ func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T
return m
}

func Min[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
func Min[T int64 | int32 | int16 | int8 | int | uint64 | uint32 | uint16 | uint8 | uint | float32 | float64](data []T) T {
m := data[0]

for _,v := range data {
for _, v := range data {
if v < m {
m = v
}
}

return m
}
}

func Mode[T int64 | int32 | int16 | int8 | int | uint64 | uint32 | uint16 | uint8 | uint](data []T) T {
ds := make(map[T]int)

for _, v := range data {
ds[v]++
}

md := T(0)
maxocc := 0

for v, occ := range ds {
if occ > maxocc {
maxocc = occ
md = v
}
}

return md
}
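A worked sketch (not part of the commit; the helper and values are made up) of how Mode feeds the coverage cut-off used by DeBruijnGraph.LongestConsensus:

func exampleModeThreshold() uint {
coverages := []uint{1, 3, 3, 3, 7, 3, 2} // hypothetical k-mer coverages along a path
minCov := 0.5                            // hypothetical low-coverage ratio
// Mode(coverages) is 3, so the trimming threshold becomes uint(3*0.5 + 0.5) == 2
return uint(float64(Mode(coverages))*minCov + 0.5)
}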
@@ -25,8 +25,19 @@ import (
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int,
filter_out float64,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {

if seqs.Len() == 0 {
return nil, fmt.Errorf("no sequence provided")
}

if seqs.Len() == 1 {
seq := seqs[0].Copy()
seq.SetAttribute("obiconsensus_consensus", false)
return seq, nil
}

if save_graph {
if dirname == "" {
dirname = "."
@@ -104,7 +115,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
log.Debugf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()

seq, err := graph.LongestConsensus(consensus_id)
seq, err := graph.LongestConsensus(consensus_id, filter_out)

sumCount := 0

@@ -112,7 +123,7 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
for _, s := range seqs {
sumCount += s.Count()
}

seq.SetAttribute("obiconsensus_consensus", true)
seq.SetAttribute("obiconsensus_weight", sumCount)
seq.SetAttribute("obiconsensus_seq_length", seq.Len())
seq.SetAttribute("obiconsensus_kmer_size", kmer_size)
@@ -136,6 +147,10 @@ func SampleWeight(seqs *obiseq.BioSequenceSlice, sample, sample_key string) func

stats := (*seqs)[i].StatsOn(obiseq.MakeStatsOnDescription(sample_key), "NA")

if stats == nil {
log.Panicf("Sample %s not found in sequence %d", sample, i)
}

if value, ok := stats[sample]; ok {
return float64(value)
}
@@ -292,16 +307,16 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
pack[degree] = v
clean, err = BuildConsensus(pack,
fmt.Sprintf("%s_consensus", v.Id()),
kmer_size,
kmer_size, CLILowCoverage(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory())

if err != nil {
log.Warning(err)
clean = (*graph.Vertices)[i]
clean = (*graph.Vertices)[i].Copy()
clean.SetAttribute("obiconsensus_consensus", false)
} else {
clean.SetAttribute("obiconsensus_consensus", true)

}

pack.Recycle(false)

} else {
@@ -318,8 +333,9 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],

annotations := v.Annotations()

staton := obiseq.StatsOnSlotName(sample_key)
for k, v := range annotations {
if !clean.HasAttribute(k) {
if !clean.HasAttribute(k) && k != staton {
clean.SetAttribute(k, v)
}
}
@@ -334,6 +350,83 @@ func MinionDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],

return denoised
}

func MinionClusterDenoise(graph *obigraph.Graph[*obiseq.BioSequence, Mutation],
sample_key string, kmer_size int) obiseq.BioSequenceSlice {
denoised := obiseq.MakeBioSequenceSlice()
seqs := (*obiseq.BioSequenceSlice)(graph.Vertices)
weight := SampleWeight(seqs, graph.Name, sample_key)
seqWeights := make([]float64, len(*seqs))

// Compute weights for each vertex as the sum of the weights of its neighbors

log.Info("")
log.Infof("Sample %s: Computing weights", graph.Name)
for i := range *seqs {
w := weight(i)
for _, j := range graph.Neighbors(i) {
w += weight(j)
}

seqWeights[i] = w
}

log.Infof("Sample %s: Done computing weights", graph.Name)

log.Infof("Sample %s: Clustering", graph.Name)
// Look for vertices that have no neighbor with a higher weight
for i := range *seqs {
v := (*seqs)[i]
head := true
neighbors := graph.Neighbors(i)
for _, j := range neighbors {
if seqWeights[i] < seqWeights[j] {
head = false
continue
}
}

if head {
pack := obiseq.MakeBioSequenceSlice(len(neighbors) + 1)
for k, j := range neighbors {
pack[k] = (*seqs)[j]
}
pack[len(neighbors)] = v

clean, err := BuildConsensus(pack,
fmt.Sprintf("%s_consensus", v.Id()),
kmer_size, CLILowCoverage(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory())

if err != nil {
log.Warning(err)
clean = (*graph.Vertices)[i].Copy()
clean.SetAttribute("obiconsensus_consensus", false)
}
pack.Recycle(false)

clean.SetAttribute(sample_key, graph.Name)

annotations := v.Annotations()
clean.SetCount(int(weight(i)))

staton := obiseq.StatsOnSlotName(sample_key)

for k, v := range annotations {
if !clean.HasAttribute(k) && k != staton {
clean.SetAttribute(k, v)
}
}

denoised = append(denoised, clean)
}
}

log.Infof("Sample %s: Done clustering", graph.Name)

return denoised
}

func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
dirname := CLIGraphFilesDirectory()
newIter := obiiter.MakeIBioSequence()
@@ -395,9 +488,17 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
false, 1, 0, 3)
}

denoised := MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
var denoised obiseq.BioSequenceSlice

if CLICluterDenoise() {
denoised = MinionClusterDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
} else {
denoised = MinionDenoise(graph,
CLISampleAttribute(),
CLIKmerSize())
}

newIter.Push(obiiter.MakeBioSequenceBatch(source, sample_order, denoised))

@@ -411,9 +512,14 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
newIter.WaitAndClose()
}()

obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
return obiuniq.CLIUnique(newIter).Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
res := newIter
if CLIUnique() {
obiuniq.AddStatsOn(CLISampleAttribute())
// obiuniq.AddStatsOn("sample:obiconsensus_weight")
obiuniq.SetUniqueInMemory(false)
obiuniq.SetNoSingleton(CLINoSingleton())
res = obiuniq.CLIUnique(newIter)
}

return res.Pipe(obiiter.WorkerPipe(obiannotate.AddSeqLengthWorker(), false))
}
@@ -8,8 +8,6 @@ import (
var _distStepMax = 1
var _sampleAttribute = "sample"

var _ratioMax = 1.0

var _clusterMode = false
var _onlyHead = false

@@ -20,6 +18,10 @@ var _NoSingleton = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"

var _lowCoverage = 0.0

var _unique = false

// ObiminionOptionSet sets the options for obiminion.
//
// options: The options for configuring obiminion.
@@ -50,6 +52,19 @@ func ObiminionOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_NoSingleton, "no-singleton", _NoSingleton,
options.Description("If set, sequences occurring a single time in the data set are discarded."))

options.BoolVar(&_clusterMode, "cluster", _clusterMode,
options.Alias("C"),
options.Description("Switch obiconsensus into its clustering mode."),
)

options.BoolVar(&_unique, "unique", _unique,
options.Alias("U"),
options.Description("If set, sequences are dereplicated on the output (obiuniq)."),
)

options.Float64Var(&_lowCoverage, "low-coverage", _lowCoverage,
options.Description("If the coverage of a sample is lower than this value, it will be discarded."),
)
}

// OptionSet sets up the options for the obiminion package.
@@ -129,4 +144,16 @@ func CLIKmerSize() int {
// Returns a boolean value indicating whether or not singleton sequences should be discarded.
func CLINoSingleton() bool {
return _NoSingleton
}
}

func CLICluterDenoise() bool {
return _clusterMode
}

func CLIUnique() bool {
return _unique
}

func CLILowCoverage() float64 {
return _lowCoverage
}