mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 00:30:27 +00:00
Merge pull request #51 from metabarcoding/push-urtwmwktsrru
Push urtwmwktsrru
This commit is contained in:
@@ -4,33 +4,6 @@ import (
|
|||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
)
|
)
|
||||||
|
|
||||||
var _iupac = [26]byte{
|
|
||||||
// a b c d e f
|
|
||||||
1, 14, 2, 13, 0, 0,
|
|
||||||
// g h i j k l
|
|
||||||
4, 11, 0, 0, 12, 0,
|
|
||||||
// m n o p q r
|
|
||||||
3, 15, 0, 0, 0, 5,
|
|
||||||
// s t u v w x
|
|
||||||
6, 8, 8, 13, 9, 0,
|
|
||||||
// y z
|
|
||||||
10, 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
func _samenuc(a, b byte) bool {
|
|
||||||
if (a >= 'A') && (a <= 'Z') {
|
|
||||||
a |= 32
|
|
||||||
}
|
|
||||||
if (b >= 'A') && (b <= 'Z') {
|
|
||||||
b |= 32
|
|
||||||
}
|
|
||||||
|
|
||||||
if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
|
|
||||||
return (_iupac[a-'a'] & _iupac[b-'a']) > 0
|
|
||||||
}
|
|
||||||
return a == b
|
|
||||||
}
|
|
||||||
|
|
||||||
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
|
// FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices.
|
||||||
//
|
//
|
||||||
// The score is calculated using the following scoring matrix:
|
// The score is calculated using the following scoring matrix:
|
||||||
@@ -165,7 +138,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
|||||||
default:
|
default:
|
||||||
// We are in the middle of the matrix
|
// We are in the middle of the matrix
|
||||||
Sdiag = _incpath(previous[x])
|
Sdiag = _incpath(previous[x])
|
||||||
if _samenuc(bA[j-1], bB[i-1]) {
|
if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
|
||||||
Sdiag = _incscore(Sdiag)
|
Sdiag = _incscore(Sdiag)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -265,7 +238,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[
|
|||||||
Sleft = _notavail
|
Sleft = _notavail
|
||||||
default:
|
default:
|
||||||
Sdiag = _incpath(previous[x])
|
Sdiag = _incpath(previous[x])
|
||||||
if _samenuc(bA[j-1], bB[i-1]) {
|
if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) {
|
||||||
Sdiag = _incscore(Sdiag)
|
Sdiag = _incscore(Sdiag)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package obialign
|
package obialign
|
||||||
|
|
||||||
import log "github.com/sirupsen/logrus"
|
import (
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
// buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix
|
// buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix
|
||||||
// of size width x width. The coordinates are (-1)-indexed, and the linear index
|
// of size width x width. The coordinates are (-1)-indexed, and the linear index
|
||||||
@@ -69,7 +72,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
|
|||||||
// Mismatch score = -1
|
// Mismatch score = -1
|
||||||
// Match score = 0
|
// Match score = 0
|
||||||
match := -1
|
match := -1
|
||||||
if _samenuc(pattern[j], sequence[i]) {
|
if obiseq.SameIUPACNuc(pattern[j], sequence[i]) {
|
||||||
match = 0
|
match = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -103,7 +106,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) {
|
|||||||
// Mismatch score = -1
|
// Mismatch score = -1
|
||||||
// Match score = 0
|
// Match score = 0
|
||||||
match := -1
|
match := -1
|
||||||
if _samenuc(pattern[jmax], sequence[i]) {
|
if obiseq.SameIUPACNuc(pattern[jmax], sequence[i]) {
|
||||||
match = 0
|
match = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// commited
|
// commited
|
||||||
|
|
||||||
var _Commit = "6d204f6"
|
var _Commit = "29bf4ce"
|
||||||
var _Version = "Release 4.4.0"
|
var _Version = "Release 4.4.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
|||||||
28
pkg/obiseq/iupac_nog.go
Normal file
28
pkg/obiseq/iupac_nog.go
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
package obiseq
|
||||||
|
|
||||||
|
var _iupac = [26]byte{
|
||||||
|
// a b c d e f
|
||||||
|
1, 14, 2, 13, 0, 0,
|
||||||
|
// g h i j k l
|
||||||
|
4, 11, 0, 0, 12, 0,
|
||||||
|
// m n o p q r
|
||||||
|
3, 15, 0, 0, 0, 5,
|
||||||
|
// s t u v w x
|
||||||
|
6, 8, 8, 13, 9, 0,
|
||||||
|
// y z
|
||||||
|
10, 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
func SameIUPACNuc(a, b byte) bool {
|
||||||
|
if (a >= 'A') && (a <= 'Z') {
|
||||||
|
a |= 32
|
||||||
|
}
|
||||||
|
if (b >= 'A') && (b <= 'Z') {
|
||||||
|
b |= 32
|
||||||
|
}
|
||||||
|
|
||||||
|
if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') {
|
||||||
|
return (_iupac[a-'a'] & _iupac[b-'a']) > 0
|
||||||
|
}
|
||||||
|
return a == b
|
||||||
|
}
|
||||||
@@ -2,12 +2,10 @@ package obiclean
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
"github.com/schollz/progressbar/v3"
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -21,13 +19,13 @@ func commonPrefix(a, b *obiseq.BioSequence) int {
|
|||||||
as := a.Sequence()
|
as := a.Sequence()
|
||||||
bs := b.Sequence()
|
bs := b.Sequence()
|
||||||
|
|
||||||
for i < l && as[i] == bs[i] {
|
for i < l && obiseq.SameIUPACNuc(as[i], bs[i]) {
|
||||||
i++
|
i++
|
||||||
}
|
}
|
||||||
|
|
||||||
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
// if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
||||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
// log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
||||||
}
|
// }
|
||||||
|
|
||||||
return i
|
return i
|
||||||
}
|
}
|
||||||
@@ -44,29 +42,97 @@ func commonSuffix(a, b *obiseq.BioSequence) int {
|
|||||||
bs := b.Sequence()
|
bs := b.Sequence()
|
||||||
|
|
||||||
l := 0
|
l := 0
|
||||||
for i >= 0 && j >= 0 && as[i] == bs[j] {
|
for i >= 0 && j >= 0 && obiseq.SameIUPACNuc(as[i], bs[j]) {
|
||||||
i--
|
i--
|
||||||
j--
|
j--
|
||||||
l++
|
l++
|
||||||
}
|
}
|
||||||
|
|
||||||
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
// if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
||||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
// log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
||||||
}
|
// }
|
||||||
// obilog.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
||||||
|
|
||||||
return l
|
return l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// oneDifference return true if s1 and s2 differ by exactly 1 operation
|
||||||
|
// (substitution, insertion or deletion)
|
||||||
|
func oneDifference(s1, s2 []byte) bool {
|
||||||
|
l1 := len(s1)
|
||||||
|
l2 := len(s2)
|
||||||
|
|
||||||
|
// Case 1: same lengths → test substitution
|
||||||
|
if l1 == l2 {
|
||||||
|
diff := 0
|
||||||
|
for i := 0; i < l1; i++ {
|
||||||
|
if s1[i] != s2[i] {
|
||||||
|
diff++
|
||||||
|
if diff > 1 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return diff == 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 2: difference of 1 character → insertion/deletion
|
||||||
|
if l1 == l2+1 {
|
||||||
|
for i := 0; i < l1; i++ {
|
||||||
|
if string(s1[:i])+string(s1[i+1:]) == string(s2) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if l2 == l1+1 {
|
||||||
|
for i := 0; i < l2; i++ {
|
||||||
|
if string(s2[:i])+string(s2[i+1:]) == string(s1) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 3: difference > 1 character
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func GetChimera(sequence *obiseq.BioSequence) map[string]string {
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
iobistatus, ok := annotation["chimera"]
|
||||||
|
var chimera map[string]string
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
switch iobistatus := iobistatus.(type) {
|
||||||
|
case map[string]string:
|
||||||
|
chimera = iobistatus
|
||||||
|
case map[string]interface{}:
|
||||||
|
chimera = make(map[string]string)
|
||||||
|
for k, v := range iobistatus {
|
||||||
|
chimera[k], err = obiutils.InterfaceToString(v)
|
||||||
|
if err != nil {
|
||||||
|
log.Panicf("chimera attribute of sequence %s must be castable to a map[string]string", sequence.Id())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
chimera = make(map[string]string)
|
||||||
|
annotation["chimera"] = chimera
|
||||||
|
}
|
||||||
|
|
||||||
|
return chimera
|
||||||
|
}
|
||||||
|
|
||||||
|
// add the suffix/prefix comparisons to detect chimeras (handle IUPAC codes)
|
||||||
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
||||||
|
|
||||||
w := func(sample string, seqs *[]*seqPCR) {
|
w := func(sample string, seqs *[]*seqPCR) {
|
||||||
ls := len(*seqs)
|
ls := len(*seqs)
|
||||||
cp := make([]int, ls)
|
|
||||||
cs := make([]int, ls)
|
|
||||||
|
|
||||||
pcrs := make([]*seqPCR, 0, ls)
|
pcrs := make([]*seqPCR, 0, ls)
|
||||||
|
|
||||||
|
// select only sequences without edges (head sequences)
|
||||||
for _, s := range *seqs {
|
for _, s := range *seqs {
|
||||||
if len(s.Edges) == 0 {
|
if len(s.Edges) == 0 {
|
||||||
pcrs = append(pcrs, s)
|
pcrs = append(pcrs, s)
|
||||||
@@ -75,66 +141,63 @@ func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
|||||||
|
|
||||||
lp := len(pcrs)
|
lp := len(pcrs)
|
||||||
|
|
||||||
|
// sort by increasing weight (like increasing abundance)
|
||||||
sort.Slice(pcrs, func(i, j int) bool {
|
sort.Slice(pcrs, func(i, j int) bool {
|
||||||
return pcrs[i].Weight < pcrs[j].Weight
|
return pcrs[i].Weight < pcrs[j].Weight
|
||||||
})
|
})
|
||||||
|
|
||||||
for i, s := range pcrs {
|
for i, s := range pcrs {
|
||||||
for j := i + 1; j < lp; j++ {
|
seqRef := s.Sequence
|
||||||
s2 := pcrs[j]
|
L := seqRef.Len()
|
||||||
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
|
|
||||||
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
|
|
||||||
}
|
|
||||||
|
|
||||||
var cm map[string]string
|
maxLeft, maxRight := 0, 0
|
||||||
var err error
|
var nameLeft, nameRight string
|
||||||
|
|
||||||
chimera, ok := s.Sequence.GetAttribute("chimera")
|
// looking for potential parents
|
||||||
|
for j := 0; j < lp; j++ {
|
||||||
|
if j == i {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seqParent := pcrs[j].Sequence
|
||||||
|
|
||||||
if !ok {
|
// Check abundance: parent must be more abundant
|
||||||
cm = map[string]string{}
|
if pcrs[j].Weight <= s.Weight {
|
||||||
} else {
|
continue
|
||||||
cm, err = obiutils.InterfaceToStringMap(chimera)
|
}
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("type of chimera not map[string]string: %T (%v)",
|
// Check edit distance (skip if only one diff, supposed to never happen)
|
||||||
chimera, err)
|
if oneDifference(seqRef.Sequence(), seqParent.Sequence()) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Common prefix
|
||||||
|
left := commonPrefix(seqRef, seqParent)
|
||||||
|
if left > maxLeft {
|
||||||
|
maxLeft = left
|
||||||
|
nameLeft = seqParent.Id()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Common suffix
|
||||||
|
right := commonSuffix(seqRef, seqParent)
|
||||||
|
if right > maxRight {
|
||||||
|
maxRight = right
|
||||||
|
nameRight = seqParent.Id()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ls := s.Sequence.Len()
|
// Select parents with longuest prefix/suffix
|
||||||
|
// Condition prefix+suffix covers the sequence and sequence not include into parent
|
||||||
|
if maxLeft >= L-maxRight && maxLeft > 0 && maxRight < L {
|
||||||
|
|
||||||
for k := i + 1; k < lp; k++ {
|
chimeraMap := GetChimera(s.Sequence)
|
||||||
for l := i + 1; l < lp; l++ {
|
// overlap sequence
|
||||||
if k != l && cp[k]+cs[l] == ls {
|
overlap := seqRef.Sequence()[L-maxRight : maxLeft]
|
||||||
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
|
chimeraMap[sample] = fmt.Sprintf("{%s}/{%s}@(%s)(%d)(%d)(%d)", nameLeft, nameRight, overlap, L-maxRight, maxLeft, len(overlap))
|
||||||
pcrs[k].Sequence.Id(),
|
|
||||||
pcrs[l].Sequence.Id(),
|
|
||||||
cp[k])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(cm) > 0 {
|
|
||||||
s.Sequence.SetAttribute("chimera", cm)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pbopt := make([]progressbar.Option, 0, 5)
|
|
||||||
pbopt = append(pbopt,
|
|
||||||
progressbar.OptionSetWriter(os.Stderr),
|
|
||||||
progressbar.OptionSetWidth(15),
|
|
||||||
progressbar.OptionShowIts(),
|
|
||||||
progressbar.OptionSetPredictTime(true),
|
|
||||||
progressbar.OptionSetDescription("[Chimera detection]"),
|
|
||||||
)
|
|
||||||
|
|
||||||
bar := progressbar.NewOptions(len(samples), pbopt...)
|
|
||||||
|
|
||||||
for sn, sqs := range samples {
|
for sn, sqs := range samples {
|
||||||
w(sn, sqs)
|
w(sn, sqs)
|
||||||
bar.Add(1)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ package obimatrix
|
|||||||
import (
|
import (
|
||||||
"encoding/csv"
|
"encoding/csv"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
@@ -11,41 +12,55 @@ import (
|
|||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
"golang.org/x/exp/maps"
|
"golang.org/x/exp/maps"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MatrixData map[string]map[string]interface{}
|
type MatrixData struct {
|
||||||
|
matrix map[string]map[string]interface{}
|
||||||
|
attributes map[string]map[string]interface{}
|
||||||
|
attributeList []string
|
||||||
|
naValue string
|
||||||
|
}
|
||||||
|
|
||||||
// MakeMatrixData generates a MatrixData instance.
|
// MakeMatrixData generates a MatrixData instance.
|
||||||
//
|
//
|
||||||
// No parameters.
|
// No parameters.
|
||||||
// Returns a MatrixData.
|
// Returns a MatrixData.
|
||||||
func MakeMatrixData() MatrixData {
|
func MakeMatrixData(naValue string, attributes ...string) MatrixData {
|
||||||
return make(MatrixData)
|
return MatrixData{
|
||||||
|
matrix: make(map[string]map[string]interface{}),
|
||||||
|
attributes: make(map[string]map[string]interface{}),
|
||||||
|
attributeList: slices.Clone(attributes),
|
||||||
|
naValue: naValue,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewMatrixData creates a new instance of MatrixData.
|
// NewMatrixData creates a new instance of MatrixData.
|
||||||
//
|
//
|
||||||
// It does not take any parameters.
|
// It does not take any parameters.
|
||||||
// It returns a pointer to a MatrixData object.
|
// It returns a pointer to a MatrixData object.
|
||||||
func NewMatrixData() *MatrixData {
|
func NewMatrixData(naValue string, attributes ...string) *MatrixData {
|
||||||
m := make(MatrixData)
|
m := MakeMatrixData(naValue, attributes...)
|
||||||
return &m
|
return &m
|
||||||
}
|
}
|
||||||
|
|
||||||
// TransposeMatrixData transposes the MatrixData.
|
// TransposeMatrixData transposes the MatrixData.
|
||||||
//
|
//
|
||||||
// It takes no parameters.
|
// It takes no parameters.
|
||||||
|
// If the input matrix has attributes, they are lost.
|
||||||
|
// A unique attribute "id" is added to store the column ids of the input matrix.
|
||||||
// It returns a pointer to the transposed MatrixData.
|
// It returns a pointer to the transposed MatrixData.
|
||||||
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
||||||
m := make(MatrixData)
|
m := MakeMatrixData(matrix.naValue, "id")
|
||||||
for k, v := range *matrix {
|
for k, v := range *&matrix.matrix {
|
||||||
for kk, vv := range v {
|
for kk, vv := range v {
|
||||||
if _, ok := m[kk]; !ok {
|
if _, ok := m.matrix[kk]; !ok {
|
||||||
m[kk] = make(map[string]interface{})
|
m.matrix[kk] = make(map[string]interface{})
|
||||||
}
|
}
|
||||||
m[kk][k] = vv
|
m.matrix[kk][k] = vv
|
||||||
|
m.attributes[kk] = map[string]interface{}{"id": k}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &m
|
return &m
|
||||||
@@ -58,11 +73,12 @@ func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
|||||||
// Returns the pointer to the merged MatrixData.
|
// Returns the pointer to the merged MatrixData.
|
||||||
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
||||||
|
|
||||||
for k := range *data2 {
|
for k := range data2.matrix {
|
||||||
if _, ok := (*data1)[k]; ok {
|
if _, ok := data1.matrix[k]; ok {
|
||||||
log.Panicf("Sequence Id %s exists at least twice in the data set", k)
|
log.Panicf("Sequence Id %s exists at least twice in the data set", k)
|
||||||
} else {
|
} else {
|
||||||
(*data1)[k] = (*data2)[k]
|
data1.matrix[k] = data2.matrix[k]
|
||||||
|
data1.attributes[k] = data2.attributes[k]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,21 +93,74 @@ func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
|||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - *MatrixData: The updated MatrixData object.
|
// - *MatrixData: The updated MatrixData object.
|
||||||
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData {
|
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string, strict bool) *MatrixData {
|
||||||
|
|
||||||
|
sid := s.Id()
|
||||||
|
|
||||||
|
if _, ok := data.matrix[sid]; ok {
|
||||||
|
log.Panicf("Sequence Id %s exists at least twice in the data set", sid)
|
||||||
|
}
|
||||||
if v, ok := s.GetAttribute(mapkey); ok {
|
if v, ok := s.GetAttribute(mapkey); ok {
|
||||||
if m, ok := v.(*obiseq.StatsOnValues); ok {
|
if m, ok := v.(*obiseq.StatsOnValues); ok {
|
||||||
m.RLock()
|
m.RLock()
|
||||||
(*data)[s.Id()] = obiutils.MapToMapInterface(m.Map())
|
data.matrix[sid] = obiutils.MapToMapInterface(m.Map())
|
||||||
m.RUnlock()
|
m.RUnlock()
|
||||||
} else if obiutils.IsAMap(v) {
|
} else if obiutils.IsAMap(v) {
|
||||||
(*data)[s.Id()] = obiutils.MapToMapInterface(v)
|
data.matrix[sid] = obiutils.MapToMapInterface(v)
|
||||||
} else {
|
} else {
|
||||||
log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
|
log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
|
if strict {
|
||||||
|
log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
|
||||||
|
}
|
||||||
|
data.matrix[sid] = make(map[string]interface{})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
attrs := make(map[string]interface{}, len(data.attributeList))
|
||||||
|
for _, attrname := range data.attributeList {
|
||||||
|
var value interface{}
|
||||||
|
ok := false
|
||||||
|
switch attrname {
|
||||||
|
case "id":
|
||||||
|
value = s.Id
|
||||||
|
ok = true
|
||||||
|
case "count":
|
||||||
|
value = s.Count()
|
||||||
|
ok = true
|
||||||
|
case "taxon":
|
||||||
|
taxon := s.Taxon(nil)
|
||||||
|
if taxon != nil {
|
||||||
|
value = taxon.String()
|
||||||
|
|
||||||
|
} else {
|
||||||
|
value = s.Taxid()
|
||||||
|
}
|
||||||
|
ok = true
|
||||||
|
case "sequence":
|
||||||
|
value = s.String()
|
||||||
|
ok = true
|
||||||
|
case "quality":
|
||||||
|
if s.HasQualities() {
|
||||||
|
l := s.Len()
|
||||||
|
q := s.Qualities()
|
||||||
|
ascii := make([]byte, l)
|
||||||
|
quality_shift := obidefault.WriteQualitiesShift()
|
||||||
|
for j := 0; j < l; j++ {
|
||||||
|
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||||
|
}
|
||||||
|
value = string(ascii)
|
||||||
|
ok = true
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
value, ok = s.GetAttribute(attrname)
|
||||||
|
}
|
||||||
|
if ok {
|
||||||
|
attrs[attrname] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
data.attributes[sid] = attrs
|
||||||
|
|
||||||
return data
|
return data
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -101,6 +170,49 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
|||||||
waiter := sync.WaitGroup{}
|
waiter := sync.WaitGroup{}
|
||||||
|
|
||||||
mapAttribute := CLIMapAttribute()
|
mapAttribute := CLIMapAttribute()
|
||||||
|
attribList := make([]string, 0)
|
||||||
|
|
||||||
|
if obicsv.CLIPrintId() {
|
||||||
|
attribList = append(attribList, "id")
|
||||||
|
}
|
||||||
|
|
||||||
|
if obicsv.CLIPrintCount() {
|
||||||
|
attribList = append(attribList, "count")
|
||||||
|
}
|
||||||
|
|
||||||
|
if obicsv.CLIPrintTaxon() {
|
||||||
|
attribList = append(attribList, "taxon")
|
||||||
|
}
|
||||||
|
|
||||||
|
if obicsv.CLIPrintDefinition() {
|
||||||
|
attribList = append(attribList, "definition")
|
||||||
|
}
|
||||||
|
|
||||||
|
if obicsv.CLIPrintSequence() {
|
||||||
|
attribList = append(attribList, "sequence")
|
||||||
|
}
|
||||||
|
|
||||||
|
if obicsv.CLIPrintQuality() {
|
||||||
|
attribList = append(attribList, "qualities")
|
||||||
|
}
|
||||||
|
|
||||||
|
attribList = append(attribList, obicsv.CLIToBeKeptAttributes()...)
|
||||||
|
|
||||||
|
if obicsv.CLIAutoColumns() {
|
||||||
|
if iterator.Next() {
|
||||||
|
batch := iterator.Get()
|
||||||
|
if len(batch.Slice()) == 0 {
|
||||||
|
log.Panicf("first batch should not be empty")
|
||||||
|
}
|
||||||
|
auto_slot := batch.Slice().AttributeKeys(true, true).Members()
|
||||||
|
slices.Sort(auto_slot)
|
||||||
|
attribList = append(attribList, auto_slot...)
|
||||||
|
iterator.PushBack()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
naValue := obicsv.CLINAValue()
|
||||||
|
strict := CLIStrict()
|
||||||
|
|
||||||
summaries := make([]*MatrixData, nproc)
|
summaries := make([]*MatrixData, nproc)
|
||||||
|
|
||||||
@@ -109,7 +221,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
|||||||
for iseq.Next() {
|
for iseq.Next() {
|
||||||
batch := iseq.Get()
|
batch := iseq.Get()
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
summary.Update(seq, mapAttribute)
|
summary.Update(seq, mapAttribute, strict)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
waiter.Done()
|
waiter.Done()
|
||||||
@@ -117,11 +229,11 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
|||||||
|
|
||||||
waiter.Add(nproc)
|
waiter.Add(nproc)
|
||||||
|
|
||||||
summaries[0] = NewMatrixData()
|
summaries[0] = NewMatrixData(naValue, attribList...)
|
||||||
go ff(iterator, summaries[0])
|
go ff(iterator, summaries[0])
|
||||||
|
|
||||||
for i := 1; i < nproc; i++ {
|
for i := 1; i < nproc; i++ {
|
||||||
summaries[i] = NewMatrixData()
|
summaries[i] = NewMatrixData(naValue, attribList...)
|
||||||
go ff(iterator.Split(), summaries[i])
|
go ff(iterator.Split(), summaries[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,7 +250,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func CLIWriteCSVToStdout(matrix *MatrixData) {
|
func CLIWriteCSVToStdout(matrix *MatrixData) {
|
||||||
navalue := CLINaValue()
|
navalue := CLIMapNaValue()
|
||||||
csvwriter := csv.NewWriter(os.Stdout)
|
csvwriter := csv.NewWriter(os.Stdout)
|
||||||
|
|
||||||
if CLITranspose() {
|
if CLITranspose() {
|
||||||
@@ -147,33 +259,45 @@ func CLIWriteCSVToStdout(matrix *MatrixData) {
|
|||||||
|
|
||||||
samples := obiutils.NewSet[string]()
|
samples := obiutils.NewSet[string]()
|
||||||
|
|
||||||
for _, v := range *matrix {
|
for _, v := range matrix.matrix {
|
||||||
samples.Add(maps.Keys(v)...)
|
samples.Add(maps.Keys(v)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
osamples := samples.Members()
|
osamples := samples.Members()
|
||||||
sort.Strings(osamples)
|
sort.Strings(osamples)
|
||||||
|
columns := make([]string, 0, len(osamples)+len(matrix.attributeList))
|
||||||
columns := make([]string, 1, len(osamples)+1)
|
columns = append(columns, matrix.attributeList...)
|
||||||
columns[0] = "id"
|
|
||||||
columns = append(columns, osamples...)
|
columns = append(columns, osamples...)
|
||||||
|
|
||||||
|
header := slices.Clone(columns)
|
||||||
|
|
||||||
csvwriter.Write(columns)
|
csvwriter.Write(columns)
|
||||||
|
nattribs := len(matrix.attributeList)
|
||||||
|
|
||||||
for k, data := range *matrix {
|
for k, data := range matrix.matrix {
|
||||||
columns = columns[0:1]
|
attrs := matrix.attributes[k]
|
||||||
columns[0] = k
|
for i, kk := range header {
|
||||||
for _, kk := range osamples {
|
if i < nattribs {
|
||||||
if v, ok := data[kk]; ok {
|
if v, ok := attrs[kk]; ok {
|
||||||
vs, err := obiutils.InterfaceToString(v)
|
vs, err := obiutils.InterfaceToString(v)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
||||||
|
}
|
||||||
|
columns[i] = vs
|
||||||
|
} else {
|
||||||
|
columns[i] = matrix.naValue
|
||||||
}
|
}
|
||||||
columns = append(columns, vs)
|
|
||||||
} else {
|
} else {
|
||||||
columns = append(columns, navalue)
|
if v, ok := data[kk]; ok {
|
||||||
|
vs, err := obiutils.InterfaceToString(v)
|
||||||
|
if err != nil {
|
||||||
|
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
||||||
|
}
|
||||||
|
columns[i] = vs
|
||||||
|
} else {
|
||||||
|
columns[i] = navalue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
csvwriter.Write(columns)
|
csvwriter.Write(columns)
|
||||||
}
|
}
|
||||||
@@ -187,8 +311,8 @@ func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
|
|||||||
csvwriter := csv.NewWriter(os.Stdout)
|
csvwriter := csv.NewWriter(os.Stdout)
|
||||||
|
|
||||||
csvwriter.Write([]string{"id", sname, vname})
|
csvwriter.Write([]string{"id", sname, vname})
|
||||||
for seqid := range *matrix {
|
for seqid := range matrix.matrix {
|
||||||
for attr, v := range (*matrix)[seqid] {
|
for attr, v := range matrix.matrix[seqid] {
|
||||||
vs, err := obiutils.InterfaceToString(v)
|
vs, err := obiutils.InterfaceToString(v)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
|
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ package obimatrix
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
|
||||||
"github.com/DavidGamba/go-getoptions"
|
"github.com/DavidGamba/go-getoptions"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -14,7 +15,8 @@ var __transpose__ = true
|
|||||||
var __mapAttribute__ = "merged_sample"
|
var __mapAttribute__ = "merged_sample"
|
||||||
var __valueName__ = "count"
|
var __valueName__ = "count"
|
||||||
var __sampleName__ = "sample"
|
var __sampleName__ = "sample"
|
||||||
var __NAValue__ = "0"
|
var __MapNAValue__ = "0"
|
||||||
|
var __AllowEmpty__ = false
|
||||||
|
|
||||||
func MatrixOptionSet(options *getoptions.GetOpt) {
|
func MatrixOptionSet(options *getoptions.GetOpt) {
|
||||||
options.BoolVar(&__threeColumns__, "three-columns", false,
|
options.BoolVar(&__threeColumns__, "three-columns", false,
|
||||||
@@ -32,12 +34,16 @@ func MatrixOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.StringVar(&__sampleName__, "sample-name", __sampleName__,
|
options.StringVar(&__sampleName__, "sample-name", __sampleName__,
|
||||||
options.Description("Name of the coulumn containing the sample names in the three column format."))
|
options.Description("Name of the coulumn containing the sample names in the three column format."))
|
||||||
|
|
||||||
options.StringVar(&__NAValue__, "na-value", __NAValue__,
|
options.StringVar(&__MapNAValue__, "map-na-value", __MapNAValue__,
|
||||||
options.Description("Value used when the map attribute is not defined for a sequence."))
|
options.Description("Value used when the map attribute is not defined for a sequence."))
|
||||||
|
|
||||||
|
options.BoolVar(&__AllowEmpty__, "allow-empty", __AllowEmpty__,
|
||||||
|
options.Description("Allow sequences with empty map"))
|
||||||
}
|
}
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
MatrixOptionSet(options)
|
MatrixOptionSet(options)
|
||||||
|
obicsv.CSVOptionSet(options)
|
||||||
obiconvert.InputOptionSet(options)
|
obiconvert.InputOptionSet(options)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -57,8 +63,8 @@ func CLISampleName() string {
|
|||||||
return __sampleName__
|
return __sampleName__
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLINaValue() string {
|
func CLIMapNaValue() string {
|
||||||
return __NAValue__
|
return __MapNAValue__
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLIMapAttribute() string {
|
func CLIMapAttribute() string {
|
||||||
@@ -68,3 +74,7 @@ func CLIMapAttribute() string {
|
|||||||
func CLITranspose() bool {
|
func CLITranspose() bool {
|
||||||
return __transpose__
|
return __transpose__
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLIStrict() bool {
|
||||||
|
return !__AllowEmpty__
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user