2022-02-14 00:01:01 +01:00
|
|
|
package obiseq
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2024-06-19 13:15:30 +02:00
|
|
|
"math"
|
2022-09-08 07:50:17 +02:00
|
|
|
"reflect"
|
2022-02-15 00:47:02 +01:00
|
|
|
"strings"
|
2022-05-27 11:53:29 +03:00
|
|
|
|
2023-11-29 12:14:37 +01:00
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
2022-05-27 11:53:29 +03:00
|
|
|
log "github.com/sirupsen/logrus"
|
2022-02-14 00:01:01 +01:00
|
|
|
)
|
|
|
|
|
|
|
|
// StatsOnValues maps the string form of an attribute value to an integer
// count. It is the type stored in the "merged_<key>" annotation slots
// (see StatsOnSlotName).
type StatsOnValues map[string]int
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// StatsOnSlotName returns the name of the annotation slot that summarizes
// the statistics of occurrence for a given attribute.
//
// Parameters:
//   - key: the attribute key
//
// The slot name is the attribute key prefixed with "merged_"; an empty key
// therefore yields the bare prefix.
func StatsOnSlotName(key string) string {
	return "merged_" + key
}
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - key: the attribute key (string)
|
|
|
|
//
|
|
|
|
// Return type:
|
|
|
|
// - bool
|
2022-02-21 19:00:23 +01:00
|
|
|
func (sequence *BioSequence) HasStatsOn(key string) bool {
|
2022-02-14 00:01:01 +01:00
|
|
|
if !sequence.HasAnnotation() {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2022-06-14 09:54:33 +02:00
|
|
|
mkey := StatsOnSlotName(key)
|
2022-02-14 00:01:01 +01:00
|
|
|
annotations := sequence.Annotations()
|
|
|
|
_, ok := annotations[mkey]
|
|
|
|
|
|
|
|
return ok
|
|
|
|
}
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// StatsOn returns the slot summarizing statistics of occurrence for a given attribute.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - key: the attribute key (string) to be summarized
|
|
|
|
// - na: the value to be used if the attribute is not present
|
|
|
|
//
|
|
|
|
// Return type:
|
|
|
|
// - StatsOnValues
|
2022-02-21 19:00:23 +01:00
|
|
|
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
2022-06-14 09:54:33 +02:00
|
|
|
mkey := StatsOnSlotName(key)
|
2022-02-14 00:01:01 +01:00
|
|
|
annotations := sequence.Annotations()
|
|
|
|
istat, ok := annotations[mkey]
|
|
|
|
|
|
|
|
var stats StatsOnValues
|
|
|
|
var newstat bool
|
|
|
|
|
|
|
|
if ok {
|
|
|
|
switch istat := istat.(type) {
|
|
|
|
case StatsOnValues:
|
|
|
|
stats = istat
|
|
|
|
newstat = false
|
2022-08-20 18:01:07 +02:00
|
|
|
case map[string]int:
|
|
|
|
stats = istat
|
|
|
|
newstat = false
|
2022-05-27 11:53:29 +03:00
|
|
|
case map[string]interface{}:
|
|
|
|
stats = make(StatsOnValues, len(istat))
|
2022-05-30 16:28:59 +02:00
|
|
|
newstat = false
|
2022-05-27 11:53:29 +03:00
|
|
|
var err error
|
|
|
|
for k, v := range istat {
|
2023-03-24 10:25:12 +07:00
|
|
|
stats[k], err = obiutils.InterfaceToInt(v)
|
2022-05-27 11:53:29 +03:00
|
|
|
if err != nil {
|
|
|
|
log.Panicf("In sequence %s : %s stat tag not only containing integer values %s",
|
|
|
|
sequence.Id(), mkey, istat)
|
|
|
|
}
|
|
|
|
}
|
2022-02-14 00:01:01 +01:00
|
|
|
default:
|
|
|
|
stats = make(StatsOnValues, 100)
|
|
|
|
annotations[mkey] = stats
|
|
|
|
newstat = true
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
stats = make(StatsOnValues, 100)
|
|
|
|
annotations[mkey] = stats
|
|
|
|
newstat = true
|
|
|
|
}
|
|
|
|
|
2022-02-18 10:00:42 +01:00
|
|
|
if newstat && sequence.StatsPlusOne(key, sequence, na) {
|
2022-02-14 00:01:01 +01:00
|
|
|
delete(sequence.Annotations(), key)
|
|
|
|
}
|
|
|
|
|
|
|
|
return stats
|
|
|
|
}
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - key: the attribute key (string) to be summarized
|
|
|
|
// - toAdd: the BioSequence to add to the stats
|
|
|
|
// - na: the value to be used if the attribute is not present
|
|
|
|
// Return type:
|
|
|
|
// - bool
|
2022-02-21 19:00:23 +01:00
|
|
|
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
|
2022-02-18 10:00:42 +01:00
|
|
|
sval := na
|
2022-06-14 09:54:33 +02:00
|
|
|
annotations := sequence.Annotations()
|
2022-02-21 19:00:23 +01:00
|
|
|
stats := sequence.StatsOn(key, na)
|
2022-02-18 10:00:42 +01:00
|
|
|
retval := false
|
|
|
|
|
2022-02-14 00:01:01 +01:00
|
|
|
if toAdd.HasAnnotation() {
|
|
|
|
value, ok := toAdd.Annotations()[key]
|
|
|
|
|
|
|
|
if ok {
|
|
|
|
|
|
|
|
switch value := value.(type) {
|
|
|
|
case string:
|
|
|
|
sval = value
|
|
|
|
case int,
|
|
|
|
uint8, uint16, uint32, uint64,
|
|
|
|
int8, int16, int32, int64, bool:
|
|
|
|
sval = fmt.Sprint(value)
|
2024-06-19 13:15:30 +02:00
|
|
|
case float64:
|
|
|
|
if math.Floor(value) == value {
|
|
|
|
sval = fmt.Sprint(int(value))
|
|
|
|
} else {
|
|
|
|
log.Fatalf("Trying to make stats on a float value (%v : %T)", value, value)
|
|
|
|
}
|
2022-02-14 00:01:01 +01:00
|
|
|
default:
|
2024-06-19 13:15:30 +02:00
|
|
|
log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v : %T)", value, value)
|
2022-02-14 00:01:01 +01:00
|
|
|
}
|
2022-02-18 10:00:42 +01:00
|
|
|
retval = true
|
2022-02-14 00:01:01 +01:00
|
|
|
}
|
2022-02-18 10:00:42 +01:00
|
|
|
|
2022-02-14 00:01:01 +01:00
|
|
|
}
|
|
|
|
|
2022-02-18 10:00:42 +01:00
|
|
|
old, ok := stats[sval]
|
|
|
|
if !ok {
|
|
|
|
old = 0
|
|
|
|
}
|
2022-06-14 09:54:33 +02:00
|
|
|
stats[sval] = old + toAdd.Count()
|
2024-05-07 10:54:12 +02:00
|
|
|
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
|
2022-02-18 10:00:42 +01:00
|
|
|
return retval
|
2022-02-14 00:01:01 +01:00
|
|
|
}
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// Merge merges the given StatsOnValues with the current StatsOnValues.
|
|
|
|
//
|
|
|
|
// It takes a parameter `toMerged` of type StatsOnValues, which represents the StatsOnValues to be merged.
|
|
|
|
// It returns a value of type StatsOnValues, which represents the merged StatsOnValues.
|
2022-02-14 00:01:01 +01:00
|
|
|
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
|
|
|
for k, val := range toMerged {
|
|
|
|
old, ok := stats[k]
|
|
|
|
if !ok {
|
|
|
|
old = 0
|
|
|
|
}
|
|
|
|
stats[k] = old + val
|
|
|
|
}
|
|
|
|
|
|
|
|
return stats
|
|
|
|
}
|
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// Merge merges two sequences into a single sequence.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - tomerge: the sequence to be merged (BioSequence)
|
|
|
|
// - na: the value to be used if the attribute is not present (string)
|
|
|
|
// - inplace: a boolean indicating whether to merge in place or not (bool)
|
|
|
|
// - statsOn: a variadic string parameter representing the attributes to be summarized (string)
|
|
|
|
//
|
|
|
|
// Return type:
|
|
|
|
// - *BioSequence: the merged sequence (BioSequence)
|
2022-02-21 19:00:23 +01:00
|
|
|
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
|
2022-02-14 00:01:01 +01:00
|
|
|
if !inplace {
|
|
|
|
sequence = sequence.Copy()
|
|
|
|
}
|
|
|
|
|
2022-02-15 00:47:02 +01:00
|
|
|
if sequence.HasQualities() {
|
|
|
|
sequence.SetQualities(nil)
|
|
|
|
}
|
|
|
|
|
2022-06-14 09:54:33 +02:00
|
|
|
annotations := sequence.Annotations()
|
2022-02-14 00:01:01 +01:00
|
|
|
|
2022-02-18 10:00:42 +01:00
|
|
|
count := sequence.Count() + tomerge.Count()
|
2022-02-14 00:01:01 +01:00
|
|
|
|
2022-02-18 10:00:42 +01:00
|
|
|
for _, key := range statsOn {
|
2022-02-14 00:01:01 +01:00
|
|
|
if tomerge.HasStatsOn(key) {
|
2022-02-21 19:00:23 +01:00
|
|
|
smk := sequence.StatsOn(key, na)
|
|
|
|
mmk := tomerge.StatsOn(key, na)
|
2022-06-14 09:54:33 +02:00
|
|
|
|
|
|
|
annotations[StatsOnSlotName(key)] = smk.Merge(mmk)
|
2022-02-14 00:01:01 +01:00
|
|
|
} else {
|
2022-02-21 19:00:23 +01:00
|
|
|
sequence.StatsPlusOne(key, tomerge, na)
|
2022-02-14 00:01:01 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-02-15 00:47:02 +01:00
|
|
|
if tomerge.HasAnnotation() {
|
|
|
|
ma := tomerge.Annotations()
|
2022-06-14 09:54:33 +02:00
|
|
|
for k, va := range annotations {
|
2022-02-15 00:47:02 +01:00
|
|
|
if !strings.HasPrefix(k, "merged_") {
|
|
|
|
vm, ok := ma[k]
|
2022-09-08 07:50:17 +02:00
|
|
|
if ok {
|
|
|
|
switch vm := vm.(type) {
|
|
|
|
case int, float64, string, bool:
|
|
|
|
if va != vm {
|
|
|
|
delete(annotations, k)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
if !reflect.DeepEqual(va, vm) {
|
|
|
|
delete(annotations, k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
2022-06-14 09:54:33 +02:00
|
|
|
delete(annotations, k)
|
2022-02-15 00:47:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2022-06-14 09:54:33 +02:00
|
|
|
for k := range annotations {
|
2022-02-15 00:47:02 +01:00
|
|
|
if !strings.HasPrefix(k, "merged_") {
|
2022-06-14 09:54:33 +02:00
|
|
|
delete(annotations, k)
|
2022-02-15 00:47:02 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-14 09:54:33 +02:00
|
|
|
annotations["count"] = count
|
2022-02-14 00:01:01 +01:00
|
|
|
return sequence
|
|
|
|
}
|
2022-02-15 00:47:02 +01:00
|
|
|
|
2024-05-07 10:54:12 +02:00
|
|
|
// Merge merges the given sequences into a single sequence.
|
|
|
|
//
|
|
|
|
// Parameters:
|
|
|
|
// - sequences: a slice of BioSequence objects to be merged (BioSequenceSlice)
|
|
|
|
// - na: the value to be used if the attribute is not present (string)
|
|
|
|
// - statsOn: a slice of strings representing the attributes to be summarized ([]string)
|
|
|
|
//
|
|
|
|
// Return type:
|
|
|
|
// - *BioSequence: the merged sequence (BioSequence)
|
2022-02-21 19:00:23 +01:00
|
|
|
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
|
2022-02-18 10:00:42 +01:00
|
|
|
seq := sequences[0]
|
2022-02-21 19:00:23 +01:00
|
|
|
//sequences[0] = nil
|
2022-02-18 10:00:42 +01:00
|
|
|
seq.SetQualities(nil)
|
2022-02-15 00:47:02 +01:00
|
|
|
|
2022-02-21 19:00:23 +01:00
|
|
|
if len(sequences) == 1 {
|
2022-05-30 16:28:59 +02:00
|
|
|
seq.Annotations()["count"] = seq.Count()
|
2022-02-21 19:00:23 +01:00
|
|
|
for _, v := range statsOn {
|
|
|
|
seq.StatsOn(v, na)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for k, toMerge := range sequences[1:] {
|
|
|
|
seq.Merge(toMerge, na, true, statsOn...)
|
|
|
|
toMerge.Recycle()
|
|
|
|
sequences[1+k] = nil
|
|
|
|
}
|
2022-02-15 00:47:02 +01:00
|
|
|
}
|
|
|
|
|
2023-03-28 19:37:05 +07:00
|
|
|
sequences.Recycle(false)
|
2022-02-21 19:00:23 +01:00
|
|
|
return seq
|
|
|
|
|
2022-02-15 00:47:02 +01:00
|
|
|
}
|