Files
obitools4/pkg/obitools/obimatrix/obimatrix.go
Eric Coissac 0844dcc607 bug obimatrix
2025-10-28 13:57:31 +01:00

326 lines
7.9 KiB
Go

package obimatrix
import (
"encoding/csv"
"os"
"slices"
"sort"
"sync"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"golang.org/x/exp/maps"
)
// MatrixData is a table of values indexed by row id, stored together with
// the attributes attached to each row.
type MatrixData struct {
	// matrix maps a row id to the cells of that row, keyed by column name.
	matrix map[string]map[string]any
	// attributes maps a row id to the attribute values recorded for that row.
	attributes map[string]map[string]any
	// attributeList is the ordered list of attribute names of the table.
	attributeList []string
	// naValue is the text written for an attribute that a row does not have.
	naValue string
}
// MakeMatrixData builds an empty MatrixData value.
//
// Parameters:
//   - naValue: the text used for attributes that a row does not have.
//   - attributes: the ordered names of the attributes of the table; the
//     slice is cloned, so later changes made by the caller are not seen.
//
// Returns a MatrixData whose matrix and attributes maps are allocated and
// empty.
func MakeMatrixData(naValue string, attributes ...string) MatrixData {
	var data MatrixData

	data.naValue = naValue
	data.attributeList = slices.Clone(attributes)
	data.matrix = map[string]map[string]any{}
	data.attributes = map[string]map[string]any{}

	return data
}
// NewMatrixData allocates an empty MatrixData and returns a pointer to it.
//
// Parameters:
//   - naValue: the text used for attributes that a row does not have.
//   - attributes: the ordered names of the attributes of the table.
//
// The value is initialised by MakeMatrixData with the same arguments.
func NewMatrixData(naValue string, attributes ...string) *MatrixData {
	data := new(MatrixData)
	*data = MakeMatrixData(naValue, attributes...)
	return data
}
// TransposeMatrixData returns a new MatrixData in which the rows and the
// columns of the receiver are swapped.
//
// Every column name of the receiver becomes a row id of the result, and
// every row id of the receiver becomes a column name. The receiver is not
// modified and its attributes are not carried over: the result declares the
// single attribute "id", which holds for each row the id of that row, i.e.
// the column name it had in the input matrix.
//
// It returns a pointer to the transposed MatrixData.
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
	m := MakeMatrixData(matrix.naValue, "id")

	for rowID, cells := range matrix.matrix {
		for colID, value := range cells {
			row, ok := m.matrix[colID]
			if !ok {
				row = make(map[string]any)
				m.matrix[colID] = row
				// The id of a transposed row is the input column it comes
				// from. Using the input row id here would label the row with
				// whichever input row happened to be visited last.
				m.attributes[colID] = map[string]any{"id": colID}
			}
			row[rowID] = value
		}
	}

	return &m
}
// MergeMatrixData adds every row of data2, with its attributes, to data1.
//
// The rows are not copied: after the call both objects refer to the same
// row and attribute maps. The method panics when a row id of data2 is
// already present in data1.
//
// It returns data1.
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
	for id, row := range data2.matrix {
		if _, duplicated := data1.matrix[id]; duplicated {
			log.Panicf("Sequence Id %s exists at least twice in the data set", id)
		}

		data1.matrix[id] = row
		data1.attributes[id] = data2.attributes[id]
	}

	return data1
}
// Update records the sequence s as a new row of the MatrixData.
//
// The cells of the row come from the map stored in the attribute mapkey of
// s: either a *obiseq.StatsOnValues, read under its read lock, or any value
// for which obiutils.IsAMap reports true. The attributes named in the
// attribute list of data are collected as well; an attribute that s does not
// provide is left out of the attribute map of the row.
//
// Parameters:
//   - s: the sequence to add; its id is used as the row id.
//   - mapkey: the name of the map attribute to read from s.
//   - strict: when true, a sequence without the mapkey attribute is an
//     error; when false, such a sequence gets an empty row.
//
// Update panics when the id of s is already present in data, when the
// mapkey attribute exists but is not a map, or when strict is set and the
// attribute is missing.
//
// Returns:
//   - *MatrixData: data itself, updated.
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string, strict bool) *MatrixData {
	sid := s.Id()

	if _, ok := data.matrix[sid]; ok {
		log.Panicf("Sequence Id %s exists at least twice in the data set", sid)
	}

	if v, ok := s.GetAttribute(mapkey); ok {
		if m, ok := v.(*obiseq.StatsOnValues); ok {
			// Keep the read lock for as long as the statistics map is read.
			m.RLock()
			data.matrix[sid] = obiutils.MapToMapInterface(m.Map())
			m.RUnlock()
		} else if obiutils.IsAMap(v) {
			data.matrix[sid] = obiutils.MapToMapInterface(v)
		} else {
			log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
		}
	} else {
		if strict {
			log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
		}
		data.matrix[sid] = make(map[string]interface{})
	}

	attrs := make(map[string]interface{}, len(data.attributeList))

	for _, attrname := range data.attributeList {
		var value interface{}
		ok := false

		switch attrname {
		case "id":
			value = s.Id()
			ok = true
		case "count":
			value = s.Count()
			ok = true
		case "taxon":
			// Use the taxon description when one is returned, otherwise the
			// taxid of the sequence.
			taxon := s.Taxon(nil)
			if taxon != nil {
				value = taxon.String()
			} else {
				value = s.Taxid()
			}
			ok = true
		case "sequence":
			value = s.String()
			ok = true
		case "quality", "qualities":
			// IMatrix asks for this column under the name "qualities", so
			// both spellings must select the quality string; otherwise the
			// column would be looked up as a plain attribute and stay empty.
			if s.HasQualities() {
				l := s.Len()
				q := s.Qualities()
				ascii := make([]byte, l)
				qualityShift := obidefault.WriteQualitiesShift()
				for j := 0; j < l; j++ {
					ascii[j] = uint8(q[j]) + uint8(qualityShift)
				}
				value = string(ascii)
				ok = true
			}
		default:
			value, ok = s.GetAttribute(attrname)
		}

		if ok {
			attrs[attrname] = value
		}
	}

	data.attributes[sid] = attrs

	return data
}
// IMatrix consumes iterator and returns the MatrixData built from all the
// sequences it provides.
//
// The attribute columns are selected from the obicsv command line options,
// followed by the attributes to be kept and, when automatic columns are
// requested, by the sorted attribute keys of the first batch, which is then
// pushed back onto the iterator. The sequences are processed by
// obidefault.ParallelWorkers() goroutines, each one filling its own
// MatrixData with Update; the partial results are merged once every worker
// has finished.
func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
	nproc := obidefault.ParallelWorkers()
	waiter := sync.WaitGroup{}
	mapAttribute := CLIMapAttribute()

	// Optional fixed columns, in output order.
	fixedColumns := []struct {
		enabled bool
		name    string
	}{
		{obicsv.CLIPrintId(), "id"},
		{obicsv.CLIPrintCount(), "count"},
		{obicsv.CLIPrintTaxon(), "taxon"},
		{obicsv.CLIPrintDefinition(), "definition"},
		{obicsv.CLIPrintSequence(), "sequence"},
		{obicsv.CLIPrintQuality(), "qualities"},
	}

	columns := []string{}
	for _, column := range fixedColumns {
		if column.enabled {
			columns = append(columns, column.name)
		}
	}
	columns = append(columns, obicsv.CLIToBeKeptAttributes()...)

	if obicsv.CLIAutoColumns() && iterator.Next() {
		first := iterator.Get()
		if len(first.Slice()) == 0 {
			log.Panicf("first batch should not be empty")
		}

		autoKeys := first.Slice().AttributeKeys(true, true).Members()
		slices.Sort(autoKeys)
		columns = append(columns, autoKeys...)

		// Give the batch back so that the workers process it as well.
		iterator.PushBack()
	}

	naValue := obicsv.CLINAValue()
	strict := CLIStrict()
	partials := make([]*MatrixData, nproc)

	// worker stores every sequence read from source into target.
	worker := func(source obiiter.IBioSequence, target *MatrixData) {
		for source.Next() {
			for _, sequence := range source.Get().Slice() {
				target.Update(sequence, mapAttribute, strict)
			}
		}
		waiter.Done()
	}

	waiter.Add(nproc)

	// The first worker reads the iterator itself, the others read a split of it.
	partials[0] = NewMatrixData(naValue, columns...)
	go worker(iterator, partials[0])

	for w := 1; w < nproc; w++ {
		partials[w] = NewMatrixData(naValue, columns...)
		go worker(iterator.Split(), partials[w])
	}

	waiter.Wait()
	obiutils.WaitForLastPipe()

	merged := partials[0]
	for _, partial := range partials[1:] {
		merged = merged.MergeMatrixData(partial)
	}

	return merged
}
// CLIWriteCSVToStdout writes matrix to the standard output as a CSV table.
//
// The matrix is transposed first when CLITranspose() is true. The header
// line lists the attribute names of the matrix followed by the sorted union
// of the cell names found in its rows. One line is then written per row, in
// ascending row id order so that the same input always produces the same
// output. A missing attribute is written as the NA value of the matrix and a
// missing cell as CLIMapNaValue().
//
// The function panics when a value cannot be converted to a string or when
// writing to the standard output fails.
func CLIWriteCSVToStdout(matrix *MatrixData) {
	navalue := CLIMapNaValue()
	csvwriter := csv.NewWriter(os.Stdout)

	if CLITranspose() {
		matrix = matrix.TransposeMatrixData()
	}

	samples := obiutils.NewSet[string]()
	for _, v := range matrix.matrix {
		samples.Add(maps.Keys(v)...)
	}

	osamples := samples.Members()
	sort.Strings(osamples)

	nattribs := len(matrix.attributeList)
	header := make([]string, 0, nattribs+len(osamples))
	header = append(header, matrix.attributeList...)
	header = append(header, osamples...)

	if err := csvwriter.Write(header); err != nil {
		log.Panicf("cannot write the CSV header: %v", err)
	}

	// Map iteration order is random: sort the row ids to get a stable output.
	rowIDs := make([]string, 0, len(matrix.matrix))
	for id := range matrix.matrix {
		rowIDs = append(rowIDs, id)
	}
	sort.Strings(rowIDs)

	// record is reused for every line; Write does not keep a reference to it.
	record := make([]string, len(header))

	for _, k := range rowIDs {
		data := matrix.matrix[k]
		attrs := matrix.attributes[k]

		for i, kk := range header {
			// The first nattribs columns are attributes, the others are cells.
			source, missing := data, navalue
			if i < nattribs {
				source, missing = attrs, matrix.naValue
			}

			record[i] = missing
			if v, ok := source[kk]; ok {
				vs, err := obiutils.InterfaceToString(v)
				if err != nil {
					log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
				}
				record[i] = vs
			}
		}

		if err := csvwriter.Write(record); err != nil {
			log.Panicf("cannot write the CSV line of sequence %s: %v", k, err)
		}
	}

	csvwriter.Flush()
	if err := csvwriter.Error(); err != nil {
		log.Panicf("cannot write the CSV output: %v", err)
	}
}
// CLIWriteThreeColumnsToStdout writes matrix to the standard output as a
// three column CSV table, one line per cell.
//
// The header is "id", CLISampleName() and CLIValueName(). Each following
// line holds a row id, a cell name and the cell value. Lines are ordered by
// row id, then by cell name, so that the same input always produces the
// same output.
//
// The function panics when a value cannot be converted to a string or when
// writing to the standard output fails.
func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
	sname := CLISampleName()
	vname := CLIValueName()
	csvwriter := csv.NewWriter(os.Stdout)

	if err := csvwriter.Write([]string{"id", sname, vname}); err != nil {
		log.Panicf("cannot write the CSV header: %v", err)
	}

	// Map iteration order is random: sort the keys to get a stable output.
	seqids := make([]string, 0, len(matrix.matrix))
	for seqid := range matrix.matrix {
		seqids = append(seqids, seqid)
	}
	sort.Strings(seqids)

	for _, seqid := range seqids {
		row := matrix.matrix[seqid]

		attrs := make([]string, 0, len(row))
		for attr := range row {
			attrs = append(attrs, attr)
		}
		sort.Strings(attrs)

		for _, attr := range attrs {
			v := row[attr]
			vs, err := obiutils.InterfaceToString(v)
			if err != nil {
				log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
			}

			if err := csvwriter.Write([]string{seqid, attr, vs}); err != nil {
				log.Panicf("cannot write the CSV line of sequence %s: %v", seqid, err)
			}
		}
	}

	csvwriter.Flush()
	if err := csvwriter.Error(); err != nil {
		log.Panicf("cannot write the CSV output: %v", err)
	}
}