mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
198 lines
4.5 KiB
Go
198 lines
4.5 KiB
Go
package obimatrix
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"os"
|
|
"sort"
|
|
"sync"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
"golang.org/x/exp/maps"
|
|
)
|
|
|
|
type MatrixData map[string]map[string]interface{}
|
|
|
|
// MakeMatrixData generates a MatrixData instance.
|
|
//
|
|
// No parameters.
|
|
// Returns a MatrixData.
|
|
func MakeMatrixData() MatrixData {
|
|
return make(MatrixData)
|
|
}
|
|
|
|
// NewMatrixData creates a new instance of MatrixData.
|
|
//
|
|
// It does not take any parameters.
|
|
// It returns a pointer to a MatrixData object.
|
|
func NewMatrixData() *MatrixData {
|
|
m := make(MatrixData)
|
|
return &m
|
|
}
|
|
|
|
// TransposeMatrixData transposes the MatrixData.
|
|
//
|
|
// It takes no parameters.
|
|
// It returns a pointer to the transposed MatrixData.
|
|
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
|
m := make(MatrixData)
|
|
for k, v := range *matrix {
|
|
for kk, vv := range v {
|
|
if _, ok := m[kk]; !ok {
|
|
m[kk] = make(map[string]interface{})
|
|
}
|
|
m[kk][k] = vv
|
|
}
|
|
}
|
|
return &m
|
|
}
|
|
|
|
// MergeMatrixData merges the data from data2 into data1.
|
|
//
|
|
// data1 - Pointer to the MatrixData to merge into.
|
|
// data2 - Pointer to the MatrixData to merge from.
|
|
// Returns the pointer to the merged MatrixData.
|
|
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
|
|
|
for k := range *data2 {
|
|
if _, ok := (*data1)[k]; ok {
|
|
log.Panicf("Sequence Id %s exists at least twice in the data set", k)
|
|
} else {
|
|
(*data1)[k] = (*data2)[k]
|
|
}
|
|
}
|
|
|
|
return data1
|
|
}
|
|
|
|
// Update updates the MatrixData with the given BioSequence and mapkey.
|
|
//
|
|
// Parameters:
|
|
// - s: The BioSequence object to update MatrixData with.
|
|
// - mapkey: The key to retrieve the attribute from the BioSequence object.
|
|
//
|
|
// Returns:
|
|
// - *MatrixData: The updated MatrixData object.
|
|
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData {
|
|
if v, ok := s.GetAttribute(mapkey); ok {
|
|
if obiutils.IsAMap(v) {
|
|
(*data)[s.Id()] = obiutils.MapToMapInterface(v)
|
|
} else {
|
|
log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
|
|
}
|
|
} else {
|
|
log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
|
|
}
|
|
|
|
return data
|
|
}
|
|
|
|
func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
|
|
|
nproc := obidefault.ParallelWorkers()
|
|
waiter := sync.WaitGroup{}
|
|
|
|
mapAttribute := CLIMapAttribute()
|
|
|
|
summaries := make([]*MatrixData, nproc)
|
|
|
|
ff := func(iseq obiiter.IBioSequence, summary *MatrixData) {
|
|
|
|
for iseq.Next() {
|
|
batch := iseq.Get()
|
|
for _, seq := range batch.Slice() {
|
|
summary.Update(seq, mapAttribute)
|
|
}
|
|
}
|
|
waiter.Done()
|
|
}
|
|
|
|
waiter.Add(nproc)
|
|
|
|
summaries[0] = NewMatrixData()
|
|
go ff(iterator, summaries[0])
|
|
|
|
for i := 1; i < nproc; i++ {
|
|
summaries[i] = NewMatrixData()
|
|
go ff(iterator.Split(), summaries[i])
|
|
}
|
|
|
|
waiter.Wait()
|
|
obiutils.WaitForLastPipe()
|
|
|
|
rep := summaries[0]
|
|
|
|
for i := 1; i < nproc; i++ {
|
|
rep = rep.MergeMatrixData(summaries[i])
|
|
}
|
|
|
|
return rep
|
|
}
|
|
|
|
func CLIWriteCSVToStdout(matrix *MatrixData) {
|
|
navalue := CLINaValue()
|
|
csvwriter := csv.NewWriter(os.Stdout)
|
|
|
|
if CLITranspose() {
|
|
matrix = matrix.TransposeMatrixData()
|
|
}
|
|
|
|
samples := obiutils.NewSet[string]()
|
|
|
|
for _, v := range *matrix {
|
|
samples.Add(maps.Keys(v)...)
|
|
}
|
|
|
|
osamples := samples.Members()
|
|
sort.Strings(osamples)
|
|
|
|
columns := make([]string, 1, len(osamples)+1)
|
|
columns[0] = "id"
|
|
columns = append(columns, osamples...)
|
|
|
|
csvwriter.Write(columns)
|
|
|
|
for k, data := range *matrix {
|
|
columns = columns[0:1]
|
|
columns[0] = k
|
|
for _, kk := range osamples {
|
|
if v, ok := data[kk]; ok {
|
|
vs, err := obiutils.InterfaceToString(v)
|
|
if err != nil {
|
|
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
|
}
|
|
columns = append(columns, vs)
|
|
} else {
|
|
columns = append(columns, navalue)
|
|
}
|
|
|
|
}
|
|
csvwriter.Write(columns)
|
|
}
|
|
|
|
csvwriter.Flush()
|
|
}
|
|
|
|
func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
|
|
sname := CLISampleName()
|
|
vname := CLIValueName()
|
|
csvwriter := csv.NewWriter(os.Stdout)
|
|
|
|
csvwriter.Write([]string{"id", sname, vname})
|
|
for seqid := range *matrix {
|
|
for attr, v := range (*matrix)[seqid] {
|
|
vs, err := obiutils.InterfaceToString(v)
|
|
if err != nil {
|
|
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
|
|
}
|
|
csvwriter.Write([]string{seqid, attr, vs})
|
|
}
|
|
}
|
|
|
|
csvwriter.Flush()
|
|
}
|