First version of obimatrix

Former-commit-id: 6e09eb0dd75bc688a6c83ef40dd88658fb1b296e
This commit is contained in:
2023-11-12 20:40:56 +01:00
parent 677775bf04
commit 8620ea1637
8 changed files with 350 additions and 12 deletions

View File

@@ -0,0 +1,195 @@
package obimatrix
import (
"encoding/csv"
log "github.com/sirupsen/logrus"
"os"
"sort"
"sync"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
"golang.org/x/exp/maps"
)
// MatrixData is a map of maps: both key levels are strings and the stored
// values are left untyped.
type MatrixData map[string]map[string]interface{}

// MakeMatrixData returns a new, empty MatrixData value.
//
// It takes no parameters; the returned map is ready to receive entries.
func MakeMatrixData() MatrixData {
	return MatrixData{}
}
// NewMatrixData allocates an empty MatrixData and returns a pointer to it.
//
// It takes no parameters.
func NewMatrixData() *MatrixData {
	matrix := MatrixData{}
	return &matrix
}
// TransposeMatrixData builds a new MatrixData in which the two key levels
// are swapped: every value stored at [outer][inner] in the receiver is
// stored at [inner][outer] in the result.
//
// The receiver itself is left untouched. A pointer to the newly built
// matrix is returned.
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
	transposed := make(MatrixData)

	for outer, row := range *matrix {
		for inner, value := range row {
			column, exists := transposed[inner]
			if !exists {
				// First value seen for this inner key: create its map.
				column = make(map[string]interface{})
				transposed[inner] = column
			}
			column[outer] = value
		}
	}

	return &transposed
}
// MergeMatrixData copies every first-level entry of data2 into data1.
//
// The inner maps are not duplicated: after the merge both matrices refer to
// the same inner map for each copied key.
//
// Parameters:
//   - data2: the MatrixData whose entries are added to the receiver.
//
// Returns the receiver (data1).
//
// The function panics if a key of data2 is already present in data1.
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
	for id, row := range *data2 {
		if _, duplicated := (*data1)[id]; duplicated {
			log.Panicf("Sequence Id %s exists at least twice in the data set", id)
		}

		(*data1)[id] = row
	}

	return data1
}
// Update stores, under the identifier of the sequence s, the map held by the
// attribute mapkey of that sequence.
//
// Parameters:
//   - s: the BioSequence providing the identifier and the attribute.
//   - mapkey: the name of the attribute to read from s.
//
// Returns the receiver, so that calls can be chained.
//
// The function panics if the attribute is missing from the sequence or if
// its value is not a map.
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData {
	value, found := s.GetAttribute(mapkey)

	if !found {
		log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
	}

	if !obiutils.IsAMap(value) {
		log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
	}

	(*data)[s.Id()] = obiutils.MapToMapInterface(value)

	return data
}
// IMatrix consumes the sequence iterator and builds the corresponding
// MatrixData.
//
// The work is shared between obioptions.CLIParallelWorkers() goroutines.
// Each goroutine fills its own MatrixData by calling Update on every
// sequence with the attribute name given by CLIMapAttribute(). Once all the
// goroutines are done, the partial matrices are merged into the first one,
// which is returned.
func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
	nworkers := obioptions.CLIParallelWorkers()

	var pending sync.WaitGroup

	attribute := CLIMapAttribute()
	partials := make([]*MatrixData, nworkers)

	// worker drains its input and records every sequence in its own partial
	// matrix, so no matrix is shared between goroutines.
	worker := func(input obiiter.IBioSequence, partial *MatrixData) {
		for input.Next() {
			batch := input.Get()
			for _, sequence := range batch.Slice() {
				partial.Update(sequence, attribute)
			}
			batch.Recycle(true)
		}
		pending.Done()
	}

	pending.Add(nworkers)

	// The first worker reads the iterator itself, the others read a split
	// of it.
	partials[0] = NewMatrixData()
	go worker(iterator, partials[0])

	for w := 1; w < nworkers; w++ {
		partials[w] = NewMatrixData()
		go worker(iterator.Split(), partials[w])
	}

	pending.Wait()
	obiiter.WaitForLastPipe()

	merged := partials[0]
	for _, partial := range partials[1:] {
		merged = merged.MergeMatrixData(partial)
	}

	return merged
}
// CLIWriteCSVToStdout writes the matrix to standard output as a CSV table.
//
// The matrix is transposed first: each output row corresponds to a
// second-level key of the matrix received as argument, and each output
// column to one of its first-level keys. The header row is "id" followed by
// the sorted column names. Cells without a value are filled with
// CLINaValue().
//
// Rows are written in sorted key order so that the output is identical from
// one run to the next (Go map iteration order is random).
//
// The function panics if a value cannot be converted to a string or if
// writing to standard output fails.
func CLIWriteCSVToStdout(matrix *MatrixData) {
	navalue := CLINaValue()
	csvwriter := csv.NewWriter(os.Stdout)

	transposed := *matrix.TransposeMatrixData()

	// Collect the union of the column names over all the rows.
	columnSet := obiutils.NewSet[string]()
	for _, row := range transposed {
		columnSet.Add(maps.Keys(row)...)
	}

	columnNames := columnSet.Members()
	sort.Strings(columnNames)

	record := make([]string, 1, len(columnNames)+1)
	record[0] = "id"
	record = append(record, columnNames...)

	if err := csvwriter.Write(record); err != nil {
		log.Panicf("cannot write the CSV header: %v", err)
	}

	rowNames := maps.Keys(transposed)
	sort.Strings(rowNames)

	for _, rowName := range rowNames {
		row := transposed[rowName]

		// Reuse the backing array of the record for every row.
		record = record[0:1]
		record[0] = rowName

		for _, columnName := range columnNames {
			value, ok := row[columnName]
			if !ok {
				record = append(record, navalue)
				continue
			}

			text, err := obiutils.InterfaceToString(value)
			if err != nil {
				// After the transposition the column name is the first-level
				// key of the input matrix (the sequence id when the matrix
				// comes from IMatrix) and the row name is the attribute key.
				log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", value, columnName, rowName)
			}

			record = append(record, text)
		}

		if err := csvwriter.Write(record); err != nil {
			log.Panicf("cannot write the CSV row %s: %v", rowName, err)
		}
	}

	csvwriter.Flush()

	// Flush does not return an error: it must be read back from the writer.
	if err := csvwriter.Error(); err != nil {
		log.Panicf("cannot write the CSV matrix: %v", err)
	}
}
// CLIWriteThreeColumnsToStdout writes the matrix to standard output as a
// three column CSV table, one line per stored value.
//
// The header row is "id", CLISampleName(), CLIValueName(). Each following
// line holds a first-level key of the matrix, a second-level key and the
// corresponding value converted to a string.
//
// Lines are written in sorted order of the first-level key, then of the
// second-level key, so that the output is identical from one run to the
// next (Go map iteration order is random).
//
// The function panics if a value cannot be converted to a string or if
// writing to standard output fails.
func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
	sname := CLISampleName()
	vname := CLIValueName()
	csvwriter := csv.NewWriter(os.Stdout)

	if err := csvwriter.Write([]string{"id", sname, vname}); err != nil {
		log.Panicf("cannot write the CSV header: %v", err)
	}

	seqids := maps.Keys(*matrix)
	sort.Strings(seqids)

	for _, seqid := range seqids {
		row := (*matrix)[seqid]

		attrs := maps.Keys(row)
		sort.Strings(attrs)

		for _, attr := range attrs {
			value := row[attr]

			text, err := obiutils.InterfaceToString(value)
			if err != nil {
				log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", value, seqid, attr)
			}

			if err := csvwriter.Write([]string{seqid, attr, text}); err != nil {
				log.Panicf("cannot write the CSV line for sequence %s: %v", seqid, err)
			}
		}
	}

	csvwriter.Flush()

	// Flush does not return an error: it must be read back from the writer.
	if err := csvwriter.Error(); err != nil {
		log.Panicf("cannot write the CSV table: %v", err)
	}
}

View File

@@ -0,0 +1,62 @@
// obimatrix function utility package.
//
// The obitools/obimatrix package contains every
// function specifically required by the obimatrix utility.
package obimatrix
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Storage for the command line options of obimatrix, initialised with their
// default values. MatrixOptionSet binds them to the option parser.
var (
	// __threeColumns__ is bound to --three-columns; CLIOutFormat returns
	// "three-columns" when it is true.
	__threeColumns__ = false

	// __mapAttribute__ is bound to --map and returned by CLIMapAttribute.
	__mapAttribute__ = "merged_sample"

	// __valueName__ is bound to --value-name and returned by CLIValueName.
	__valueName__ = "count"

	// __sampleName__ is bound to --sample-name and returned by CLISampleName.
	__sampleName__ = "sample"

	// __NAValue__ is bound to --na-value and returned by CLINaValue.
	__NAValue__ = "0"
)
// MatrixOptionSet declares the options specific to obimatrix on the given
// option parser.
//
// The declared options are --three-columns, --map, --value-name,
// --sample-name and --na-value. Each one is bound to the package level
// variable holding its value, and the current content of that variable is
// used as the default.
func MatrixOptionSet(options *getoptions.GetOpt) {
	options.BoolVar(&__threeColumns__, "three-columns", false,
		options.Description("Prints the matrix in three column format."))

	options.StringVar(&__mapAttribute__, "map", __mapAttribute__,
		options.Description("Which attribute is used to produce the matrix."))

	options.StringVar(&__valueName__, "value-name", __valueName__,
		options.Description("Name of the column containing the values in the three column format."))

	options.StringVar(&__sampleName__, "sample-name", __sampleName__,
		options.Description("Name of the column containing the sample names in the three column format."))

	options.StringVar(&__NAValue__, "na-value", __NAValue__,
		options.Description("Value used when the map attribute is not defined for a sequence."))
}
// OptionSet declares every option of the obimatrix command on the given
// option parser: it calls MatrixOptionSet and then
// obiconvert.InputOptionSet.
func OptionSet(options *getoptions.GetOpt) {
MatrixOptionSet(options)
obiconvert.InputOptionSet(options)
}
// CLIOutFormat returns the name of the selected output format:
// "three-columns" when the --three-columns option is set, "matrix"
// otherwise.
func CLIOutFormat() string {
	format := "matrix"

	if __threeColumns__ {
		format = "three-columns"
	}

	return format
}
// CLIValueName returns the value of the --value-name option
// ("count" unless changed).
func CLIValueName() string {
return __valueName__
}
// CLISampleName returns the value of the --sample-name option
// ("sample" unless changed).
func CLISampleName() string {
return __sampleName__
}
// CLINaValue returns the value of the --na-value option
// ("0" unless changed).
func CLINaValue() string {
return __NAValue__
}
// CLIMapAttribute returns the value of the --map option
// ("merged_sample" unless changed).
func CLIMapAttribute() string {
return __mapAttribute__
}

View File

@@ -180,6 +180,7 @@ func ISummary(iterator obiiter.IBioSequence) map[string]interface{} {
}
waiter.Wait()
obiiter.WaitForLastPipe()
rep := summaries[0]

View File

@@ -5,13 +5,14 @@
package obisummary
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var __json_output__ = false
var __yaml_output__ = false
func OptionSet(options *getoptions.GetOpt) {
func SummaryOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__json_output__, "json-output", false,
options.Description("Print results as JSON record."))
@@ -19,6 +20,11 @@ func OptionSet(options *getoptions.GetOpt) {
options.Description("Print results as YAML record."))
}
// OptionSet declares every option of the obisummary command on the given
// option parser: it calls SummaryOptionSet and then
// obiconvert.InputOptionSet.
func OptionSet(options *getoptions.GetOpt) {
SummaryOptionSet(options)
obiconvert.InputOptionSet(options)
}
func CLIOutFormat() string {
if __yaml_output__ && !__json_output__ {
return "yaml"