From 29bf4ce8718f0981f3d5caf83eaa05d9e56d150f Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Fri, 17 Oct 2025 10:24:25 +0200
Subject: [PATCH] add a feature to obimatrix

adding obicsv option to obimatrix
---
Review amendments relative to the original commit (hunk sizes unchanged):
 - TransposeMatrixData: the "id" attribute of a transposed row now holds
   that row's own key (a column id of the input matrix), as its doc
   comment states, instead of an arbitrary input row id; the redundant
   `*&` is dropped.
 - Update: call s.Id() instead of storing the method value s.Id; accept
   "qualities" (the name IMatrix puts in the attribute list) as well as
   "quality".
 - CLIWriteCSVToStdout: fill each row from the full header (attribute
   names first, then sample names) instead of ranging over the sample
   list only and comparing its index with the attribute count.

 pkg/obioptions/version.go           |   2 +-
 pkg/obitools/obimatrix/obimatrix.go | 197 ++++++++++++++++++++++------
 pkg/obitools/obimatrix/options.go   |  18 ++-
 3 files changed, 175 insertions(+), 42 deletions(-)

diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go
index b326b22..7a2605a 100644
--- a/pkg/obioptions/version.go
+++ b/pkg/obioptions/version.go
@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited
 
-var _Commit = "6d204f6"
+var _Commit = "d7ed9d3"
 var _Version = "Release 4.4.0"
 
 // Version returns the version of the obitools package.
diff --git a/pkg/obitools/obimatrix/obimatrix.go b/pkg/obitools/obimatrix/obimatrix.go
index a08a31c..3ad44f8 100644
--- a/pkg/obitools/obimatrix/obimatrix.go
+++ b/pkg/obitools/obimatrix/obimatrix.go
@@ -3,6 +3,7 @@ package obimatrix
 import (
 	"encoding/csv"
 	"os"
+	"slices"
 	"sort"
 	"sync"
 
@@ -11,41 +12,55 @@ import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
 	"golang.org/x/exp/maps"
 )
 
-type MatrixData map[string]map[string]interface{}
+type MatrixData struct {
+	matrix        map[string]map[string]interface{}
+	attributes    map[string]map[string]interface{}
+	attributeList []string
+	naValue       string
+}
 
 // MakeMatrixData generates a MatrixData instance.
 //
 // No parameters.
 // Returns a MatrixData.
-func MakeMatrixData() MatrixData {
-	return make(MatrixData)
+func MakeMatrixData(naValue string, attributes ...string) MatrixData {
+	return MatrixData{
+		matrix:        make(map[string]map[string]interface{}),
+		attributes:    make(map[string]map[string]interface{}),
+		attributeList: slices.Clone(attributes),
+		naValue:       naValue,
+	}
 }
 
 // NewMatrixData creates a new instance of MatrixData.
 //
 // It does not take any parameters.
 // It returns a pointer to a MatrixData object.
-func NewMatrixData() *MatrixData {
-	m := make(MatrixData)
+func NewMatrixData(naValue string, attributes ...string) *MatrixData {
+	m := MakeMatrixData(naValue, attributes...)
 	return &m
 }
 
 // TransposeMatrixData transposes the MatrixData.
 //
 // It takes no parameters.
+// If the input matrix has attributes, they are lost.
+// A unique attribute "id" is added to store the column ids of the input matrix.
 // It returns a pointer to the transposed MatrixData.
 func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
-	m := make(MatrixData)
-	for k, v := range *matrix {
+	m := MakeMatrixData(matrix.naValue, "id")
+	for k, v := range matrix.matrix {
 		for kk, vv := range v {
-			if _, ok := m[kk]; !ok {
-				m[kk] = make(map[string]interface{})
+			if _, ok := m.matrix[kk]; !ok {
+				m.matrix[kk] = make(map[string]interface{})
 			}
-			m[kk][k] = vv
+			m.matrix[kk][k] = vv
+			m.attributes[kk] = map[string]interface{}{"id": kk} // kk is the row key of the transposed matrix
 		}
 	}
 	return &m
@@ -58,11 +73,12 @@ func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
 // Returns the pointer to the merged MatrixData.
 func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
 
-	for k := range *data2 {
-		if _, ok := (*data1)[k]; ok {
+	for k := range data2.matrix {
+		if _, ok := data1.matrix[k]; ok {
 			log.Panicf("Sequence Id %s exists at least twice in the data set", k)
 		} else {
-			(*data1)[k] = (*data2)[k]
+			data1.matrix[k] = data2.matrix[k]
+			data1.attributes[k] = data2.attributes[k]
 		}
 	}
 
@@ -77,21 +93,74 @@ func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
 //
 // Returns:
 // - *MatrixData: The updated MatrixData object.
-func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData {
+func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string, strict bool) *MatrixData {
+
+	sid := s.Id()
+
+	if _, ok := data.matrix[sid]; ok {
+		log.Panicf("Sequence Id %s exists at least twice in the data set", sid)
+	}
 	if v, ok := s.GetAttribute(mapkey); ok {
 		if m, ok := v.(*obiseq.StatsOnValues); ok {
 			m.RLock()
-			(*data)[s.Id()] = obiutils.MapToMapInterface(m.Map())
+			data.matrix[sid] = obiutils.MapToMapInterface(m.Map())
 			m.RUnlock()
 		} else if obiutils.IsAMap(v) {
-			(*data)[s.Id()] = obiutils.MapToMapInterface(v)
+			data.matrix[sid] = obiutils.MapToMapInterface(v)
 		} else {
 			log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
 		}
 	} else {
-		log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
+		if strict {
+			log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
+		}
+		data.matrix[sid] = make(map[string]interface{})
 	}
 
+	attrs := make(map[string]interface{}, len(data.attributeList))
+	for _, attrname := range data.attributeList {
+		var value interface{}
+		ok := false
+		switch attrname {
+		case "id":
+			value = s.Id()
+			ok = true
+		case "count":
+			value = s.Count()
+			ok = true
+		case "taxon":
+			taxon := s.Taxon(nil)
+			if taxon != nil {
+				value = taxon.String()
+
+			} else {
+				value = s.Taxid()
+			}
+			ok = true
+		case "sequence":
+			value = s.String()
+			ok = true
+		case "quality", "qualities": // IMatrix registers this column as "qualities"
+			if s.HasQualities() {
+				l := s.Len()
+				q := s.Qualities()
+				ascii := make([]byte, l)
+				quality_shift := obidefault.WriteQualitiesShift()
+				for j := 0; j < l; j++ {
+					ascii[j] = uint8(q[j]) + uint8(quality_shift)
+				}
+				value = string(ascii)
+				ok = true
+			}
+		default:
+			value, ok = s.GetAttribute(attrname)
+		}
+		if ok {
+			attrs[attrname] = value
+		}
+	}
+	data.attributes[sid] = attrs
+
 	return data
 }
 
@@ -101,6 +170,49 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
 	waiter := sync.WaitGroup{}
 
 	mapAttribute := CLIMapAttribute()
+	attribList := make([]string, 0)
+
+	if obicsv.CLIPrintId() {
+		attribList = append(attribList, "id")
+	}
+
+	if obicsv.CLIPrintCount() {
+		attribList = append(attribList, "count")
+	}
+
+	if obicsv.CLIPrintTaxon() {
+		attribList = append(attribList, "taxon")
+	}
+
+	if obicsv.CLIPrintDefinition() {
+		attribList = append(attribList, "definition")
+	}
+
+	if obicsv.CLIPrintSequence() {
+		attribList = append(attribList, "sequence")
+	}
+
+	if obicsv.CLIPrintQuality() {
+		attribList = append(attribList, "qualities")
+	}
+
+	attribList = append(attribList, obicsv.CLIToBeKeptAttributes()...)
+
+	if obicsv.CLIAutoColumns() {
+		if iterator.Next() {
+			batch := iterator.Get()
+			if len(batch.Slice()) == 0 {
+				log.Panicf("first batch should not be empty")
+			}
+			auto_slot := batch.Slice().AttributeKeys(true, true).Members()
+			slices.Sort(auto_slot)
+			attribList = append(attribList, auto_slot...)
+			iterator.PushBack()
+		}
+	}
+
+	naValue := obicsv.CLINAValue()
+	strict := CLIStrict()
 
 	summaries := make([]*MatrixData, nproc)
 
@@ -109,7 +221,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
 		for iseq.Next() {
 			batch := iseq.Get()
 			for _, seq := range batch.Slice() {
-				summary.Update(seq, mapAttribute)
+				summary.Update(seq, mapAttribute, strict)
 			}
 		}
 		waiter.Done()
@@ -117,11 +229,11 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
 
 	waiter.Add(nproc)
 
-	summaries[0] = NewMatrixData()
+	summaries[0] = NewMatrixData(naValue, attribList...)
 	go ff(iterator, summaries[0])
 
 	for i := 1; i < nproc; i++ {
-		summaries[i] = NewMatrixData()
+		summaries[i] = NewMatrixData(naValue, attribList...)
 		go ff(iterator.Split(), summaries[i])
 	}
 
@@ -138,7 +250,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
 }
 
 func CLIWriteCSVToStdout(matrix *MatrixData) {
-	navalue := CLINaValue()
+	navalue := CLIMapNaValue()
 	csvwriter := csv.NewWriter(os.Stdout)
 
 	if CLITranspose() {
@@ -147,33 +259,44 @@ func CLIWriteCSVToStdout(matrix *MatrixData) {
 
 	samples := obiutils.NewSet[string]()
 
-	for _, v := range *matrix {
+	for _, v := range matrix.matrix {
 		samples.Add(maps.Keys(v)...)
 	}
 
 	osamples := samples.Members()
 	sort.Strings(osamples)
 
-	columns := make([]string, 1, len(osamples)+1)
-	columns[0] = "id"
+	columns := make([]string, 0, len(osamples)+len(matrix.attributeList))
+	columns = append(columns, matrix.attributeList...)
 	columns = append(columns, osamples...)
 
 	csvwriter.Write(columns)
+	header, nattribs := slices.Clone(columns), len(matrix.attributeList) // columns is reused below as the row buffer
 
-	for k, data := range *matrix {
-		columns = columns[0:1]
-		columns[0] = k
-		for _, kk := range osamples {
-			if v, ok := data[kk]; ok {
-				vs, err := obiutils.InterfaceToString(v)
-				if err != nil {
-					log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
+	for k, data := range matrix.matrix {
+		attrs := matrix.attributes[k]
+		for i, kk := range header {
+			if i < nattribs {
+				if v, ok := attrs[kk]; ok {
+					vs, err := obiutils.InterfaceToString(v)
+					if err != nil {
+						log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
+					}
+					columns[i] = vs
+				} else {
+					columns[i] = matrix.naValue
 				}
-				columns = append(columns, vs)
 			} else {
-				columns = append(columns, navalue)
+				if v, ok := data[kk]; ok {
+					vs, err := obiutils.InterfaceToString(v)
+					if err != nil {
+						log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
+					}
+					columns[i] = vs
+				} else {
+					columns[i] = navalue
+				}
 			}
-
 		}
 		csvwriter.Write(columns)
 	}
@@ -187,8 +310,8 @@ func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
 	csvwriter := csv.NewWriter(os.Stdout)
 
 	csvwriter.Write([]string{"id", sname, vname})
-	for seqid := range *matrix {
-		for attr, v := range (*matrix)[seqid] {
+	for seqid := range matrix.matrix {
+		for attr, v := range matrix.matrix[seqid] {
 			vs, err := obiutils.InterfaceToString(v)
 			if err != nil {
 				log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
diff --git a/pkg/obitools/obimatrix/options.go b/pkg/obitools/obimatrix/options.go
index 5c2216e..881cb02 100644
--- a/pkg/obitools/obimatrix/options.go
+++ b/pkg/obitools/obimatrix/options.go
@@ -6,6 +6,7 @@ package obimatrix
 
 import (
 	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
 	"github.com/DavidGamba/go-getoptions"
 )
 
@@ -14,7 +15,8 @@ var __transpose__ = true
 var __mapAttribute__ = "merged_sample"
 var __valueName__ = "count"
 var __sampleName__ = "sample"
-var __NAValue__ = "0"
+var __MapNAValue__ = "0"
+var __AllowEmpty__ = false
 
 func MatrixOptionSet(options *getoptions.GetOpt) {
 	options.BoolVar(&__threeColumns__, "three-columns", false,
@@ -32,12 +34,16 @@ func MatrixOptionSet(options *getoptions.GetOpt) {
 	options.StringVar(&__sampleName__, "sample-name", __sampleName__,
 		options.Description("Name of the coulumn containing the sample names in the three column format."))
 
-	options.StringVar(&__NAValue__, "na-value", __NAValue__,
+	options.StringVar(&__MapNAValue__, "map-na-value", __MapNAValue__,
 		options.Description("Value used when the map attribute is not defined for a sequence."))
+
+	options.BoolVar(&__AllowEmpty__, "allow-empty", __AllowEmpty__,
+		options.Description("Allow sequences with empty map"))
 }
 
 func OptionSet(options *getoptions.GetOpt) {
 	MatrixOptionSet(options)
+	obicsv.CSVOptionSet(options)
 	obiconvert.InputOptionSet(options)
 }
 
@@ -57,8 +63,8 @@ func CLISampleName() string {
 	return __sampleName__
 }
 
-func CLINaValue() string {
-	return __NAValue__
+func CLIMapNaValue() string {
+	return __MapNAValue__
 }
 
 func CLIMapAttribute() string {
@@ -68,3 +74,7 @@ func CLIMapAttribute() string {
 func CLITranspose() bool {
 	return __transpose__
 }
+
+func CLIStrict() bool {
+	return !__AllowEmpty__
+}