From 8620ea1637f71afb4f1434f2aae219c3d3b6fc2c Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sun, 12 Nov 2023 20:40:56 +0100 Subject: [PATCH] First version of obimatrix Former-commit-id: 6e09eb0dd75bc688a6c83ef40dd88658fb1b296e --- cmd/obitools/obimatrix/main.go | 56 ++++++++ cmd/obitools/obisummary/main.go | 5 +- pkg/obiformats/csv_writer.go | 7 +- pkg/obitools/obimatrix/obimatrix.go | 195 ++++++++++++++++++++++++++ pkg/obitools/obimatrix/options.go | 62 ++++++++ pkg/obitools/obisummary/obisummary.go | 1 + pkg/obitools/obisummary/options.go | 8 +- pkg/obiutils/cast_interface.go | 28 +++- 8 files changed, 350 insertions(+), 12 deletions(-) create mode 100644 cmd/obitools/obimatrix/main.go create mode 100644 pkg/obitools/obimatrix/obimatrix.go create mode 100644 pkg/obitools/obimatrix/options.go diff --git a/cmd/obitools/obimatrix/main.go b/cmd/obitools/obimatrix/main.go new file mode 100644 index 0000000..790b09e --- /dev/null +++ b/cmd/obitools/obimatrix/main.go @@ -0,0 +1,56 @@ +package main + +import ( + "fmt" + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obimatrix" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser( + obimatrix.OptionSet, + ) + + _, args := optionParser(os.Args) + + fs, err := obiconvert.CLIReadBioSequences(args...) 
+ + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + matrix := obimatrix.IMatrix(fs) + + if obimatrix.CLIOutFormat() == "matrix" { + obimatrix.CLIWriteCSVToStdout(matrix) + } else { + obimatrix.CLIWriteThreeColumnsToStdout(matrix) + } + fmt.Printf("\n") +} diff --git a/cmd/obitools/obisummary/main.go b/cmd/obitools/obisummary/main.go index ce8152c..ad13d78 100644 --- a/cmd/obitools/obisummary/main.go +++ b/cmd/obitools/obisummary/main.go @@ -34,10 +34,7 @@ func main() { // trace.Start(ftrace) // defer trace.Stop() - optionParser := obioptions.GenerateOptionParser( - obiconvert.InputOptionSet, - obisummary.OptionSet, - ) + optionParser := obioptions.GenerateOptionParser(obisummary.OptionSet) _, args := optionParser(os.Args) diff --git a/pkg/obiformats/csv_writer.go b/pkg/obiformats/csv_writer.go index d055043..ab33f11 100644 --- a/pkg/obiformats/csv_writer.go +++ b/pkg/obiformats/csv_writer.go @@ -197,7 +197,6 @@ func WriteCSV(iterator obiiter.IBioSequence, log.Debugln("End of the CSV file writing") obiiter.UnregisterPipe() waitWriter.Done() - }() if opt.pointer.csv_auto { @@ -205,11 +204,7 @@ func WriteCSV(iterator obiiter.IBioSequence, batch := iterator.Get() auto_slot = batch.Slice().AttributeKeys(true) CSVKeys(auto_slot.Members())(opt) - chunkchan <- FileChunck{ - FormatCVSBatch(batch, opt), - batch.Order(), - } - newIter.Push(batch) + iterator.PushBack() } } diff --git a/pkg/obitools/obimatrix/obimatrix.go b/pkg/obitools/obimatrix/obimatrix.go new file mode 100644 index 0000000..4b5ee6b --- /dev/null +++ b/pkg/obitools/obimatrix/obimatrix.go @@ -0,0 +1,195 @@ +package obimatrix + +import ( + "encoding/csv" + log "github.com/sirupsen/logrus" + "os" + "sort" + "sync" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" + "golang.org/x/exp/maps" 
+) + +type MatrixData map[string]map[string]interface{} + +// MakeMatrixData generates a MatrixData instance. +// +// No parameters. +// Returns a MatrixData. +func MakeMatrixData() MatrixData { + return make(MatrixData) +} + +// NewMatrixData creates a new instance of MatrixData. +// +// It does not take any parameters. +// It returns a pointer to a MatrixData object. +func NewMatrixData() *MatrixData { + m := make(MatrixData) + return &m +} + +// TransposeMatrixData transposes the MatrixData. +// +// It takes no parameters. +// It returns a pointer to the transposed MatrixData. +func (matrix *MatrixData) TransposeMatrixData() *MatrixData { + m := make(MatrixData) + for k, v := range *matrix { + for kk, vv := range v { + if _, ok := m[kk]; !ok { + m[kk] = make(map[string]interface{}) + } + m[kk][k] = vv + } + } + return &m +} + +// MergeMatrixData merges the data from data2 into data1. +// +// data1 - Pointer to the MatrixData to merge into. +// data2 - Pointer to the MatrixData to merge from. +// Returns the pointer to the merged MatrixData. +func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData { + + for k := range *data2 { + if _, ok := (*data1)[k]; ok { + log.Panicf("Sequence Id %s exists at least twice in the data set", k) + } else { + (*data1)[k] = (*data2)[k] + } + } + + return data1 +} + +// Update updates the MatrixData with the given BioSequence and mapkey. +// +// Parameters: +// - s: The BioSequence object to update MatrixData with. +// - mapkey: The key to retrieve the attribute from the BioSequence object. +// +// Returns: +// - *MatrixData: The updated MatrixData object. 
+func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData { + if v, ok := s.GetAttribute(mapkey); ok { + if obiutils.IsAMap(v) { + (*data)[s.Id()] = obiutils.MapToMapInterface(v) + } else { + log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id()) + } + } else { + log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id()) + } + + return data +} + +func IMatrix(iterator obiiter.IBioSequence) *MatrixData { + + nproc := obioptions.CLIParallelWorkers() + waiter := sync.WaitGroup{} + + mapAttribute := CLIMapAttribute() + + summaries := make([]*MatrixData, nproc) + + ff := func(iseq obiiter.IBioSequence, summary *MatrixData) { + + for iseq.Next() { + batch := iseq.Get() + for _, seq := range batch.Slice() { + summary.Update(seq, mapAttribute) + } + batch.Recycle(true) + } + waiter.Done() + } + + waiter.Add(nproc) + + summaries[0] = NewMatrixData() + go ff(iterator, summaries[0]) + + for i := 1; i < nproc; i++ { + summaries[i] = NewMatrixData() + go ff(iterator.Split(), summaries[i]) + } + + waiter.Wait() + obiiter.WaitForLastPipe() + + rep := summaries[0] + + for i := 1; i < nproc; i++ { + rep = rep.MergeMatrixData(summaries[i]) + } + + return rep +} + +func CLIWriteCSVToStdout(matrix *MatrixData) { + navalue := CLINaValue() + csvwriter := csv.NewWriter(os.Stdout) + + matrix = matrix.TransposeMatrixData() + + samples := obiutils.NewSet[string]() + + for _, v := range *matrix { + samples.Add(maps.Keys(v)...) + } + + osamples := samples.Members() + sort.Strings(osamples) + + columns := make([]string, 1, len(osamples)+1) + columns[0] = "id" + columns = append(columns, osamples...) 
+ + csvwriter.Write(columns) + + for k, data := range *matrix { + columns = columns[0:1] + columns[0] = k + for _, kk := range osamples { + if v, ok := data[kk]; ok { + vs, err := obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk) + } + columns = append(columns, vs) + } else { + columns = append(columns, navalue) + } + + } + csvwriter.Write(columns) + } + + csvwriter.Flush() +} + +func CLIWriteThreeColumnsToStdout(matrix *MatrixData) { + sname := CLISampleName() + vname := CLIValueName() + csvwriter := csv.NewWriter(os.Stdout) + + csvwriter.Write([]string{"id", sname, vname}) + for seqid := range *matrix { + for attr, v := range (*matrix)[seqid] { + vs, err := obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr) + } + csvwriter.Write([]string{seqid, attr, vs}) + } + } + + csvwriter.Flush() +} diff --git a/pkg/obitools/obimatrix/options.go b/pkg/obitools/obimatrix/options.go new file mode 100644 index 0000000..e91ccb3 --- /dev/null +++ b/pkg/obitools/obimatrix/options.go @@ -0,0 +1,62 @@ +// obimatrix function utility package. +// +// The obitools/obimatrix package contains every +// function specifically required by the obimatrix utility.
+package obimatrix + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var __threeColumns__ = false +var __mapAttribute__ = "merged_sample" +var __valueName__ = "count" +var __sampleName__ = "sample" +var __NAValue__ = "0" + +func MatrixOptionSet(options *getoptions.GetOpt) { + options.BoolVar(&__threeColumns__, "three-columns", false, + options.Description("Printouts the matrix in tree column format.")) + + options.StringVar(&__mapAttribute__, "map", __mapAttribute__, + options.Description("Which attribute is usd to produce th matrix.")) + + options.StringVar(&__valueName__, "value-name", __valueName__, + options.Description("Name of the coulumn containing the values in the three column format.")) + + options.StringVar(&__sampleName__, "sample-name", __sampleName__, + options.Description("Name of the coulumn containing the sample names in the three column format.")) + + options.StringVar(&__NAValue__, "na-value", __NAValue__, + options.Description("Value used when the map attribute is not defined for a sequence.")) +} + +func OptionSet(options *getoptions.GetOpt) { + MatrixOptionSet(options) + obiconvert.InputOptionSet(options) +} + +func CLIOutFormat() string { + if __threeColumns__ { + return "three-columns" + } + + return "matrix" +} + +func CLIValueName() string { + return __valueName__ +} + +func CLISampleName() string { + return __sampleName__ +} + +func CLINaValue() string { + return __NAValue__ +} + +func CLIMapAttribute() string { + return __mapAttribute__ +} diff --git a/pkg/obitools/obisummary/obisummary.go b/pkg/obitools/obisummary/obisummary.go index 34cb3ed..0b0eb82 100644 --- a/pkg/obitools/obisummary/obisummary.go +++ b/pkg/obitools/obisummary/obisummary.go @@ -180,6 +180,7 @@ func ISummary(iterator obiiter.IBioSequence) map[string]interface{} { } waiter.Wait() + obiiter.WaitForLastPipe() rep := summaries[0] diff --git a/pkg/obitools/obisummary/options.go 
b/pkg/obitools/obisummary/options.go index 97de964..c9b8b85 100644 --- a/pkg/obitools/obisummary/options.go +++ b/pkg/obitools/obisummary/options.go @@ -5,13 +5,14 @@ package obisummary import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" "github.com/DavidGamba/go-getoptions" ) var __json_output__ = false var __yaml_output__ = false -func OptionSet(options *getoptions.GetOpt) { +func SummaryOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__json_output__, "json-output", false, options.Description("Print results as JSON record.")) @@ -19,6 +20,11 @@ func OptionSet(options *getoptions.GetOpt) { options.Description("Print results as YAML record.")) } +func OptionSet(options *getoptions.GetOpt) { + SummaryOptionSet(options) + obiconvert.InputOptionSet(options) +} + func CLIOutFormat() string { if __yaml_output__ && !__json_output__ { return "yaml" diff --git a/pkg/obiutils/cast_interface.go b/pkg/obiutils/cast_interface.go index 9244f6a..98319cf 100644 --- a/pkg/obiutils/cast_interface.go +++ b/pkg/obiutils/cast_interface.go @@ -1,6 +1,10 @@ package obiutils -import "fmt" +import ( + "fmt" + "log" + "reflect" +) // InterfaceToString converts an interface value to a string. // @@ -64,3 +68,25 @@ func InterfaceToBool(i interface{}) (val bool, err error) { } return } + +// MapToMapInterface converts a map to a map of type map[string]interface{}. +// +// It takes an interface{} parameter `m` which represents the map to be converted. +// +// It returns a map[string]interface{} which is the converted map. If the input is not a map (as reported by IsAMap), +// it logs an error message and panics.
+func MapToMapInterface(m interface{}) map[string]interface{} {
+	if !IsAMap(m) {
+		log.Panic("Invalid map type")
+	}
+	src := reflect.ValueOf(m)
+	val := make(map[string]interface{}, src.Len())
+	for _, key := range src.MapKeys() {
+		name := key.String() // exact only when the key kind is string
+		if key.Kind() != reflect.String {
+			name = fmt.Sprint(key.Interface()) // String() would yield "<T Value>"
+		}
+		val[name] = src.MapIndex(key).Interface()
+	}
+	return val
+}