From 5ea2b8afcf557b64ddd194d70a0d8c2defc83221 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 9 Nov 2023 22:33:06 +0200 Subject: [PATCH] a first version of obisummary Former-commit-id: cca1019d82a14a322f46a20890b996b5c7491d41 --- cmd/obitools/obisummary/main.go | 61 +++++++ pkg/obiseq/attributes.go | 15 ++ pkg/obiseq/subseq_test.go | 12 ++ pkg/obitools/obisummary/obisummary.go | 239 ++++++++++++++++++++++++++ pkg/obitools/obisummary/options.go | 28 +++ pkg/obitools/obitagpcr/pcrtag.go | 32 +--- pkg/obiutils/goutils.go | 21 +++ 7 files changed, 378 insertions(+), 30 deletions(-) create mode 100644 cmd/obitools/obisummary/main.go create mode 100644 pkg/obitools/obisummary/obisummary.go create mode 100644 pkg/obitools/obisummary/options.go diff --git a/cmd/obitools/obisummary/main.go b/cmd/obitools/obisummary/main.go new file mode 100644 index 0000000..ce8152c --- /dev/null +++ b/cmd/obitools/obisummary/main.go @@ -0,0 +1,61 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + + log "github.com/sirupsen/logrus" + "gopkg.in/yaml.v3" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obisummary" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser( + obiconvert.InputOptionSet, + obisummary.OptionSet, + ) + + _, args := optionParser(os.Args) + + fs, err := obiconvert.CLIReadBioSequences(args...) + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + + summary := obisummary.ISummary(fs) + + if obisummary.CLIOutFormat() == "json" { + output, _ := json.MarshalIndent(summary, "", " ") + fmt.Print(string(output)) + } else { + output, _ := yaml.Marshal(summary) + fmt.Print(string(output)) + } + fmt.Printf("\n") +} diff --git a/pkg/obiseq/attributes.go b/pkg/obiseq/attributes.go index d1cbda9..12c9ddb 100644 --- a/pkg/obiseq/attributes.go +++ b/pkg/obiseq/attributes.go @@ -287,6 +287,21 @@ func (s *BioSequence) GetIntMap(key string) (map[string]int, bool) { return val, ok } +func (s *BioSequence) GetStringMap(key string) (map[string]string, bool) { + var val map[string]string + + var err error + + v, ok := s.GetAttribute(key) + + if ok { + val, err = obiutils.InterfaceToStringMap(v) + ok = err == nil + } + + return val, ok +} + // GetIntSlice returns the integer slice value associated with the given key in the BioSequence object. // // Parameters: diff --git a/pkg/obiseq/subseq_test.go b/pkg/obiseq/subseq_test.go index ba02bd9..727ee99 100644 --- a/pkg/obiseq/subseq_test.go +++ b/pkg/obiseq/subseq_test.go @@ -6,6 +6,18 @@ import ( "github.com/stretchr/testify/assert" ) +// TestSubsequence tests the Subsequence function. +// +// The function tests various cases of the Subsequence method of a BioSequence object. +// It checks different scenarios of subsequence slicing, including both valid and invalid parameters. +// The function is designed for unit testing purposes and uses the Go testing package. +// It asserts that the expected subsequence is returned for each test case and checks for any errors. +// The function also verifies the correctness of the subsequence qualities, if applicable. +// The test cases cover both non-circular and circular subsequence slicing. +// It ensures that the function handles different scenarios such as when `from` is greater than `to`, +// `from` or `to` is out of bounds, and normal subsequence slicing cases. +// +// TestSubsequence does not return any value. func TestSubsequence(t *testing.T) { // Test case 1: Subsequence with valid parameters and non-circular seq := NewBioSequence("ID1", []byte("ATCG"), "") diff --git a/pkg/obitools/obisummary/obisummary.go b/pkg/obitools/obisummary/obisummary.go new file mode 100644 index 0000000..34cb3ed --- /dev/null +++ b/pkg/obitools/obisummary/obisummary.go @@ -0,0 +1,239 @@ +package obisummary + +import ( + "sync" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" +) + +type DataSummary struct { + read_count int + variant_count int + symbole_count int + has_merged_sample int + has_obiclean_status int + has_obiclean_weight int + tags map[string]int + map_tags map[string]int + vector_tags map[string]int + samples map[string]int + sample_variants map[string]int + sample_singletons map[string]int + sample_obiclean_bad map[string]int +} + +func NewDataSummary() *DataSummary { + return &DataSummary{ + read_count: 0, + variant_count: 0, + symbole_count: 0, + has_merged_sample: 0, + has_obiclean_status: 0, + has_obiclean_weight: 0, + tags: make(map[string]int), + map_tags: make(map[string]int), + vector_tags: make(map[string]int), + samples: make(map[string]int), + sample_variants: make(map[string]int), + sample_singletons: make(map[string]int), + sample_obiclean_bad: make(map[string]int), + } +} + +func sumUpdateIntMap(m1, m2 map[string]int) map[string]int { + for k, v2 := range m2 { + if v1, ok := m1[k]; ok { + m1[k] = v1 + v2 + } else { + m1[k] = v2 + } + } + + return m1 +} + +func countUpdateIntMap(m1, m2 map[string]int) map[string]int { + for k := range m2 { + if v, ok := m1[k]; ok { + m1[k] = v + 1 + } else { + m1[k] = 1 + } + } + + return m1 +} + +func plusUpdateIntMap(m1 map[string]int, key string, val int) map[string]int { + if v, ok := m1[key]; ok { + m1[key] = v + val + } else { + m1[key] = val + } + return m1 +} + +func plusOneUpdateIntMap(m1 map[string]int, key string) map[string]int { + return plusUpdateIntMap(m1, key, 1) +} + +func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary { + rep := NewDataSummary() + rep.read_count = data1.read_count + data2.read_count + rep.variant_count = data1.variant_count + data2.variant_count + rep.symbole_count = data1.symbole_count + data2.symbole_count + rep.has_merged_sample = data1.has_merged_sample + data2.has_merged_sample + rep.has_obiclean_status = data1.has_obiclean_status + data2.has_obiclean_status + rep.has_obiclean_weight = data1.has_obiclean_weight + data2.has_obiclean_weight + + rep.tags = sumUpdateIntMap(data1.tags, data2.tags) + rep.map_tags = sumUpdateIntMap(data1.map_tags, data2.map_tags) + rep.vector_tags = sumUpdateIntMap(data1.vector_tags, data2.vector_tags) + rep.samples = sumUpdateIntMap(data1.samples, data2.samples) + rep.sample_variants = sumUpdateIntMap(data1.sample_variants, data2.sample_variants) + rep.sample_singletons = sumUpdateIntMap(data1.sample_singletons, data2.sample_singletons) + rep.sample_obiclean_bad = sumUpdateIntMap(data1.sample_obiclean_bad, data2.sample_obiclean_bad) + + return rep +} + +func (data *DataSummary) Update(s *obiseq.BioSequence) *DataSummary { + data.read_count += s.Count() + data.variant_count++ + data.symbole_count += s.Len() + + if s.HasAttribute("merged_sample") { + data.has_merged_sample++ + samples, _ := s.GetIntMap("merged_sample") + obiclean, obc_ok := s.GetStringMap("obiclean_status") + data.samples = sumUpdateIntMap(data.samples, samples) + data.sample_variants = countUpdateIntMap(data.sample_variants, samples) + for k, v := range samples { + if v == 1 { + data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, k) + } + if v > 1 && obc_ok && obiclean[k] == "i" { + data.sample_obiclean_bad = plusOneUpdateIntMap(data.sample_obiclean_bad, k) + } + } + } else if s.HasAttribute("sample") { + sample, _ := s.GetStringAttribute("sample") + data.samples = plusUpdateIntMap(data.samples, sample, s.Count()) + data.sample_variants = plusOneUpdateIntMap(data.sample_variants, sample) + if s.Count() == 1 { + data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, sample) + } + } + + if s.HasAttribute("obiclean_status") { + data.has_obiclean_status++ + } + + if s.HasAttribute("obiclean_weight") { + data.has_obiclean_weight++ + } + + for k, v := range s.Annotations() { + switch { + case obiutils.IsAMap(v): + plusOneUpdateIntMap(data.map_tags, k) + case obiutils.IsASlice(v): + plusOneUpdateIntMap(data.vector_tags, k) + default: + plusOneUpdateIntMap(data.tags, k) + } + } + + return data +} + +func ISummary(iterator obiiter.IBioSequence) map[string]interface{} { + + nproc := obioptions.CLIParallelWorkers() + waiter := sync.WaitGroup{} + + summaries := make([]*DataSummary, nproc) + + ff := func(iseq obiiter.IBioSequence, summary *DataSummary) { + + for iseq.Next() { + batch := iseq.Get() + for _, seq := range batch.Slice() { + summary.Update(seq) + } + batch.Recycle(true) + } + waiter.Done() + } + + waiter.Add(nproc) + + summaries[0] = NewDataSummary() + go ff(iterator, summaries[0]) + + for i := 1; i < nproc; i++ { + summaries[i] = NewDataSummary() + go ff(iterator.Split(), summaries[i]) + } + + waiter.Wait() + + rep := summaries[0] + + for i := 1; i < nproc; i++ { + rep = rep.Add(summaries[i]) + } + + dict := make(map[string]interface{}) + + dict["count"] = map[string]interface{}{ + "variants": rep.variant_count, + "reads": rep.read_count, + "total_length": rep.symbole_count, + } + + if len(rep.tags)+len(rep.map_tags)+len(rep.vector_tags) > 0 { + dict["annotations"] = map[string]interface{}{ + "scalar_attributes": len(rep.tags), + "map_attributes": len(rep.map_tags), + "vector_attributes": len(rep.vector_tags), + "keys": make(map[string]map[string]int, 3), + } + + if len(rep.tags) > 0 { + ((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["scalar"] = rep.tags + } + + if len(rep.map_tags) > 0 { + ((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["map"] = rep.map_tags + } + + if len(rep.vector_tags) > 0 { + ((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["vector"] = rep.vector_tags + } + + if len(rep.samples) > 0 { + dict["samples"] = map[string]interface{}{ + "sample_count": len(rep.samples), + "sample_stats": make(map[string]map[string]int, 2), + } + + stats := ((dict["samples"].(map[string]interface{}))["sample_stats"].(map[string]map[string]int)) + for k, v := range rep.samples { + stats[k] = map[string]int{ + "reads": v, + "variants": rep.sample_variants[k], + "singletons": rep.sample_singletons[k], + } + + if rep.variant_count == rep.has_obiclean_status { + stats[k]["obiclean_bad"] = rep.sample_obiclean_bad[k] + } + } + } + } + return dict +} diff --git a/pkg/obitools/obisummary/options.go b/pkg/obitools/obisummary/options.go new file mode 100644 index 0000000..97de964 --- /dev/null +++ b/pkg/obitools/obisummary/options.go @@ -0,0 +1,28 @@ +// obicount function utility package. +// +// The obitols/obicount package contains every +// functions specificaly required by the obicount utility. +package obisummary + +import ( + "github.com/DavidGamba/go-getoptions" +) + +var __json_output__ = false +var __yaml_output__ = false + +func OptionSet(options *getoptions.GetOpt) { + options.BoolVar(&__json_output__, "json-output", false, + options.Description("Print results as JSON record.")) + + options.BoolVar(&__yaml_output__, "yaml-output", false, + options.Description("Print results as YAML record.")) +} + +func CLIOutFormat() string { + if __yaml_output__ && !__json_output__ { + return "yaml" + } + + return "json" +} diff --git a/pkg/obitools/obitagpcr/pcrtag.go b/pkg/obitools/obitagpcr/pcrtag.go index b9ed8b1..7435f13 100644 --- a/pkg/obitools/obitagpcr/pcrtag.go +++ b/pkg/obitools/obitagpcr/pcrtag.go @@ -111,11 +111,11 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, } } else { demultiplex_error := consensus.Annotations()["demultiplex_error"] - if demultiplex_error!=nil { + if demultiplex_error != nil { A.Annotations()["demultiplex_error"] = demultiplex_error.(string) B.Annotations()["demultiplex_error"] = demultiplex_error.(string) } else { - log.Panicln("@@ ",wid,"Error : ",err,*consensus) + log.Panicln("@@ ", wid, "Error : ", err, *consensus) } } } @@ -160,31 +160,3 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, return iout } - -// if match.IsDirect { -// annot["direction"] = "direct" -// } else { -// annot["direction"] = "reverse" -// } - -// if match.ForwardMatch != "" { -// annot["forward_match"] = match.ForwardMatch -// annot["forward_mismatches"] = match.ForwardMismatches -// annot["forward_tag"] = match.ForwardTag -// } - -// if match.ReverseMatch != "" { -// annot["reverse_match"] = match.ReverseMatch -// annot["reverse_mismatches"] = match.ReverseMismatches -// annot["reverse_tag"] = match.ReverseTag -// } - -// if match.Error == nil { -// if match.Pcr != nil { -// annot["sample"] = match.Pcr.Sample -// annot["experiment"] = match.Pcr.Experiment -// for k, val := range match.Pcr.Annotations { -// annot[k] = val -// } -// } else { -// annot["demultiplex_error"] diff --git a/pkg/obiutils/goutils.go b/pkg/obiutils/goutils.go index f88c54c..374573b 100644 --- a/pkg/obiutils/goutils.go +++ b/pkg/obiutils/goutils.go @@ -148,6 +148,27 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) { return } +func InterfaceToStringMap(i interface{}) (val map[string]string, err error) { + err = nil + + switch i := i.(type) { + case map[string]string: + val = i + case map[string]interface{}: + val = make(map[string]string, len(i)) + for k, v := range i { + val[k], err = InterfaceToString(v) + if err != nil { + return + } + } + default: + err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"} + } + + return +} + // NotABoolean defines a new type of Error : "NotAMapInt" type NotAMapFloat64 struct { message string