a first version of obisummary

Former-commit-id: cca1019d82a14a322f46a20890b996b5c7491d41
2025-06-29 16:20:46 +00:00 · 2023-11-09 22:33:06 +02:00
parent a96ecb4837
commit 5ea2b8afcf
7 changed files with 378 additions and 30 deletions
--- a/cmd/obitools/obisummary/main.go
+++ b/cmd/obitools/obisummary/main.go
@ -0,0 +1,61 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	log "github.com/sirupsen/logrus"
+	"gopkg.in/yaml.v3"
+
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obisummary"
+)
+
+func main() {
+
+	defer obiseq.LogBioSeqStatus()
+
+	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
+	// f, err := os.Create("cpu.pprof")
+	// if err != nil {
+	// 	log.Fatal(err)
+	// }
+	// pprof.StartCPUProfile(f)
+	// defer pprof.StopCPUProfile()
+
+	// go tool trace cpu.trace
+	// ftrace, err := os.Create("cpu.trace")
+	// if err != nil {
+	// 	log.Fatal(err)
+	// }
+	// trace.Start(ftrace)
+	// defer trace.Stop()
+
+	optionParser := obioptions.GenerateOptionParser(
+		obiconvert.InputOptionSet,
+		obisummary.OptionSet,
+	)
+
+	_, args := optionParser(os.Args)
+
+	fs, err := obiconvert.CLIReadBioSequences(args...)
+
+	if err != nil {
+		log.Errorf("Cannot open file (%v)", err)
+		os.Exit(1)
+	}
+
+	summary := obisummary.ISummary(fs)
+
+	if obisummary.CLIOutFormat() == "json" {
+		output, _ := json.MarshalIndent(summary, "", "  ")
+		fmt.Print(string(output))
+	} else {
+		output, _ := yaml.Marshal(summary)
+		fmt.Print(string(output))
+	}
+	fmt.Printf("\n")
+}
--- a/pkg/obiseq/attributes.go
+++ b/pkg/obiseq/attributes.go
@ -287,6 +287,21 @@ func (s *BioSequence) GetIntMap(key string) (map[string]int, bool) {
 	return val, ok
 }

+func (s *BioSequence) GetStringMap(key string) (map[string]string, bool) {
+	var val map[string]string
+
+	var err error
+
+	v, ok := s.GetAttribute(key)
+
+	if ok {
+		val, err = obiutils.InterfaceToStringMap(v)
+		ok = err == nil
+	}
+
+	return val, ok
+}
+
 // GetIntSlice returns the integer slice value associated with the given key in the BioSequence object.
 //
 // Parameters:
--- a/pkg/obiseq/subseq_test.go
+++ b/pkg/obiseq/subseq_test.go
@ -6,6 +6,18 @@ import (
 	"github.com/stretchr/testify/assert"
 )

+// TestSubsequence tests the Subsequence function.
+//
+// The function tests various cases of the Subsequence method of a BioSequence object.
+// It checks different scenarios of subsequence slicing, including both valid and invalid parameters.
+// The function is designed for unit testing purposes and uses the Go testing package.
+// It asserts that the expected subsequence is returned for each test case and checks for any errors.
+// The function also verifies the correctness of the subsequence qualities, if applicable.
+// The test cases cover both non-circular and circular subsequence slicing.
+// It ensures that the function handles different scenarios such as when `from` is greater than `to`,
+// `from` or `to` is out of bounds, and normal subsequence slicing cases.
+//
+// TestSubsequence does not return any value.
 func TestSubsequence(t *testing.T) {
 	// Test case 1: Subsequence with valid parameters and non-circular
 	seq := NewBioSequence("ID1", []byte("ATCG"), "")
--- a/pkg/obitools/obisummary/obisummary.go
+++ b/pkg/obitools/obisummary/obisummary.go
@ -0,0 +1,239 @@
+package obisummary
+
+import (
+	"sync"
+
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
+	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
+)
+
+type DataSummary struct {
+	read_count          int
+	variant_count       int
+	symbole_count       int
+	has_merged_sample   int
+	has_obiclean_status int
+	has_obiclean_weight int
+	tags                map[string]int
+	map_tags            map[string]int
+	vector_tags         map[string]int
+	samples             map[string]int
+	sample_variants     map[string]int
+	sample_singletons   map[string]int
+	sample_obiclean_bad map[string]int
+}
+
+func NewDataSummary() *DataSummary {
+	return &DataSummary{
+		read_count:          0,
+		variant_count:       0,
+		symbole_count:       0,
+		has_merged_sample:   0,
+		has_obiclean_status: 0,
+		has_obiclean_weight: 0,
+		tags:                make(map[string]int),
+		map_tags:            make(map[string]int),
+		vector_tags:         make(map[string]int),
+		samples:             make(map[string]int),
+		sample_variants:     make(map[string]int),
+		sample_singletons:   make(map[string]int),
+		sample_obiclean_bad: make(map[string]int),
+	}
+}
+
+func sumUpdateIntMap(m1, m2 map[string]int) map[string]int {
+	for k, v2 := range m2 {
+		if v1, ok := m1[k]; ok {
+			m1[k] = v1 + v2
+		} else {
+			m1[k] = v2
+		}
+	}
+
+	return m1
+}
+
+func countUpdateIntMap(m1, m2 map[string]int) map[string]int {
+	for k := range m2 {
+		if v, ok := m1[k]; ok {
+			m1[k] = v + 1
+		} else {
+			m1[k] = 1
+		}
+	}
+
+	return m1
+}
+
+func plusUpdateIntMap(m1 map[string]int, key string, val int) map[string]int {
+	if v, ok := m1[key]; ok {
+		m1[key] = v + val
+	} else {
+		m1[key] = val
+	}
+	return m1
+}
+
+func plusOneUpdateIntMap(m1 map[string]int, key string) map[string]int {
+	return plusUpdateIntMap(m1, key, 1)
+}
+
+func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary {
+	rep := NewDataSummary()
+	rep.read_count = data1.read_count + data2.read_count
+	rep.variant_count = data1.variant_count + data2.variant_count
+	rep.symbole_count = data1.symbole_count + data2.symbole_count
+	rep.has_merged_sample = data1.has_merged_sample + data2.has_merged_sample
+	rep.has_obiclean_status = data1.has_obiclean_status + data2.has_obiclean_status
+	rep.has_obiclean_weight = data1.has_obiclean_weight + data2.has_obiclean_weight
+
+	rep.tags = sumUpdateIntMap(data1.tags, data2.tags)
+	rep.map_tags = sumUpdateIntMap(data1.map_tags, data2.map_tags)
+	rep.vector_tags = sumUpdateIntMap(data1.vector_tags, data2.vector_tags)
+	rep.samples = sumUpdateIntMap(data1.samples, data2.samples)
+	rep.sample_variants = sumUpdateIntMap(data1.sample_variants, data2.sample_variants)
+	rep.sample_singletons = sumUpdateIntMap(data1.sample_singletons, data2.sample_singletons)
+	rep.sample_obiclean_bad = sumUpdateIntMap(data1.sample_obiclean_bad, data2.sample_obiclean_bad)
+
+	return rep
+}
+
+func (data *DataSummary) Update(s *obiseq.BioSequence) *DataSummary {
+	data.read_count += s.Count()
+	data.variant_count++
+	data.symbole_count += s.Len()
+
+	if s.HasAttribute("merged_sample") {
+		data.has_merged_sample++
+		samples, _ := s.GetIntMap("merged_sample")
+		obiclean, obc_ok := s.GetStringMap("obiclean_status")
+		data.samples = sumUpdateIntMap(data.samples, samples)
+		data.sample_variants = countUpdateIntMap(data.sample_variants, samples)
+		for k, v := range samples {
+			if v == 1 {
+				data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, k)
+			}
+			if v > 1 && obc_ok && obiclean[k] == "i" {
+				data.sample_obiclean_bad = plusOneUpdateIntMap(data.sample_obiclean_bad, k)
+			}
+		}
+	} else if s.HasAttribute("sample") {
+		sample, _ := s.GetStringAttribute("sample")
+		data.samples = plusUpdateIntMap(data.samples, sample, s.Count())
+		data.sample_variants = plusOneUpdateIntMap(data.sample_variants, sample)
+		if s.Count() == 1 {
+			data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, sample)
+		}
+	}
+
+	if s.HasAttribute("obiclean_status") {
+		data.has_obiclean_status++
+	}
+
+	if s.HasAttribute("obiclean_weight") {
+		data.has_obiclean_weight++
+	}
+
+	for k, v := range s.Annotations() {
+		switch {
+		case obiutils.IsAMap(v):
+			plusOneUpdateIntMap(data.map_tags, k)
+		case obiutils.IsASlice(v):
+			plusOneUpdateIntMap(data.vector_tags, k)
+		default:
+			plusOneUpdateIntMap(data.tags, k)
+		}
+	}
+
+	return data
+}
+
+func ISummary(iterator obiiter.IBioSequence) map[string]interface{} {
+
+	nproc := obioptions.CLIParallelWorkers()
+	waiter := sync.WaitGroup{}
+
+	summaries := make([]*DataSummary, nproc)
+
+	ff := func(iseq obiiter.IBioSequence, summary *DataSummary) {
+
+		for iseq.Next() {
+			batch := iseq.Get()
+			for _, seq := range batch.Slice() {
+				summary.Update(seq)
+			}
+			batch.Recycle(true)
+		}
+		waiter.Done()
+	}
+
+	waiter.Add(nproc)
+
+	summaries[0] = NewDataSummary()
+	go ff(iterator, summaries[0])
+
+	for i := 1; i < nproc; i++ {
+		summaries[i] = NewDataSummary()
+		go ff(iterator.Split(), summaries[i])
+	}
+
+	waiter.Wait()
+
+	rep := summaries[0]
+
+	for i := 1; i < nproc; i++ {
+		rep = rep.Add(summaries[i])
+	}
+
+	dict := make(map[string]interface{})
+
+	dict["count"] = map[string]interface{}{
+		"variants":     rep.variant_count,
+		"reads":        rep.read_count,
+		"total_length": rep.symbole_count,
+	}
+
+	if len(rep.tags)+len(rep.map_tags)+len(rep.vector_tags) > 0 {
+		dict["annotations"] = map[string]interface{}{
+			"scalar_attributes": len(rep.tags),
+			"map_attributes":    len(rep.map_tags),
+			"vector_attributes": len(rep.vector_tags),
+			"keys":              make(map[string]map[string]int, 3),
+		}
+
+		if len(rep.tags) > 0 {
+			((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["scalar"] = rep.tags
+		}
+
+		if len(rep.map_tags) > 0 {
+			((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["map"] = rep.map_tags
+		}
+
+		if len(rep.vector_tags) > 0 {
+			((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["vector"] = rep.vector_tags
+		}
+
+		if len(rep.samples) > 0 {
+			dict["samples"] = map[string]interface{}{
+				"sample_count": len(rep.samples),
+				"sample_stats": make(map[string]map[string]int, 2),
+			}
+
+			stats := ((dict["samples"].(map[string]interface{}))["sample_stats"].(map[string]map[string]int))
+			for k, v := range rep.samples {
+				stats[k] = map[string]int{
+					"reads":      v,
+					"variants":   rep.sample_variants[k],
+					"singletons": rep.sample_singletons[k],
+				}
+
+				if rep.variant_count == rep.has_obiclean_status {
+					stats[k]["obiclean_bad"] = rep.sample_obiclean_bad[k]
+				}
+			}
+		}
+	}
+	return dict
+}
--- a/pkg/obitools/obisummary/options.go
+++ b/pkg/obitools/obisummary/options.go
@ -0,0 +1,28 @@
+// obicount function utility package.
+//
+// The obitols/obicount package contains every
+// functions specificaly required by the obicount utility.
+package obisummary
+
+import (
+	"github.com/DavidGamba/go-getoptions"
+)
+
+var __json_output__ = false
+var __yaml_output__ = false
+
+func OptionSet(options *getoptions.GetOpt) {
+	options.BoolVar(&__json_output__, "json-output", false,
+		options.Description("Print results as JSON record."))
+
+	options.BoolVar(&__yaml_output__, "yaml-output", false,
+		options.Description("Print results as YAML record."))
+}
+
+func CLIOutFormat() string {
+	if __yaml_output__ && !__json_output__ {
+		return "yaml"
+	}
+
+	return "json"
+}
--- a/pkg/obitools/obitagpcr/pcrtag.go
+++ b/pkg/obitools/obitagpcr/pcrtag.go
@ -111,11 +111,11 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
 					}
 				} else {
 					demultiplex_error := consensus.Annotations()["demultiplex_error"]
-					if demultiplex_error!=nil {
+					if demultiplex_error != nil {
 						A.Annotations()["demultiplex_error"] = demultiplex_error.(string)
 						B.Annotations()["demultiplex_error"] = demultiplex_error.(string)
 					} else {
-						log.Panicln("@@ ",wid,"Error : ",err,*consensus)
+						log.Panicln("@@ ", wid, "Error : ", err, *consensus)
 					}
 				}
 			}
@ -160,31 +160,3 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
 	return iout

 }
-
-// if match.IsDirect {
-// 	annot["direction"] = "direct"
-// } else {
-// 	annot["direction"] = "reverse"
-// }
-
-// if match.ForwardMatch != "" {
-// 	annot["forward_match"] = match.ForwardMatch
-// 	annot["forward_mismatches"] = match.ForwardMismatches
-// 	annot["forward_tag"] = match.ForwardTag
-// }
-
-// if match.ReverseMatch != "" {
-// 	annot["reverse_match"] = match.ReverseMatch
-// 	annot["reverse_mismatches"] = match.ReverseMismatches
-// 	annot["reverse_tag"] = match.ReverseTag
-// }
-
-// if match.Error == nil {
-// 	if match.Pcr != nil {
-// 		annot["sample"] = match.Pcr.Sample
-// 		annot["experiment"] = match.Pcr.Experiment
-// 		for k, val := range match.Pcr.Annotations {
-// 			annot[k] = val
-// 		}
-// 	} else {
-// 		annot["demultiplex_error"]
--- a/pkg/obiutils/goutils.go
+++ b/pkg/obiutils/goutils.go
@ -148,6 +148,27 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
 	return
 }

+func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
+	err = nil
+
+	switch i := i.(type) {
+	case map[string]string:
+		val = i
+	case map[string]interface{}:
+		val = make(map[string]string, len(i))
+		for k, v := range i {
+			val[k], err = InterfaceToString(v)
+			if err != nil {
+				return
+			}
+		}
+	default:
+		err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
+	}
+
+	return
+}
+
 // NotABoolean defines a new type of Error : "NotAMapInt"
 type NotAMapFloat64 struct {
 	message string