a first version of obisummary

Former-commit-id: cca1019d82a14a322f46a20890b996b5c7491d41
2025-06-29 16:20:46 +00:00 · 2023-11-09 22:33:06 +02:00
parent a96ecb4837
commit 5ea2b8afcf
7 changed files with 378 additions and 30 deletions
--- a/cmd/obitools/obisummary/main.go
+++ b/cmd/obitools/obisummary/main.go
@ -0,0 +1,61 @@
 package main
 import (
 	"encoding/json"
 	"fmt"
 	"os"
 	log "github.com/sirupsen/logrus"
 	"gopkg.in/yaml.v3"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obisummary"
 )
 // main is the entry point of the obisummary command.
 //
 // It parses the command line, reads the input sequences, computes their
 // summary with obisummary.ISummary and prints it on standard output, as JSON
 // or YAML depending on obisummary.CLIOutFormat. The process exits with
 // status 1 when the input cannot be opened or the summary cannot be
 // serialized.
 func main() {
 	defer obiseq.LogBioSeqStatus()
 	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
 	// f, err := os.Create("cpu.pprof")
 	// if err != nil {
 	// 	log.Fatal(err)
 	// }
 	// pprof.StartCPUProfile(f)
 	// defer pprof.StopCPUProfile()
 	// go tool trace cpu.trace
 	// ftrace, err := os.Create("cpu.trace")
 	// if err != nil {
 	// 	log.Fatal(err)
 	// }
 	// trace.Start(ftrace)
 	// defer trace.Stop()
 	optionParser := obioptions.GenerateOptionParser(
 		obiconvert.InputOptionSet,
 		obisummary.OptionSet,
 	)
 	_, args := optionParser(os.Args)
 	fs, err := obiconvert.CLIReadBioSequences(args...)
 	if err != nil {
 		log.Errorf("Cannot open file (%v)", err)
 		os.Exit(1)
 	}
 	summary := obisummary.ISummary(fs)
 	var output []byte
 	if obisummary.CLIOutFormat() == "json" {
 		output, err = json.MarshalIndent(summary, "", "  ")
 	} else {
 		output, err = yaml.Marshal(summary)
 	}
 	// A serialization failure must not be reported as an empty, successful
 	// output.
 	if err != nil {
 		log.Errorf("Cannot serialize summary (%v)", err)
 		os.Exit(1)
 	}
 	fmt.Print(string(output))
 	fmt.Printf("\n")
 }
--- a/pkg/obiseq/attributes.go
+++ b/pkg/obiseq/attributes.go
@ -287,6 +287,21 @@ func (s *BioSequence) GetIntMap(key string) (map[string]int, bool) {
 	return val, ok
 }
 // GetStringMap returns the attribute stored under key converted to a
 // map[string]string by obiutils.InterfaceToStringMap.
 //
 // The boolean result is true only when the attribute exists and the
 // conversion reported no error.
 func (s *BioSequence) GetStringMap(key string) (map[string]string, bool) {
 	raw, found := s.GetAttribute(key)
 	if !found {
 		return nil, false
 	}
 	converted, err := obiutils.InterfaceToStringMap(raw)
 	return converted, err == nil
 }
 // GetIntSlice returns the integer slice value associated with the given key in the BioSequence object.
 //
 // Parameters:
--- a/pkg/obiseq/subseq_test.go
+++ b/pkg/obiseq/subseq_test.go
@ -6,6 +6,18 @@ import (
 	"github.com/stretchr/testify/assert"
 )
 // TestSubsequence tests the Subsequence function.
 //
 // The function tests various cases of the Subsequence method of a BioSequence object.
 // It checks different scenarios of subsequence slicing, including both valid and invalid parameters.
 // The function is designed for unit testing purposes and uses the Go testing package.
 // It asserts that the expected subsequence is returned for each test case and checks for any errors.
 // The function also verifies the correctness of the subsequence qualities, if applicable.
 // The test cases cover both non-circular and circular subsequence slicing.
 // It ensures that the function handles different scenarios such as when `from` is greater than `to`,
 // `from` or `to` is out of bounds, and normal subsequence slicing cases.
 //
 // TestSubsequence does not return any value.
 func TestSubsequence(t *testing.T) {
 	// Test case 1: Subsequence with valid parameters and non-circular
 	seq := NewBioSequence("ID1", []byte("ATCG"), "")
--- a/pkg/obitools/obisummary/obisummary.go
+++ b/pkg/obitools/obisummary/obisummary.go
@ -0,0 +1,239 @@
 package obisummary
 import (
 	"sync"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
 	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
 )
 // DataSummary accumulates the statistics collected over a set of sequences.
 type DataSummary struct {
 	// Global counters: sum of the sequence counts, number of sequences
 	// seen and sum of the sequence lengths.
 	read_count, variant_count, symbole_count int
 	// Number of sequences carrying the "merged_sample", "obiclean_status"
 	// and "obiclean_weight" attributes respectively.
 	has_merged_sample, has_obiclean_status, has_obiclean_weight int
 	// For each annotation key, number of sequences carrying it, split by
 	// the kind of value: scalar (tags), map (map_tags) or slice
 	// (vector_tags).
 	tags, map_tags, vector_tags map[string]int
 	// Per sample statistics, indexed by sample name: cumulated count,
 	// number of sequences, number of sequences with a count of one, and
 	// number of sequences with a count above one whose obiclean status for
 	// that sample is "i".
 	samples, sample_variants, sample_singletons, sample_obiclean_bad map[string]int
 }
 // NewDataSummary returns a pointer to a DataSummary whose counters are all
 // zero and whose maps are allocated and empty.
 func NewDataSummary() *DataSummary {
 	summary := new(DataSummary)
 	// Counters keep their zero value; only the maps need an allocation.
 	summary.tags = map[string]int{}
 	summary.map_tags = map[string]int{}
 	summary.vector_tags = map[string]int{}
 	summary.samples = map[string]int{}
 	summary.sample_variants = map[string]int{}
 	summary.sample_singletons = map[string]int{}
 	summary.sample_obiclean_bad = map[string]int{}
 	return summary
 }
 // sumUpdateIntMap adds each value of m2 to the entry of m1 stored under the
 // same key, creating the entry when it does not exist yet.
 //
 // m1 is modified in place and returned; m2 is left untouched.
 func sumUpdateIntMap(m1, m2 map[string]int) map[string]int {
 	for key, increment := range m2 {
 		// A missing key reads as zero, so += covers both cases.
 		m1[key] += increment
 	}
 	return m1
 }
 // countUpdateIntMap increments by one the entry of m1 for every key present
 // in m2, whatever the value associated with that key in m2. Missing entries
 // are created with the value one.
 //
 // m1 is modified in place and returned; m2 is left untouched.
 func countUpdateIntMap(m1, m2 map[string]int) map[string]int {
 	for key := range m2 {
 		m1[key]++
 	}
 	return m1
 }
 // plusUpdateIntMap adds val to the entry of m1 stored under key, creating
 // the entry when it does not exist yet.
 //
 // m1 is modified in place and returned.
 func plusUpdateIntMap(m1 map[string]int, key string, val int) map[string]int {
 	// A missing key reads as zero, so += covers both cases.
 	m1[key] += val
 	return m1
 }
 // plusOneUpdateIntMap increments by one the entry of m1 stored under key,
 // creating it with the value one when it does not exist yet.
 //
 // m1 is modified in place and returned.
 func plusOneUpdateIntMap(m1 map[string]int, key string) map[string]int {
 	m1[key]++
 	return m1
 }
 // Add returns a new DataSummary holding the sum of data1 and data2.
 //
 // Counters are added, and maps are merged key by key into the freshly
 // allocated maps of the result. Neither data1 nor data2 is modified, and the
 // returned summary shares no map with its operands.
 func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary {
 	rep := NewDataSummary()
 	rep.read_count = data1.read_count + data2.read_count
 	rep.variant_count = data1.variant_count + data2.variant_count
 	rep.symbole_count = data1.symbole_count + data2.symbole_count
 	rep.has_merged_sample = data1.has_merged_sample + data2.has_merged_sample
 	rep.has_obiclean_status = data1.has_obiclean_status + data2.has_obiclean_status
 	rep.has_obiclean_weight = data1.has_obiclean_weight + data2.has_obiclean_weight
 	// merge accumulates a then b into dst, which is always one of the empty
 	// maps owned by rep, so the operands are only read.
 	merge := func(dst, a, b map[string]int) {
 		sumUpdateIntMap(sumUpdateIntMap(dst, a), b)
 	}
 	merge(rep.tags, data1.tags, data2.tags)
 	merge(rep.map_tags, data1.map_tags, data2.map_tags)
 	merge(rep.vector_tags, data1.vector_tags, data2.vector_tags)
 	merge(rep.samples, data1.samples, data2.samples)
 	merge(rep.sample_variants, data1.sample_variants, data2.sample_variants)
 	merge(rep.sample_singletons, data1.sample_singletons, data2.sample_singletons)
 	merge(rep.sample_obiclean_bad, data1.sample_obiclean_bad, data2.sample_obiclean_bad)
 	return rep
 }
 // Update folds the sequence s into the summary and returns the summary
 // itself.
 //
 // The global counters are increased by the count and the length of s. Per
 // sample statistics are taken from the "merged_sample" attribute when s has
 // one, otherwise from the "sample" attribute when present. Every annotation
 // key of s is counted once, in the map, vector or scalar tag table depending
 // on the kind of its value.
 func (data *DataSummary) Update(s *obiseq.BioSequence) *DataSummary {
 	data.read_count += s.Count()
 	data.variant_count++
 	data.symbole_count += s.Len()
 	switch {
 	case s.HasAttribute("merged_sample"):
 		data.has_merged_sample++
 		perSample, _ := s.GetIntMap("merged_sample")
 		status, hasStatus := s.GetStringMap("obiclean_status")
 		// The helpers update their first argument in place.
 		sumUpdateIntMap(data.samples, perSample)
 		countUpdateIntMap(data.sample_variants, perSample)
 		for name, n := range perSample {
 			switch {
 			case n == 1:
 				plusOneUpdateIntMap(data.sample_singletons, name)
 			case n > 1 && hasStatus && status[name] == "i":
 				plusOneUpdateIntMap(data.sample_obiclean_bad, name)
 			}
 		}
 	case s.HasAttribute("sample"):
 		name, _ := s.GetStringAttribute("sample")
 		plusUpdateIntMap(data.samples, name, s.Count())
 		plusOneUpdateIntMap(data.sample_variants, name)
 		if s.Count() == 1 {
 			plusOneUpdateIntMap(data.sample_singletons, name)
 		}
 	}
 	if s.HasAttribute("obiclean_status") {
 		data.has_obiclean_status++
 	}
 	if s.HasAttribute("obiclean_weight") {
 		data.has_obiclean_weight++
 	}
 	for key, value := range s.Annotations() {
 		// Pick the table matching the kind of the annotation value.
 		table := data.tags
 		if obiutils.IsAMap(value) {
 			table = data.map_tags
 		} else if obiutils.IsASlice(value) {
 			table = data.vector_tags
 		}
 		plusOneUpdateIntMap(table, key)
 	}
 	return data
 }
 // ISummary consumes every sequence of iterator and returns a summary of the
 // data set as a nested map.
 //
 // The iterator is read by obioptions.CLIParallelWorkers() goroutines (at
 // least one), each filling its own DataSummary; the partial summaries are
 // then added together.
 //
 // The returned map contains:
 //   - "count": the number of variants, of reads and the total length;
 //   - "annotations": present only when at least one annotation key was
 //     seen; gives the number of scalar, map and vector keys and, under
 //     "keys", the per key occurrence counts of each non-empty kind;
 //   - "samples": present only when annotations and samples were seen; gives
 //     the sample count and, per sample, the reads, variants and singletons,
 //     plus "obiclean_bad" when every variant has an obiclean status.
 func ISummary(iterator obiiter.IBioSequence) map[string]interface{} {
 	nproc := obioptions.CLIParallelWorkers()
 	// At least one worker is needed to index summaries[0] and to consume
 	// the iterator.
 	if nproc < 1 {
 		nproc = 1
 	}
 	waiter := sync.WaitGroup{}
 	summaries := make([]*DataSummary, nproc)
 	ff := func(iseq obiiter.IBioSequence, summary *DataSummary) {
 		defer waiter.Done()
 		for iseq.Next() {
 			batch := iseq.Get()
 			for _, seq := range batch.Slice() {
 				summary.Update(seq)
 			}
 			batch.Recycle(true)
 		}
 	}
 	waiter.Add(nproc)
 	summaries[0] = NewDataSummary()
 	go ff(iterator, summaries[0])
 	for i := 1; i < nproc; i++ {
 		summaries[i] = NewDataSummary()
 		go ff(iterator.Split(), summaries[i])
 	}
 	waiter.Wait()
 	rep := summaries[0]
 	for _, partial := range summaries[1:] {
 		rep = rep.Add(partial)
 	}
 	dict := make(map[string]interface{})
 	dict["count"] = map[string]interface{}{
 		"variants":     rep.variant_count,
 		"reads":        rep.read_count,
 		"total_length": rep.symbole_count,
 	}
 	if len(rep.tags)+len(rep.map_tags)+len(rep.vector_tags) > 0 {
 		keys := make(map[string]map[string]int, 3)
 		if len(rep.tags) > 0 {
 			keys["scalar"] = rep.tags
 		}
 		if len(rep.map_tags) > 0 {
 			keys["map"] = rep.map_tags
 		}
 		if len(rep.vector_tags) > 0 {
 			keys["vector"] = rep.vector_tags
 		}
 		dict["annotations"] = map[string]interface{}{
 			"scalar_attributes": len(rep.tags),
 			"map_attributes":    len(rep.map_tags),
 			"vector_attributes": len(rep.vector_tags),
 			"keys":              keys,
 		}
 		// NOTE(review): the samples section is only emitted inside the
 		// annotations branch, as in the initial implementation — confirm
 		// this nesting is intended.
 		if len(rep.samples) > 0 {
 			// obiclean_bad is reported only when every variant carries
 			// an obiclean status.
 			withObiclean := rep.variant_count == rep.has_obiclean_status
 			stats := make(map[string]map[string]int, len(rep.samples))
 			for k, v := range rep.samples {
 				stats[k] = map[string]int{
 					"reads":      v,
 					"variants":   rep.sample_variants[k],
 					"singletons": rep.sample_singletons[k],
 				}
 				if withObiclean {
 					stats[k]["obiclean_bad"] = rep.sample_obiclean_bad[k]
 				}
 			}
 			dict["samples"] = map[string]interface{}{
 				"sample_count": len(rep.samples),
 				"sample_stats": stats,
 			}
 		}
 	}
 	return dict
 }
--- a/pkg/obitools/obisummary/options.go
+++ b/pkg/obitools/obisummary/options.go
@ -0,0 +1,28 @@
 // obisummary function utility package.
 //
 // The obitools/obisummary package contains every
 // function specifically required by the obisummary utility.
 package obisummary
 import (
 	"github.com/DavidGamba/go-getoptions"
 )
 // Output format flags; both default to false and are bound to the
 // --json-output and --yaml-output command line options by OptionSet.
 var (
 	__json_output__ = false
 	__yaml_output__ = false
 )
 // OptionSet registers the options specific to obisummary on options: the
 // boolean flags --json-output and --yaml-output, both false by default.
 func OptionSet(options *getoptions.GetOpt) {
 	jsonHelp := options.Description("Print results as JSON record.")
 	options.BoolVar(&__json_output__, "json-output", false, jsonHelp)
 	yamlHelp := options.Description("Print results as YAML record.")
 	options.BoolVar(&__yaml_output__, "yaml-output", false, yamlHelp)
 }
 func CLIOutFormat() string {
 	if __yaml_output__ && !__json_output__ {
 		return "yaml"
 	}
 	return "json"
 }
--- a/pkg/obitools/obitagpcr/pcrtag.go
+++ b/pkg/obitools/obitagpcr/pcrtag.go
@ -111,11 +111,11 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
 					}
 				} else {
 					demultiplex_error := consensus.Annotations()["demultiplex_error"]
-					if demultiplex_error!=nil {
+					if demultiplex_error != nil {
 						A.Annotations()["demultiplex_error"] = demultiplex_error.(string)
 						B.Annotations()["demultiplex_error"] = demultiplex_error.(string)
 					} else {
-						log.Panicln("@@ ",wid,"Error : ",err,*consensus)
+						log.Panicln("@@ ", wid, "Error : ", err, *consensus)
 					}
 				}
 			}
@ -160,31 +160,3 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
 	return iout
 }
 // if match.IsDirect {
 // 	annot["direction"] = "direct"
 // } else {
 // 	annot["direction"] = "reverse"
 // }
 // if match.ForwardMatch != "" {
 // 	annot["forward_match"] = match.ForwardMatch
 // 	annot["forward_mismatches"] = match.ForwardMismatches
 // 	annot["forward_tag"] = match.ForwardTag
 // }
 // if match.ReverseMatch != "" {
 // 	annot["reverse_match"] = match.ReverseMatch
 // 	annot["reverse_mismatches"] = match.ReverseMismatches
 // 	annot["reverse_tag"] = match.ReverseTag
 // }
 // if match.Error == nil {
 // 	if match.Pcr != nil {
 // 		annot["sample"] = match.Pcr.Sample
 // 		annot["experiment"] = match.Pcr.Experiment
 // 		for k, val := range match.Pcr.Annotations {
 // 			annot[k] = val
 // 		}
 // 	} else {
 // 		annot["demultiplex_error"]
--- a/pkg/obiutils/goutils.go
+++ b/pkg/obiutils/goutils.go
@ -148,6 +148,27 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
 	return
 }
 // InterfaceToStringMap converts i to a map[string]string.
 //
 // A map[string]string is returned as is, without copy. A
 // map[string]interface{} is converted value by value with InterfaceToString;
 // the first value that cannot be converted aborts the conversion and its
 // error is returned together with a nil map. Any other type yields a
 // *NotAMapInt error.
 func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
 	switch i := i.(type) {
 	case map[string]string:
 		val = i
 	case map[string]interface{}:
 		val = make(map[string]string, len(i))
 		for k, v := range i {
 			val[k], err = InterfaceToString(v)
 			if err != nil {
 				// Do not hand a partially converted map to the caller.
 				return nil, err
 			}
 		}
 	default:
 		err = &NotAMapInt{"value attribute cannot be casted to a map[string]string"}
 	}
 	return val, err
 }
 // NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
 type NotAMapFloat64 struct {
 	message string