a first version of obisummary

Former-commit-id: cca1019d82a14a322f46a20890b996b5c7491d41
This commit is contained in:
2023-11-09 22:33:06 +02:00
parent a96ecb4837
commit 5ea2b8afcf
7 changed files with 378 additions and 30 deletions

View File

@ -0,0 +1,61 @@
package main
import (
"encoding/json"
"fmt"
"os"
log "github.com/sirupsen/logrus"
"gopkg.in/yaml.v3"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obisummary"
)
// main is the entry point of the obisummary command.
//
// It parses the command line, opens the input sequences, computes their
// summary with obisummary.ISummary and prints it on standard output: as JSON
// when obisummary.CLIOutFormat() returns "json", as YAML otherwise.
// Any failure to read the input or to serialise the summary is logged and the
// program exits with status 1.
func main() {

	defer obiseq.LogBioSeqStatus()

	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	optionParser := obioptions.GenerateOptionParser(
		obiconvert.InputOptionSet,
		obisummary.OptionSet,
	)

	_, args := optionParser(os.Args)

	fs, err := obiconvert.CLIReadBioSequences(args...)

	if err != nil {
		log.Errorf("Cannot open file (%v)", err)
		os.Exit(1)
	}

	summary := obisummary.ISummary(fs)

	var output []byte
	if obisummary.CLIOutFormat() == "json" {
		output, err = json.MarshalIndent(summary, "", " ")
	} else {
		output, err = yaml.Marshal(summary)
	}

	// A serialisation failure must not be reported as an empty, successful
	// result: log it and exit with a non-zero status like the read error above.
	if err != nil {
		log.Errorf("Cannot serialize summary (%v)", err)
		os.Exit(1)
	}

	fmt.Print(string(output))
	fmt.Printf("\n")
}

View File

@ -287,6 +287,21 @@ func (s *BioSequence) GetIntMap(key string) (map[string]int, bool) {
return val, ok
}
// GetStringMap returns the attribute stored under key as a map[string]string.
//
// The boolean result is false when the attribute does not exist or when
// obiutils.InterfaceToStringMap fails to convert its value.
func (s *BioSequence) GetStringMap(key string) (map[string]string, bool) {
	raw, found := s.GetAttribute(key)
	if !found {
		return nil, false
	}

	converted, err := obiutils.InterfaceToStringMap(raw)
	return converted, err == nil
}
// GetIntSlice returns the integer slice value associated with the given key in the BioSequence object.
//
// Parameters:

View File

@ -6,6 +6,18 @@ import (
"github.com/stretchr/testify/assert"
)
// TestSubsequence tests the Subsequence function.
//
// The function tests various cases of the Subsequence method of a BioSequence object.
// It checks different scenarios of subsequence slicing, including both valid and invalid parameters.
// The function is designed for unit testing purposes and uses the Go testing package.
// It asserts that the expected subsequence is returned for each test case and checks for any errors.
// The function also verifies the correctness of the subsequence qualities, if applicable.
// The test cases cover both non-circular and circular subsequence slicing.
// It ensures that the function handles different scenarios such as when `from` is greater than `to`,
// `from` or `to` is out of bounds, and normal subsequence slicing cases.
//
// TestSubsequence does not return any value.
func TestSubsequence(t *testing.T) {
// Test case 1: Subsequence with valid parameters and non-circular
seq := NewBioSequence("ID1", []byte("ATCG"), "")

View File

@ -0,0 +1,239 @@
package obisummary
import (
"sync"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
// DataSummary accumulates the statistics gathered over a set of sequences.
type DataSummary struct {
	// Global counters: sum of the sequence counts, number of sequences
	// and sum of the sequence lengths.
	read_count, variant_count, symbole_count int

	// Number of sequences carrying the "merged_sample", "obiclean_status"
	// and "obiclean_weight" attributes respectively.
	has_merged_sample, has_obiclean_status, has_obiclean_weight int

	// Number of sequences on which each annotation key was seen, split by
	// the kind of value: neither map nor slice, map, slice.
	tags, map_tags, vector_tags map[string]int

	// Per-sample statistics, indexed by sample name: read total, number of
	// sequences, number of sequences seen exactly once in the sample, and
	// number of sequences flagged "i" in "obiclean_status" for the sample.
	samples, sample_variants, sample_singletons, sample_obiclean_bad map[string]int
}
// NewDataSummary returns an empty DataSummary: every counter is zero and
// every map is allocated and empty, so the result can be updated directly.
func NewDataSummary() *DataSummary {
	summary := new(DataSummary)

	summary.tags = map[string]int{}
	summary.map_tags = map[string]int{}
	summary.vector_tags = map[string]int{}

	summary.samples = map[string]int{}
	summary.sample_variants = map[string]int{}
	summary.sample_singletons = map[string]int{}
	summary.sample_obiclean_bad = map[string]int{}

	return summary
}
// sumUpdateIntMap adds every value of m2 to the entry of m1 having the same
// key, creating the entry when it is missing. m1 is modified in place and
// returned; m2 is left untouched.
func sumUpdateIntMap(m1, m2 map[string]int) map[string]int {
	for key, increment := range m2 {
		// A missing key reads as zero, so += covers both cases.
		m1[key] += increment
	}

	return m1
}
// countUpdateIntMap increments by one the entry of m1 for every key present
// in m2, whatever the value stored in m2. Missing entries start at one.
// m1 is modified in place and returned.
func countUpdateIntMap(m1, m2 map[string]int) map[string]int {
	for key := range m2 {
		// A missing key reads as zero, so ++ covers both cases.
		m1[key]++
	}

	return m1
}
// plusUpdateIntMap adds val to the entry of m1 stored under key, creating the
// entry when it is missing. m1 is modified in place and returned.
func plusUpdateIntMap(m1 map[string]int, key string, val int) map[string]int {
	// A missing key reads as zero, so += covers both cases.
	m1[key] += val
	return m1
}
// plusOneUpdateIntMap increments by one the entry of m1 stored under key.
// It delegates to plusUpdateIntMap and returns the map it returns.
func plusOneUpdateIntMap(m1 map[string]int, key string) map[string]int {
	const increment = 1

	return plusUpdateIntMap(m1, key, increment)
}
// Add returns a new DataSummary holding the sum of data1 and data2.
//
// Every counter of the result is the sum of the corresponding counters, and
// every map of the result holds, for each key, the sum of the values found in
// both operands. The result owns its maps: neither data1 nor data2 is
// modified, and later updates of the result do not leak into them.
func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary {
	rep := NewDataSummary()

	rep.read_count = data1.read_count + data2.read_count
	rep.variant_count = data1.variant_count + data2.variant_count
	rep.symbole_count = data1.symbole_count + data2.symbole_count
	rep.has_merged_sample = data1.has_merged_sample + data2.has_merged_sample
	rep.has_obiclean_status = data1.has_obiclean_status + data2.has_obiclean_status
	rep.has_obiclean_weight = data1.has_obiclean_weight + data2.has_obiclean_weight

	// Accumulate both operands into the maps freshly allocated by
	// NewDataSummary, instead of summing into data1's maps and aliasing them.
	merge := func(dst, a, b map[string]int) {
		sumUpdateIntMap(dst, a)
		sumUpdateIntMap(dst, b)
	}

	merge(rep.tags, data1.tags, data2.tags)
	merge(rep.map_tags, data1.map_tags, data2.map_tags)
	merge(rep.vector_tags, data1.vector_tags, data2.vector_tags)
	merge(rep.samples, data1.samples, data2.samples)
	merge(rep.sample_variants, data1.sample_variants, data2.sample_variants)
	merge(rep.sample_singletons, data1.sample_singletons, data2.sample_singletons)
	merge(rep.sample_obiclean_bad, data1.sample_obiclean_bad, data2.sample_obiclean_bad)

	return rep
}
// Update folds one sequence into the summary and returns the receiver.
//
// It adds the sequence count to read_count, increments variant_count and adds
// the sequence length to symbole_count. Per-sample statistics are taken from
// the "merged_sample" attribute when present, otherwise from the "sample"
// attribute when present. The presence of "obiclean_status" and
// "obiclean_weight" is recorded, and every annotation key is counted as a
// map, a vector or a scalar tag according to the kind of its value.
func (data *DataSummary) Update(s *obiseq.BioSequence) *DataSummary {
	data.read_count += s.Count()
	data.variant_count++
	data.symbole_count += s.Len()

	switch {
	case s.HasAttribute("merged_sample"):
		data.has_merged_sample++

		// The conversion status of merged_sample is not checked here.
		perSample, _ := s.GetIntMap("merged_sample")
		status, hasStatus := s.GetStringMap("obiclean_status")

		data.samples = sumUpdateIntMap(data.samples, perSample)
		data.sample_variants = countUpdateIntMap(data.sample_variants, perSample)

		for name, reads := range perSample {
			switch {
			case reads == 1:
				data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, name)
			case reads > 1 && hasStatus && status[name] == "i":
				data.sample_obiclean_bad = plusOneUpdateIntMap(data.sample_obiclean_bad, name)
			}
		}

	case s.HasAttribute("sample"):
		name, _ := s.GetStringAttribute("sample")

		data.samples = plusUpdateIntMap(data.samples, name, s.Count())
		data.sample_variants = plusOneUpdateIntMap(data.sample_variants, name)

		if s.Count() == 1 {
			data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, name)
		}
	}

	if s.HasAttribute("obiclean_status") {
		data.has_obiclean_status++
	}

	if s.HasAttribute("obiclean_weight") {
		data.has_obiclean_weight++
	}

	for key, value := range s.Annotations() {
		// Pick the counter matching the kind of the annotation value;
		// anything that is neither a map nor a slice counts as a scalar.
		target := data.tags
		if obiutils.IsAMap(value) {
			target = data.map_tags
		} else if obiutils.IsASlice(value) {
			target = data.vector_tags
		}

		plusOneUpdateIntMap(target, key)
	}

	return data
}
// ISummary consumes the sequences of iterator and returns a nested dictionary
// summarising them.
//
// The work is spread over obioptions.CLIParallelWorkers() goroutines, each
// one filling its own DataSummary; once all of them are done the partial
// summaries are merged with Add.
//
// The returned map always holds a "count" entry ("variants", "reads",
// "total_length"). When at least one annotation key was seen it also holds an
// "annotations" entry and, under that same condition, a "samples" entry if at
// least one sample was seen. The per-sample "obiclean_bad" value is reported
// only when every sequence carries the "obiclean_status" attribute.
func ISummary(iterator obiiter.IBioSequence) map[string]interface{} {
	workers := obioptions.CLIParallelWorkers()

	var waiter sync.WaitGroup
	partials := make([]*DataSummary, workers)

	// consume drains one branch of the iterator into its private summary.
	consume := func(branch obiiter.IBioSequence, acc *DataSummary) {
		for branch.Next() {
			batch := branch.Get()
			for _, sequence := range batch.Slice() {
				acc.Update(sequence)
			}
			batch.Recycle(true)
		}
		waiter.Done()
	}

	waiter.Add(workers)

	// The first worker reads the iterator itself, the others read a split of it.
	partials[0] = NewDataSummary()
	go consume(iterator, partials[0])

	for w := 1; w < workers; w++ {
		partials[w] = NewDataSummary()
		go consume(iterator.Split(), partials[w])
	}

	waiter.Wait()

	total := partials[0]
	for _, partial := range partials[1:] {
		total = total.Add(partial)
	}

	result := map[string]interface{}{
		"count": map[string]interface{}{
			"variants":     total.variant_count,
			"reads":        total.read_count,
			"total_length": total.symbole_count,
		},
	}

	scalarCount := len(total.tags)
	mapCount := len(total.map_tags)
	vectorCount := len(total.vector_tags)

	if scalarCount+mapCount+vectorCount == 0 {
		return result
	}

	// Only the non-empty classes of annotation keys are listed.
	keys := make(map[string]map[string]int, 3)
	if scalarCount > 0 {
		keys["scalar"] = total.tags
	}
	if mapCount > 0 {
		keys["map"] = total.map_tags
	}
	if vectorCount > 0 {
		keys["vector"] = total.vector_tags
	}

	result["annotations"] = map[string]interface{}{
		"scalar_attributes": scalarCount,
		"map_attributes":    mapCount,
		"vector_attributes": vectorCount,
		"keys":              keys,
	}

	if len(total.samples) == 0 {
		return result
	}

	withObiclean := total.variant_count == total.has_obiclean_status

	stats := make(map[string]map[string]int, 2)
	for name, reads := range total.samples {
		entry := map[string]int{
			"reads":      reads,
			"variants":   total.sample_variants[name],
			"singletons": total.sample_singletons[name],
		}
		if withObiclean {
			entry["obiclean_bad"] = total.sample_obiclean_bad[name]
		}
		stats[name] = entry
	}

	result["samples"] = map[string]interface{}{
		"sample_count": len(total.samples),
		"sample_stats": stats,
	}

	return result
}

View File

@ -0,0 +1,28 @@
// obisummary function utility package.
//
// The obitools/obisummary package contains every
// function specifically required by the obisummary utility.
package obisummary
import (
"github.com/DavidGamba/go-getoptions"
)
// Output format flags, set by the options registered in OptionSet.
var (
	// __json_output__ is bound to the "json-output" option.
	__json_output__ = false
	// __yaml_output__ is bound to the "yaml-output" option.
	__yaml_output__ = false
)
// OptionSet registers the obisummary specific options on the option parser:
// the boolean flags "json-output" and "yaml-output", both false by default.
func OptionSet(options *getoptions.GetOpt) {
	jsonHelp := options.Description("Print results as JSON record.")
	options.BoolVar(&__json_output__, "json-output", false, jsonHelp)

	yamlHelp := options.Description("Print results as YAML record.")
	options.BoolVar(&__yaml_output__, "yaml-output", false, yamlHelp)
}
func CLIOutFormat() string {
if __yaml_output__ && !__json_output__ {
return "yaml"
}
return "json"
}

View File

@ -111,11 +111,11 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
}
} else {
demultiplex_error := consensus.Annotations()["demultiplex_error"]
if demultiplex_error!=nil {
if demultiplex_error != nil {
A.Annotations()["demultiplex_error"] = demultiplex_error.(string)
B.Annotations()["demultiplex_error"] = demultiplex_error.(string)
} else {
log.Panicln("@@ ",wid,"Error : ",err,*consensus)
log.Panicln("@@ ", wid, "Error : ", err, *consensus)
}
}
}
@ -160,31 +160,3 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
return iout
}
// if match.IsDirect {
// annot["direction"] = "direct"
// } else {
// annot["direction"] = "reverse"
// }
// if match.ForwardMatch != "" {
// annot["forward_match"] = match.ForwardMatch
// annot["forward_mismatches"] = match.ForwardMismatches
// annot["forward_tag"] = match.ForwardTag
// }
// if match.ReverseMatch != "" {
// annot["reverse_match"] = match.ReverseMatch
// annot["reverse_mismatches"] = match.ReverseMismatches
// annot["reverse_tag"] = match.ReverseTag
// }
// if match.Error == nil {
// if match.Pcr != nil {
// annot["sample"] = match.Pcr.Sample
// annot["experiment"] = match.Pcr.Experiment
// for k, val := range match.Pcr.Annotations {
// annot[k] = val
// }
// } else {
// annot["demultiplex_error"]

View File

@ -148,6 +148,27 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
return
}
// InterfaceToStringMap converts an interface value to a map[string]string.
//
// A map[string]string is returned as is, without being copied. A
// map[string]interface{} is converted into a new map, every value going
// through InterfaceToString; the first value that cannot be converted aborts
// the conversion and its error is returned together with a nil map. Any other
// type yields a NotAMapInt error.
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
	switch i := i.(type) {
	case map[string]string:
		val = i
	case map[string]interface{}:
		val = make(map[string]string, len(i))
		for k, v := range i {
			val[k], err = InterfaceToString(v)
			if err != nil {
				// Do not hand back a partially converted map.
				return nil, err
			}
		}
	default:
		// The message names the type this function actually targets.
		err = &NotAMapInt{"value attribute cannot be casted to a map[string]string"}
	}

	return val, err
}
// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
type NotAMapFloat64 struct {
message string