mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
a first version of obisummary
Former-commit-id: cca1019d82a14a322f46a20890b996b5c7491d41
This commit is contained in:
61
cmd/obitools/obisummary/main.go
Normal file
61
cmd/obitools/obisummary/main.go
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obisummary"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
|
||||||
|
defer obiseq.LogBioSeqStatus()
|
||||||
|
|
||||||
|
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||||
|
// f, err := os.Create("cpu.pprof")
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// }
|
||||||
|
// pprof.StartCPUProfile(f)
|
||||||
|
// defer pprof.StopCPUProfile()
|
||||||
|
|
||||||
|
// go tool trace cpu.trace
|
||||||
|
// ftrace, err := os.Create("cpu.trace")
|
||||||
|
// if err != nil {
|
||||||
|
// log.Fatal(err)
|
||||||
|
// }
|
||||||
|
// trace.Start(ftrace)
|
||||||
|
// defer trace.Stop()
|
||||||
|
|
||||||
|
optionParser := obioptions.GenerateOptionParser(
|
||||||
|
obiconvert.InputOptionSet,
|
||||||
|
obisummary.OptionSet,
|
||||||
|
)
|
||||||
|
|
||||||
|
_, args := optionParser(os.Args)
|
||||||
|
|
||||||
|
fs, err := obiconvert.CLIReadBioSequences(args...)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("Cannot open file (%v)", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
summary := obisummary.ISummary(fs)
|
||||||
|
|
||||||
|
if obisummary.CLIOutFormat() == "json" {
|
||||||
|
output, _ := json.MarshalIndent(summary, "", " ")
|
||||||
|
fmt.Print(string(output))
|
||||||
|
} else {
|
||||||
|
output, _ := yaml.Marshal(summary)
|
||||||
|
fmt.Print(string(output))
|
||||||
|
}
|
||||||
|
fmt.Printf("\n")
|
||||||
|
}
|
@ -287,6 +287,21 @@ func (s *BioSequence) GetIntMap(key string) (map[string]int, bool) {
|
|||||||
return val, ok
|
return val, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) GetStringMap(key string) (map[string]string, bool) {
|
||||||
|
var val map[string]string
|
||||||
|
|
||||||
|
var err error
|
||||||
|
|
||||||
|
v, ok := s.GetAttribute(key)
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
val, err = obiutils.InterfaceToStringMap(v)
|
||||||
|
ok = err == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return val, ok
|
||||||
|
}
|
||||||
|
|
||||||
// GetIntSlice returns the integer slice value associated with the given key in the BioSequence object.
|
// GetIntSlice returns the integer slice value associated with the given key in the BioSequence object.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
|
@ -6,6 +6,18 @@ import (
|
|||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TestSubsequence tests the Subsequence function.
|
||||||
|
//
|
||||||
|
// The function tests various cases of the Subsequence method of a BioSequence object.
|
||||||
|
// It checks different scenarios of subsequence slicing, including both valid and invalid parameters.
|
||||||
|
// The function is designed for unit testing purposes and uses the Go testing package.
|
||||||
|
// It asserts that the expected subsequence is returned for each test case and checks for any errors.
|
||||||
|
// The function also verifies the correctness of the subsequence qualities, if applicable.
|
||||||
|
// The test cases cover both non-circular and circular subsequence slicing.
|
||||||
|
// It ensures that the function handles different scenarios such as when `from` is greater than `to`,
|
||||||
|
// `from` or `to` is out of bounds, and normal subsequence slicing cases.
|
||||||
|
//
|
||||||
|
// TestSubsequence does not return any value.
|
||||||
func TestSubsequence(t *testing.T) {
|
func TestSubsequence(t *testing.T) {
|
||||||
// Test case 1: Subsequence with valid parameters and non-circular
|
// Test case 1: Subsequence with valid parameters and non-circular
|
||||||
seq := NewBioSequence("ID1", []byte("ATCG"), "")
|
seq := NewBioSequence("ID1", []byte("ATCG"), "")
|
||||||
|
239
pkg/obitools/obisummary/obisummary.go
Normal file
239
pkg/obitools/obisummary/obisummary.go
Normal file
@ -0,0 +1,239 @@
|
|||||||
|
package obisummary
|
||||||
|
|
||||||
|
import (
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DataSummary accumulates the statistics collected over a set of sequences
// by its Update method. Partial summaries are combined with Add.
type DataSummary struct {
	// Sum of the Count() of every sequence seen.
	read_count int
	// Number of sequences seen.
	variant_count int
	// Sum of the Len() of every sequence seen.
	symbole_count int

	// Number of sequences carrying a "merged_sample" attribute.
	has_merged_sample int
	// Number of sequences carrying an "obiclean_status" attribute.
	has_obiclean_status int
	// Number of sequences carrying an "obiclean_weight" attribute.
	has_obiclean_weight int

	// Per annotation key, number of sequences where the value is neither
	// a map nor a slice.
	tags map[string]int
	// Per annotation key, number of sequences where the value is a map.
	map_tags map[string]int
	// Per annotation key, number of sequences where the value is a slice.
	vector_tags map[string]int

	// Per sample, cumulated read count.
	samples map[string]int
	// Per sample, number of sequences observed in that sample.
	sample_variants map[string]int
	// Per sample, number of sequences observed exactly once in that sample.
	sample_singletons map[string]int
	// Per sample, number of sequences observed more than once in that
	// sample and whose obiclean status for the sample is "i".
	sample_obiclean_bad map[string]int
}
|
||||||
|
|
||||||
|
func NewDataSummary() *DataSummary {
|
||||||
|
return &DataSummary{
|
||||||
|
read_count: 0,
|
||||||
|
variant_count: 0,
|
||||||
|
symbole_count: 0,
|
||||||
|
has_merged_sample: 0,
|
||||||
|
has_obiclean_status: 0,
|
||||||
|
has_obiclean_weight: 0,
|
||||||
|
tags: make(map[string]int),
|
||||||
|
map_tags: make(map[string]int),
|
||||||
|
vector_tags: make(map[string]int),
|
||||||
|
samples: make(map[string]int),
|
||||||
|
sample_variants: make(map[string]int),
|
||||||
|
sample_singletons: make(map[string]int),
|
||||||
|
sample_obiclean_bad: make(map[string]int),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sumUpdateIntMap adds every value of m2 to the entry of m1 stored under
// the same key, creating the entry when m1 does not hold the key yet.
//
// m1 is modified in place and returned; m2 is only read. When m1 is nil and
// m2 holds at least one entry, a new map is allocated and returned instead
// of panicking on the first write.
func sumUpdateIntMap(m1, m2 map[string]int) map[string]int {
	if m1 == nil && len(m2) > 0 {
		m1 = make(map[string]int, len(m2))
	}

	for k, v := range m2 {
		// A missing key reads as zero, so no existence check is needed.
		m1[k] += v
	}

	return m1
}
|
||||||
|
|
||||||
|
// countUpdateIntMap increments by one the entry of m1 for every key present
// in m2, creating the entry when m1 does not hold the key yet. The values
// stored in m2 are ignored.
//
// m1 is modified in place and returned; m2 is only read. When m1 is nil and
// m2 holds at least one entry, a new map is allocated and returned instead
// of panicking on the first write.
func countUpdateIntMap(m1, m2 map[string]int) map[string]int {
	if m1 == nil && len(m2) > 0 {
		m1 = make(map[string]int, len(m2))
	}

	for k := range m2 {
		// A missing key reads as zero, so no existence check is needed.
		m1[k]++
	}

	return m1
}
|
||||||
|
|
||||||
|
// plusUpdateIntMap adds val to the entry of m1 stored under key, creating
// the entry when it does not exist yet.
//
// m1 is modified in place and returned. A nil m1 is replaced by a newly
// allocated map instead of panicking on the write.
func plusUpdateIntMap(m1 map[string]int, key string, val int) map[string]int {
	if m1 == nil {
		m1 = make(map[string]int, 1)
	}

	// A missing key reads as zero, so no existence check is needed.
	m1[key] += val

	return m1
}

// plusOneUpdateIntMap increments by one the entry of m1 stored under key.
// It is a shorthand for plusUpdateIntMap(m1, key, 1).
func plusOneUpdateIntMap(m1 map[string]int, key string) map[string]int {
	return plusUpdateIntMap(m1, key, 1)
}
|
||||||
|
|
||||||
|
func (data1 *DataSummary) Add(data2 *DataSummary) *DataSummary {
|
||||||
|
rep := NewDataSummary()
|
||||||
|
rep.read_count = data1.read_count + data2.read_count
|
||||||
|
rep.variant_count = data1.variant_count + data2.variant_count
|
||||||
|
rep.symbole_count = data1.symbole_count + data2.symbole_count
|
||||||
|
rep.has_merged_sample = data1.has_merged_sample + data2.has_merged_sample
|
||||||
|
rep.has_obiclean_status = data1.has_obiclean_status + data2.has_obiclean_status
|
||||||
|
rep.has_obiclean_weight = data1.has_obiclean_weight + data2.has_obiclean_weight
|
||||||
|
|
||||||
|
rep.tags = sumUpdateIntMap(data1.tags, data2.tags)
|
||||||
|
rep.map_tags = sumUpdateIntMap(data1.map_tags, data2.map_tags)
|
||||||
|
rep.vector_tags = sumUpdateIntMap(data1.vector_tags, data2.vector_tags)
|
||||||
|
rep.samples = sumUpdateIntMap(data1.samples, data2.samples)
|
||||||
|
rep.sample_variants = sumUpdateIntMap(data1.sample_variants, data2.sample_variants)
|
||||||
|
rep.sample_singletons = sumUpdateIntMap(data1.sample_singletons, data2.sample_singletons)
|
||||||
|
rep.sample_obiclean_bad = sumUpdateIntMap(data1.sample_obiclean_bad, data2.sample_obiclean_bad)
|
||||||
|
|
||||||
|
return rep
|
||||||
|
}
|
||||||
|
|
||||||
|
func (data *DataSummary) Update(s *obiseq.BioSequence) *DataSummary {
|
||||||
|
data.read_count += s.Count()
|
||||||
|
data.variant_count++
|
||||||
|
data.symbole_count += s.Len()
|
||||||
|
|
||||||
|
if s.HasAttribute("merged_sample") {
|
||||||
|
data.has_merged_sample++
|
||||||
|
samples, _ := s.GetIntMap("merged_sample")
|
||||||
|
obiclean, obc_ok := s.GetStringMap("obiclean_status")
|
||||||
|
data.samples = sumUpdateIntMap(data.samples, samples)
|
||||||
|
data.sample_variants = countUpdateIntMap(data.sample_variants, samples)
|
||||||
|
for k, v := range samples {
|
||||||
|
if v == 1 {
|
||||||
|
data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, k)
|
||||||
|
}
|
||||||
|
if v > 1 && obc_ok && obiclean[k] == "i" {
|
||||||
|
data.sample_obiclean_bad = plusOneUpdateIntMap(data.sample_obiclean_bad, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if s.HasAttribute("sample") {
|
||||||
|
sample, _ := s.GetStringAttribute("sample")
|
||||||
|
data.samples = plusUpdateIntMap(data.samples, sample, s.Count())
|
||||||
|
data.sample_variants = plusOneUpdateIntMap(data.sample_variants, sample)
|
||||||
|
if s.Count() == 1 {
|
||||||
|
data.sample_singletons = plusOneUpdateIntMap(data.sample_singletons, sample)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.HasAttribute("obiclean_status") {
|
||||||
|
data.has_obiclean_status++
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.HasAttribute("obiclean_weight") {
|
||||||
|
data.has_obiclean_weight++
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range s.Annotations() {
|
||||||
|
switch {
|
||||||
|
case obiutils.IsAMap(v):
|
||||||
|
plusOneUpdateIntMap(data.map_tags, k)
|
||||||
|
case obiutils.IsASlice(v):
|
||||||
|
plusOneUpdateIntMap(data.vector_tags, k)
|
||||||
|
default:
|
||||||
|
plusOneUpdateIntMap(data.tags, k)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
func ISummary(iterator obiiter.IBioSequence) map[string]interface{} {
|
||||||
|
|
||||||
|
nproc := obioptions.CLIParallelWorkers()
|
||||||
|
waiter := sync.WaitGroup{}
|
||||||
|
|
||||||
|
summaries := make([]*DataSummary, nproc)
|
||||||
|
|
||||||
|
ff := func(iseq obiiter.IBioSequence, summary *DataSummary) {
|
||||||
|
|
||||||
|
for iseq.Next() {
|
||||||
|
batch := iseq.Get()
|
||||||
|
for _, seq := range batch.Slice() {
|
||||||
|
summary.Update(seq)
|
||||||
|
}
|
||||||
|
batch.Recycle(true)
|
||||||
|
}
|
||||||
|
waiter.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
waiter.Add(nproc)
|
||||||
|
|
||||||
|
summaries[0] = NewDataSummary()
|
||||||
|
go ff(iterator, summaries[0])
|
||||||
|
|
||||||
|
for i := 1; i < nproc; i++ {
|
||||||
|
summaries[i] = NewDataSummary()
|
||||||
|
go ff(iterator.Split(), summaries[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
waiter.Wait()
|
||||||
|
|
||||||
|
rep := summaries[0]
|
||||||
|
|
||||||
|
for i := 1; i < nproc; i++ {
|
||||||
|
rep = rep.Add(summaries[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
dict := make(map[string]interface{})
|
||||||
|
|
||||||
|
dict["count"] = map[string]interface{}{
|
||||||
|
"variants": rep.variant_count,
|
||||||
|
"reads": rep.read_count,
|
||||||
|
"total_length": rep.symbole_count,
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rep.tags)+len(rep.map_tags)+len(rep.vector_tags) > 0 {
|
||||||
|
dict["annotations"] = map[string]interface{}{
|
||||||
|
"scalar_attributes": len(rep.tags),
|
||||||
|
"map_attributes": len(rep.map_tags),
|
||||||
|
"vector_attributes": len(rep.vector_tags),
|
||||||
|
"keys": make(map[string]map[string]int, 3),
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rep.tags) > 0 {
|
||||||
|
((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["scalar"] = rep.tags
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rep.map_tags) > 0 {
|
||||||
|
((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["map"] = rep.map_tags
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rep.vector_tags) > 0 {
|
||||||
|
((dict["annotations"].(map[string]interface{}))["keys"].(map[string]map[string]int))["vector"] = rep.vector_tags
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(rep.samples) > 0 {
|
||||||
|
dict["samples"] = map[string]interface{}{
|
||||||
|
"sample_count": len(rep.samples),
|
||||||
|
"sample_stats": make(map[string]map[string]int, 2),
|
||||||
|
}
|
||||||
|
|
||||||
|
stats := ((dict["samples"].(map[string]interface{}))["sample_stats"].(map[string]map[string]int))
|
||||||
|
for k, v := range rep.samples {
|
||||||
|
stats[k] = map[string]int{
|
||||||
|
"reads": v,
|
||||||
|
"variants": rep.sample_variants[k],
|
||||||
|
"singletons": rep.sample_singletons[k],
|
||||||
|
}
|
||||||
|
|
||||||
|
if rep.variant_count == rep.has_obiclean_status {
|
||||||
|
stats[k]["obiclean_bad"] = rep.sample_obiclean_bad[k]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return dict
|
||||||
|
}
|
28
pkg/obitools/obisummary/options.go
Normal file
28
pkg/obitools/obisummary/options.go
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
// obisummary function utility package.
//
// The obitools/obisummary package contains every
// function specifically required by the obisummary utility.
|
||||||
|
package obisummary
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
var __json_output__ = false
|
||||||
|
var __yaml_output__ = false
|
||||||
|
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.BoolVar(&__json_output__, "json-output", false,
|
||||||
|
options.Description("Print results as JSON record."))
|
||||||
|
|
||||||
|
options.BoolVar(&__yaml_output__, "yaml-output", false,
|
||||||
|
options.Description("Print results as YAML record."))
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIOutFormat() string {
|
||||||
|
if __yaml_output__ && !__json_output__ {
|
||||||
|
return "yaml"
|
||||||
|
}
|
||||||
|
|
||||||
|
return "json"
|
||||||
|
}
|
@ -111,11 +111,11 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
demultiplex_error := consensus.Annotations()["demultiplex_error"]
|
demultiplex_error := consensus.Annotations()["demultiplex_error"]
|
||||||
if demultiplex_error!=nil {
|
if demultiplex_error != nil {
|
||||||
A.Annotations()["demultiplex_error"] = demultiplex_error.(string)
|
A.Annotations()["demultiplex_error"] = demultiplex_error.(string)
|
||||||
B.Annotations()["demultiplex_error"] = demultiplex_error.(string)
|
B.Annotations()["demultiplex_error"] = demultiplex_error.(string)
|
||||||
} else {
|
} else {
|
||||||
log.Panicln("@@ ",wid,"Error : ",err,*consensus)
|
log.Panicln("@@ ", wid, "Error : ", err, *consensus)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -160,31 +160,3 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
|||||||
return iout
|
return iout
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// if match.IsDirect {
|
|
||||||
// annot["direction"] = "direct"
|
|
||||||
// } else {
|
|
||||||
// annot["direction"] = "reverse"
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if match.ForwardMatch != "" {
|
|
||||||
// annot["forward_match"] = match.ForwardMatch
|
|
||||||
// annot["forward_mismatches"] = match.ForwardMismatches
|
|
||||||
// annot["forward_tag"] = match.ForwardTag
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if match.ReverseMatch != "" {
|
|
||||||
// annot["reverse_match"] = match.ReverseMatch
|
|
||||||
// annot["reverse_mismatches"] = match.ReverseMismatches
|
|
||||||
// annot["reverse_tag"] = match.ReverseTag
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if match.Error == nil {
|
|
||||||
// if match.Pcr != nil {
|
|
||||||
// annot["sample"] = match.Pcr.Sample
|
|
||||||
// annot["experiment"] = match.Pcr.Experiment
|
|
||||||
// for k, val := range match.Pcr.Annotations {
|
|
||||||
// annot[k] = val
|
|
||||||
// }
|
|
||||||
// } else {
|
|
||||||
// annot["demultiplex_error"]
|
|
||||||
|
@ -148,6 +148,27 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
|
||||||
|
err = nil
|
||||||
|
|
||||||
|
switch i := i.(type) {
|
||||||
|
case map[string]string:
|
||||||
|
val = i
|
||||||
|
case map[string]interface{}:
|
||||||
|
val = make(map[string]string, len(i))
|
||||||
|
for k, v := range i {
|
||||||
|
val[k], err = InterfaceToString(v)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
|
||||||
|
}
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
|
// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
|
||||||
type NotAMapFloat64 struct {
|
type NotAMapFloat64 struct {
|
||||||
message string
|
message string
|
||||||
|
Reference in New Issue
Block a user