mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-07 16:20:27 +00:00
add a feature to obimatrix adding obicsv option to obimatrix
This commit is contained in:
@@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
|
||||
var _Commit = "6d204f6"
|
||||
var _Commit = "d7ed9d3"
|
||||
var _Version = "Release 4.4.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
||||
@@ -3,6 +3,7 @@ package obimatrix
|
||||
import (
|
||||
"encoding/csv"
|
||||
"os"
|
||||
"slices"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
@@ -11,41 +12,55 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"golang.org/x/exp/maps"
|
||||
)
|
||||
|
||||
type MatrixData map[string]map[string]interface{}
|
||||
type MatrixData struct {
|
||||
matrix map[string]map[string]interface{}
|
||||
attributes map[string]map[string]interface{}
|
||||
attributeList []string
|
||||
naValue string
|
||||
}
|
||||
|
||||
// MakeMatrixData generates a MatrixData instance.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns a MatrixData.
|
||||
func MakeMatrixData() MatrixData {
|
||||
return make(MatrixData)
|
||||
func MakeMatrixData(naValue string, attributes ...string) MatrixData {
|
||||
return MatrixData{
|
||||
matrix: make(map[string]map[string]interface{}),
|
||||
attributes: make(map[string]map[string]interface{}),
|
||||
attributeList: slices.Clone(attributes),
|
||||
naValue: naValue,
|
||||
}
|
||||
}
|
||||
|
||||
// NewMatrixData creates a new instance of MatrixData.
|
||||
//
|
||||
// It does not take any parameters.
|
||||
// It returns a pointer to a MatrixData object.
|
||||
func NewMatrixData() *MatrixData {
|
||||
m := make(MatrixData)
|
||||
func NewMatrixData(naValue string, attributes ...string) *MatrixData {
|
||||
m := MakeMatrixData(naValue, attributes...)
|
||||
return &m
|
||||
}
|
||||
|
||||
// TransposeMatrixData transposes the MatrixData.
|
||||
//
|
||||
// It takes no parameters.
|
||||
// If the input matrix has attributes, they are lost.
|
||||
// A unique attribute "id" is added to store the column ids of the input matrix.
|
||||
// It returns a pointer to the transposed MatrixData.
|
||||
func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
||||
m := make(MatrixData)
|
||||
for k, v := range *matrix {
|
||||
m := MakeMatrixData(matrix.naValue, "id")
|
||||
for k, v := range *&matrix.matrix {
|
||||
for kk, vv := range v {
|
||||
if _, ok := m[kk]; !ok {
|
||||
m[kk] = make(map[string]interface{})
|
||||
if _, ok := m.matrix[kk]; !ok {
|
||||
m.matrix[kk] = make(map[string]interface{})
|
||||
}
|
||||
m[kk][k] = vv
|
||||
m.matrix[kk][k] = vv
|
||||
m.attributes[kk] = map[string]interface{}{"id": k}
|
||||
}
|
||||
}
|
||||
return &m
|
||||
@@ -58,11 +73,12 @@ func (matrix *MatrixData) TransposeMatrixData() *MatrixData {
|
||||
// Returns the pointer to the merged MatrixData.
|
||||
func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
||||
|
||||
for k := range *data2 {
|
||||
if _, ok := (*data1)[k]; ok {
|
||||
for k := range data2.matrix {
|
||||
if _, ok := data1.matrix[k]; ok {
|
||||
log.Panicf("Sequence Id %s exists at least twice in the data set", k)
|
||||
} else {
|
||||
(*data1)[k] = (*data2)[k]
|
||||
data1.matrix[k] = data2.matrix[k]
|
||||
data1.attributes[k] = data2.attributes[k]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,21 +93,74 @@ func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData {
|
||||
//
|
||||
// Returns:
|
||||
// - *MatrixData: The updated MatrixData object.
|
||||
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData {
|
||||
func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string, strict bool) *MatrixData {
|
||||
|
||||
sid := s.Id()
|
||||
|
||||
if _, ok := data.matrix[sid]; ok {
|
||||
log.Panicf("Sequence Id %s exists at least twice in the data set", sid)
|
||||
}
|
||||
if v, ok := s.GetAttribute(mapkey); ok {
|
||||
if m, ok := v.(*obiseq.StatsOnValues); ok {
|
||||
m.RLock()
|
||||
(*data)[s.Id()] = obiutils.MapToMapInterface(m.Map())
|
||||
data.matrix[sid] = obiutils.MapToMapInterface(m.Map())
|
||||
m.RUnlock()
|
||||
} else if obiutils.IsAMap(v) {
|
||||
(*data)[s.Id()] = obiutils.MapToMapInterface(v)
|
||||
data.matrix[sid] = obiutils.MapToMapInterface(v)
|
||||
} else {
|
||||
log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id())
|
||||
}
|
||||
} else {
|
||||
log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
|
||||
if strict {
|
||||
log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id())
|
||||
}
|
||||
data.matrix[sid] = make(map[string]interface{})
|
||||
}
|
||||
|
||||
attrs := make(map[string]interface{}, len(data.attributeList))
|
||||
for _, attrname := range data.attributeList {
|
||||
var value interface{}
|
||||
ok := false
|
||||
switch attrname {
|
||||
case "id":
|
||||
value = s.Id
|
||||
ok = true
|
||||
case "count":
|
||||
value = s.Count()
|
||||
ok = true
|
||||
case "taxon":
|
||||
taxon := s.Taxon(nil)
|
||||
if taxon != nil {
|
||||
value = taxon.String()
|
||||
|
||||
} else {
|
||||
value = s.Taxid()
|
||||
}
|
||||
ok = true
|
||||
case "sequence":
|
||||
value = s.String()
|
||||
ok = true
|
||||
case "quality":
|
||||
if s.HasQualities() {
|
||||
l := s.Len()
|
||||
q := s.Qualities()
|
||||
ascii := make([]byte, l)
|
||||
quality_shift := obidefault.WriteQualitiesShift()
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
value = string(ascii)
|
||||
ok = true
|
||||
}
|
||||
default:
|
||||
value, ok = s.GetAttribute(attrname)
|
||||
}
|
||||
if ok {
|
||||
attrs[attrname] = value
|
||||
}
|
||||
}
|
||||
data.attributes[sid] = attrs
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
@@ -101,6 +170,49 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
||||
waiter := sync.WaitGroup{}
|
||||
|
||||
mapAttribute := CLIMapAttribute()
|
||||
attribList := make([]string, 0)
|
||||
|
||||
if obicsv.CLIPrintId() {
|
||||
attribList = append(attribList, "id")
|
||||
}
|
||||
|
||||
if obicsv.CLIPrintCount() {
|
||||
attribList = append(attribList, "count")
|
||||
}
|
||||
|
||||
if obicsv.CLIPrintTaxon() {
|
||||
attribList = append(attribList, "taxon")
|
||||
}
|
||||
|
||||
if obicsv.CLIPrintDefinition() {
|
||||
attribList = append(attribList, "definition")
|
||||
}
|
||||
|
||||
if obicsv.CLIPrintSequence() {
|
||||
attribList = append(attribList, "sequence")
|
||||
}
|
||||
|
||||
if obicsv.CLIPrintQuality() {
|
||||
attribList = append(attribList, "qualities")
|
||||
}
|
||||
|
||||
attribList = append(attribList, obicsv.CLIToBeKeptAttributes()...)
|
||||
|
||||
if obicsv.CLIAutoColumns() {
|
||||
if iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
if len(batch.Slice()) == 0 {
|
||||
log.Panicf("first batch should not be empty")
|
||||
}
|
||||
auto_slot := batch.Slice().AttributeKeys(true, true).Members()
|
||||
slices.Sort(auto_slot)
|
||||
attribList = append(attribList, auto_slot...)
|
||||
iterator.PushBack()
|
||||
}
|
||||
}
|
||||
|
||||
naValue := obicsv.CLINAValue()
|
||||
strict := CLIStrict()
|
||||
|
||||
summaries := make([]*MatrixData, nproc)
|
||||
|
||||
@@ -109,7 +221,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
||||
for iseq.Next() {
|
||||
batch := iseq.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
summary.Update(seq, mapAttribute)
|
||||
summary.Update(seq, mapAttribute, strict)
|
||||
}
|
||||
}
|
||||
waiter.Done()
|
||||
@@ -117,11 +229,11 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
||||
|
||||
waiter.Add(nproc)
|
||||
|
||||
summaries[0] = NewMatrixData()
|
||||
summaries[0] = NewMatrixData(naValue, attribList...)
|
||||
go ff(iterator, summaries[0])
|
||||
|
||||
for i := 1; i < nproc; i++ {
|
||||
summaries[i] = NewMatrixData()
|
||||
summaries[i] = NewMatrixData(naValue, attribList...)
|
||||
go ff(iterator.Split(), summaries[i])
|
||||
}
|
||||
|
||||
@@ -138,7 +250,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData {
|
||||
}
|
||||
|
||||
func CLIWriteCSVToStdout(matrix *MatrixData) {
|
||||
navalue := CLINaValue()
|
||||
navalue := CLIMapNaValue()
|
||||
csvwriter := csv.NewWriter(os.Stdout)
|
||||
|
||||
if CLITranspose() {
|
||||
@@ -147,33 +259,44 @@ func CLIWriteCSVToStdout(matrix *MatrixData) {
|
||||
|
||||
samples := obiutils.NewSet[string]()
|
||||
|
||||
for _, v := range *matrix {
|
||||
for _, v := range matrix.matrix {
|
||||
samples.Add(maps.Keys(v)...)
|
||||
}
|
||||
|
||||
osamples := samples.Members()
|
||||
sort.Strings(osamples)
|
||||
|
||||
columns := make([]string, 1, len(osamples)+1)
|
||||
columns[0] = "id"
|
||||
columns := make([]string, 0, len(osamples)+len(matrix.attributeList))
|
||||
columns = append(columns, matrix.attributeList...)
|
||||
columns = append(columns, osamples...)
|
||||
|
||||
csvwriter.Write(columns)
|
||||
nattribs := len(matrix.attributeList)
|
||||
|
||||
for k, data := range *matrix {
|
||||
columns = columns[0:1]
|
||||
columns[0] = k
|
||||
for _, kk := range osamples {
|
||||
if v, ok := data[kk]; ok {
|
||||
vs, err := obiutils.InterfaceToString(v)
|
||||
if err != nil {
|
||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
||||
for k, data := range matrix.matrix {
|
||||
attrs := matrix.attributes[k]
|
||||
for i, kk := range osamples {
|
||||
if i < nattribs {
|
||||
if v, ok := attrs[kk]; ok {
|
||||
vs, err := obiutils.InterfaceToString(v)
|
||||
if err != nil {
|
||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
||||
}
|
||||
columns[i] = vs
|
||||
} else {
|
||||
columns[i] = matrix.naValue
|
||||
}
|
||||
columns = append(columns, vs)
|
||||
} else {
|
||||
columns = append(columns, navalue)
|
||||
if v, ok := data[kk]; ok {
|
||||
vs, err := obiutils.InterfaceToString(v)
|
||||
if err != nil {
|
||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk)
|
||||
}
|
||||
columns[i] = vs
|
||||
} else {
|
||||
columns[i] = navalue
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
csvwriter.Write(columns)
|
||||
}
|
||||
@@ -187,8 +310,8 @@ func CLIWriteThreeColumnsToStdout(matrix *MatrixData) {
|
||||
csvwriter := csv.NewWriter(os.Stdout)
|
||||
|
||||
csvwriter.Write([]string{"id", sname, vname})
|
||||
for seqid := range *matrix {
|
||||
for attr, v := range (*matrix)[seqid] {
|
||||
for seqid := range matrix.matrix {
|
||||
for attr, v := range matrix.matrix[seqid] {
|
||||
vs, err := obiutils.InterfaceToString(v)
|
||||
if err != nil {
|
||||
log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr)
|
||||
|
||||
@@ -6,6 +6,7 @@ package obimatrix
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
@@ -14,7 +15,8 @@ var __transpose__ = true
|
||||
var __mapAttribute__ = "merged_sample"
|
||||
var __valueName__ = "count"
|
||||
var __sampleName__ = "sample"
|
||||
var __NAValue__ = "0"
|
||||
var __MapNAValue__ = "0"
|
||||
var __AllowEmpty__ = false
|
||||
|
||||
func MatrixOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__threeColumns__, "three-columns", false,
|
||||
@@ -32,12 +34,16 @@ func MatrixOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&__sampleName__, "sample-name", __sampleName__,
|
||||
options.Description("Name of the coulumn containing the sample names in the three column format."))
|
||||
|
||||
options.StringVar(&__NAValue__, "na-value", __NAValue__,
|
||||
options.StringVar(&__MapNAValue__, "map-na-value", __MapNAValue__,
|
||||
options.Description("Value used when the map attribute is not defined for a sequence."))
|
||||
|
||||
options.BoolVar(&__AllowEmpty__, "allow-empty", __AllowEmpty__,
|
||||
options.Description("Allow sequences with empty map"))
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
MatrixOptionSet(options)
|
||||
obicsv.CSVOptionSet(options)
|
||||
obiconvert.InputOptionSet(options)
|
||||
}
|
||||
|
||||
@@ -57,8 +63,8 @@ func CLISampleName() string {
|
||||
return __sampleName__
|
||||
}
|
||||
|
||||
func CLINaValue() string {
|
||||
return __NAValue__
|
||||
func CLIMapNaValue() string {
|
||||
return __MapNAValue__
|
||||
}
|
||||
|
||||
func CLIMapAttribute() string {
|
||||
@@ -68,3 +74,7 @@ func CLIMapAttribute() string {
|
||||
func CLITranspose() bool {
|
||||
return __transpose__
|
||||
}
|
||||
|
||||
func CLIStrict() bool {
|
||||
return !__AllowEmpty__
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user