Patch rev complement and first implementation of --auto in obicsv

Former-commit-id: f3020e81283b1073c4d1c2d2ff0887e3998e6764
This commit is contained in:
2023-11-07 09:37:07 +02:00
parent 6a6a6f6f2c
commit 61c30f9b6a
21 changed files with 270 additions and 107 deletions

View File

@@ -8,6 +8,48 @@ import (
log "github.com/sirupsen/logrus"
)
// AttributeKeys collects the names of the annotations attached to the
// BioSequence.
//
// Parameters:
//   - skip_map: when true, annotations whose value is reported as a map by
//     obiutils.IsAMap are left out of the result.
//
// Returns:
//   - the set of selected annotation names.
func (s *BioSequence) AttributeKeys(skip_map bool) obiutils.Set[string] {
	names := obiutils.MakeSet[string]()

	for name, value := range s.Annotations() {
		// IsAMap is only consulted when map-valued annotations must be skipped.
		if skip_map && obiutils.IsAMap(value) {
			continue
		}
		names.Add(name)
	}

	return names
}
// Keys returns every key available on the BioSequence.
//
// The result starts from AttributeKeys(skip_map) and is completed with the
// built-in keys: "id" is always present, "sequence" is added when
// HasSequence() is true and "qualities" when HasQualities() is true.
//
// Parameters:
//   - skip_map: forwarded unchanged to AttributeKeys.
//
// Returns:
//   - the set of key names.
func (s *BioSequence) Keys(skip_map bool) obiutils.Set[string] {
	all := s.AttributeKeys(skip_map)

	// The identifier key is unconditional.
	all.Add("id")

	// The remaining built-in keys are only listed when the matching data exists.
	optional := [...]struct {
		name    string
		present bool
	}{
		{"sequence", s.HasSequence()},
		{"qualities", s.HasQualities()},
	}
	for _, field := range optional {
		if field.present {
			all.Add(field.name)
		}
	}

	return all
}
// HasAttribute checks if the BioSequence has the specified attribute.
//
// Parameters:
@@ -16,6 +58,17 @@ import (
// Returns:
// - a boolean indicating whether the BioSequence has the attribute.
func (s *BioSequence) HasAttribute(key string) bool {
if key == "id" {
return true
}
if key == "sequence" && s.sequence != nil {
return true
}
if key == "qualities" && s.qualities != nil {
return true
}
ok := s.annotations != nil
if ok {
@@ -36,6 +89,25 @@ func (s *BioSequence) HasAttribute(key string) bool {
// - val: The value associated with the given key.
// - ok: A boolean indicating whether the key exists in the annotations map.
func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
if key == "id" {
return s.id, true
}
if key == "sequence" {
if s.HasSequence() {
return s.String(), true
}
return nil, false
}
if key == "qualities" {
if s.HasQualities() {
return s.QualitiesString(), true
}
return nil, false
}
var val interface{}
ok := s.annotations != nil
@@ -54,6 +126,17 @@ func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
// - key: the key to set the value for.
// - value: the value to set for the given key.
func (s *BioSequence) SetAttribute(key string, value interface{}) {
if key == "id" {
s.SetId(value.(string))
return
}
if key == "sequence" {
s.SetSequence(value.([]byte))
return
}
annot := s.Annotations()
defer s.AnnotationsUnlock()

View File

@@ -15,6 +15,7 @@ import (
"sync"
"sync/atomic"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
@@ -55,8 +56,7 @@ type Annotation map[string]interface{}
// A BioSequence is a sequence of bytes with an identifier, a definition, a sequence, qualities,
// features and annotations. It aims to represent a biological sequence
type BioSequence struct {
id string // The identidier of the sequence (private accessible through the method Id)
//definition string // The documentation of the sequence (private accessible through the method Definition)
id string // The identidier of the sequence (private accessible through the method Id)
source string // The filename without directory name and extension from where the sequence was read.
sequence []byte // The sequence itself, it is accessible by the methode Sequence
qualities []byte // The quality scores of the sequence.
@@ -188,6 +188,14 @@ func (s *BioSequence) Definition() string {
return definition
}
// HasSequence reports whether the BioSequence holds at least one sequence
// byte.
//
// It takes no parameters. A nil sequence and an empty sequence both yield
// false.
func (s *BioSequence) HasSequence() bool {
	// len of a nil slice is 0, so no separate nil test is needed.
	return len(s.sequence) > 0
}
// Sequence returns the sequence of the BioSequence.
//
// Returns:
@@ -217,7 +225,7 @@ func (s *BioSequence) Len() int {
// This function does not have any parameters.
// It returns a boolean value indicating whether the BioSequence has qualities.
func (s *BioSequence) HasQualities() bool {
	// len of a nil slice is 0, so this single test covers both the nil and
	// the empty case.
	return len(s.qualities) > 0
}
// Qualities returns the sequence quality scores of the BioSequence.
@@ -235,6 +243,19 @@ func (s *BioSequence) Qualities() Quality {
return __make_default_qualities__(len(s.sequence))
}
// QualitiesString renders the quality scores of the BioSequence as a string.
//
// Each score returned by Qualities() is increased by the value of
// obioptions.OutputQualityShift(), truncated to a byte, and the resulting
// bytes are returned as a string of the same length.
func (s *BioSequence) QualitiesString() string {
	// The shift is the same for every position, so convert it only once.
	shift := byte(obioptions.OutputQualityShift())
	scores := s.Qualities()

	encoded := make([]byte, len(scores))
	for i := range scores {
		encoded[i] = byte(scores[i] + shift)
	}

	return string(encoded)
}
// Features returns the feature string of the BioSequence.
//
// The feature string contains the EMBL/GenBank not parsed feature table

View File

@@ -3,6 +3,7 @@ package obiseq
import (
"sync"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
log "github.com/sirupsen/logrus"
"golang.org/x/exp/slices"
)
@@ -171,3 +172,13 @@ func (s BioSequenceSlice) Size() int {
return size
}
// AttributeKeys returns the union of the attribute keys of all the sequences
// in the slice.
//
// Parameters:
//   - skip_map: forwarded unchanged to the AttributeKeys method of each
//     sequence.
//
// Returns:
//   - the set of key names; it is empty when the slice holds no sequence.
func (s BioSequenceSlice) AttributeKeys(skip_map bool) obiutils.Set[string] {
	union := obiutils.MakeSet[string]()

	for i := range s {
		union = union.Union(s[i].AttributeKeys(skip_map))
	}

	return union
}

View File

@@ -198,6 +198,15 @@ var OBILang = gval.NewLanguage(
composition := (args[0].(*BioSequence)).Composition()
return float64(composition['g']-composition['c']) / float64(composition['g']+composition['c']), nil
}),
gval.Function("gc", func(args ...interface{}) (interface{}, error) {
composition := (args[0].(*BioSequence)).Composition()
return float64(composition['g']+composition['c']) / float64(args[0].(*BioSequence).Len()), nil
}),
gval.Function("composition", func(args ...interface{}) (interface{}, error) {
return (args[0].(*BioSequence)).Composition(), nil
comp := (args[0].(*BioSequence)).Composition()
scomp := make(map[string]float64)
for k, v := range comp {
scomp[string(k)] = float64(v)
}
return scomp, nil
}))

View File

@@ -1,7 +1,17 @@
package obiseq
// ".ABCDEFGHIJKLMNOPQRSTUVWXYZ#![]"
var _revcmpDNA = []byte(".TVGHEFCDIJMLKNOPQYSAABWXRZ#!][")
var _revcmpDNA = []byte(".TVGHNNCDNNMNKNNNNYSAABWNRN]N[NNN")
// complement returns the complement of a single sequence symbol.
//
//   - '.' and '-' are returned unchanged.
//   - Any byte in the ASCII range 'A'..'z' is translated through _revcmpDNA,
//     indexed by the five low bits of the byte; bit 0x20 of the input is
//     OR-ed back into the result, so a lower-case letter stays lower-case.
//   - Every other byte yields 'n'.
func complement(n byte) byte {
	if n == '.' || n == '-' {
		return n
	}

	if n < 'A' || n > 'z' {
		return 'n'
	}

	index := n & 31     // position in the lookup table ('A' and 'a' both give 1)
	caseBit := n & 0x20 // set for lower-case ASCII letters
	return _revcmpDNA[index] | caseBit
}
// Reverse complements a DNA sequence.
// If the inplace parameter is true, that operation is done in place.
@@ -18,8 +28,7 @@ func (sequence *BioSequence) ReverseComplement(inplace bool) *BioSequence {
// ASCII code & 31 -> builds an index in which (a|A) is 1
// ASCII code & 0x20 -> Force lower case
s[j], s[i] = _revcmpDNA[s[i]&31]|(s[i]&0x20),
_revcmpDNA[s[j]&31]|(s[j]&0x20)
s[j], s[i] = complement(s[i]), complement(s[j])
j++
}
@@ -40,8 +49,7 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
b := []byte(m)
// Exchange and reverse complement symbols
b[1], b[9] = _revcmpDNA[b[9]&31]|(b[9]&0x20),
_revcmpDNA[b[1]&31]|(b[1]&0x20)
b[1], b[9] = complement(b[9]), complement(b[1])
// Exchange sequencing scores
b[3], b[4], b[11], b[12] = b[11], b[12], b[3], b[4]
@@ -65,7 +73,6 @@ func (sequence *BioSequence) _revcmpMutation() *BioSequence {
return sequence
}
func ReverseComplementWorker(inplace bool) SeqWorker {
f := func(input *BioSequence) *BioSequence {
return input.ReverseComplement(inplace)