mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Replace MakeBioSequence calls with NewBioSequence calls.
Implements a new file format guesser. Adds some more API doc. Former-commit-id: 9837bf1c28beca6ddb599b367f93548950ba83c1
This commit is contained in:
@ -8,21 +8,30 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
)
|
)
|
||||||
|
|
||||||
// A pool of byte slices.
|
// // A pool of byte slices.
|
||||||
var _BuildAlignArenaPool = sync.Pool{
|
// var _BuildAlignArenaPool = sync.Pool{
|
||||||
New: func() interface{} {
|
// New: func() interface{} {
|
||||||
bs := make([]byte, 0, 300)
|
// bs := make([]byte, 0, 300)
|
||||||
return &bs
|
// return &bs
|
||||||
},
|
// },
|
||||||
}
|
// }
|
||||||
|
|
||||||
|
// _BuildAlignment builds the alignment between two sequences.
|
||||||
|
//
|
||||||
// It takes two sequences, a path, a gap character, and two buffers, and it builds the alignment by
|
// It takes two sequences, a path, a gap character, and two buffers, and it builds the alignment by
|
||||||
// walking the path and copying the sequences into the buffers
|
// walking the path and copying the sequences into the buffers.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seqA: a byte slice representing the first sequence.
|
||||||
|
// - seqB: a byte slice representing the second sequence.
|
||||||
|
// - path: a slice of integers representing the alignment path.
|
||||||
|
// - gap: a byte representing the gap character.
|
||||||
|
// - bufferA: a pointer to a byte slice for storing the aligned sequence A.
|
||||||
|
// - bufferB: a pointer to a byte slice for storing the aligned sequence B.
|
||||||
func _BuildAlignment(seqA, seqB []byte, path []int, gap byte, bufferA, bufferB *[]byte) {
|
func _BuildAlignment(seqA, seqB []byte, path []int, gap byte, bufferA, bufferB *[]byte) {
|
||||||
|
|
||||||
*bufferA = (*bufferA)[:0]
|
*bufferA = (*bufferA)[:0]
|
||||||
|
@ -2,11 +2,14 @@ package obiformats
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
gzip "github.com/klauspost/pgzip"
|
"bytes"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"strings"
|
"regexp"
|
||||||
|
|
||||||
|
"github.com/gabriel-vasile/mimetype"
|
||||||
|
gzip "github.com/klauspost/pgzip"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
|
|
||||||
@ -14,36 +17,89 @@ import (
|
|||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
func GuessSeqFileType(firstline string) string {
|
// OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data.
|
||||||
switch {
|
// It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL.
|
||||||
case strings.HasPrefix(firstline, "#@ecopcr-v2"):
|
// The function reads data from the input stream and analyzes it using the mimetype library.
|
||||||
return "ecopcr"
|
// It then returns the detected MIME type, a modified reader with the read data, and any error encountered during the process.
|
||||||
|
//
|
||||||
case strings.HasPrefix(firstline, "#"):
|
// The following file types are recognized:
|
||||||
return "ecopcr"
|
// - "text/ecopcr2": if the first line starts with "#@ecopcr-v2".
|
||||||
|
// - "text/fasta": if the first line starts with ">" immediately followed by a non-space character.
|
||||||
case strings.HasPrefix(firstline, ">"):
|
// - "text/fastq": if the first line starts with "@" immediately followed by a non-space character.
|
||||||
return "fasta"
|
// - "text/embl": if the first line starts with "ID ".
|
||||||
|
// - "text/genbank": if the first line starts with "LOCUS ".
|
||||||
case strings.HasPrefix(firstline, "@"):
|
// - "text/genbank" (special case): if the first line contains "Genetic Sequence Data Bank" (for genbank release files).
|
||||||
return "fastq"
|
// - "text/csv"
|
||||||
|
//
|
||||||
case strings.HasPrefix(firstline, "ID "):
|
// Parameters:
|
||||||
return "embl"
|
// - stream: An io.Reader representing the input stream to read data from.
|
||||||
|
//
|
||||||
case strings.HasPrefix(firstline, "LOCUS "):
|
// Returns:
|
||||||
return "genbank"
|
// - *mimetype.MIME: The detected MIME type of the data.
|
||||||
|
// - io.Reader: A modified reader with the read data.
|
||||||
// Special case for genbank release files
|
// - error: Any error encountered during the process.
|
||||||
// I hope it is stringent enough
|
func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||||
case strings.HasSuffix(firstline, " Genetic Se"):
|
fastaDetector := func(raw []byte, limit uint32) bool {
|
||||||
return "genbank"
|
ok, err := regexp.Match("^>[^ ]", raw)
|
||||||
|
return ok && err == nil
|
||||||
default:
|
|
||||||
return "unknown"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fastqDetector := func(raw []byte, limit uint32) bool {
|
||||||
|
ok, err := regexp.Match("^@[^ ]", raw)
|
||||||
|
return ok && err == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ecoPCR2Detector := func(raw []byte, limit uint32) bool {
|
||||||
|
ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2"))
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
genbankDetector := func(raw []byte, limit uint32) bool {
|
||||||
|
ok2 := bytes.HasPrefix(raw, []byte("LOCUS "))
|
||||||
|
ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw)
|
||||||
|
return ok2 || (ok1 && err == nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
emblDetector := func(raw []byte, limit uint32) bool {
|
||||||
|
ok := bytes.HasPrefix(raw, []byte("ID "))
|
||||||
|
return ok
|
||||||
|
}
|
||||||
|
|
||||||
|
mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta")
|
||||||
|
mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq")
|
||||||
|
mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr")
|
||||||
|
mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq")
|
||||||
|
mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat")
|
||||||
|
|
||||||
|
// Create a buffer to store the read data
|
||||||
|
buf := make([]byte, 1024*128)
|
||||||
|
n, err := stream.Read(buf)
|
||||||
|
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect the MIME type using the mimetype library
|
||||||
|
mimeType := mimetype.Detect(buf)
|
||||||
|
if mimeType == nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create a new reader based on the read data
|
||||||
|
newReader := io.MultiReader(bytes.NewReader(buf[:n]), stream)
|
||||||
|
|
||||||
|
return mimeType, newReader, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReadSequencesFromFile reads sequences from a file and returns an iterator of bio sequences and an error.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - filename: The name of the file to read the sequences from.
|
||||||
|
// - options: Optional parameters to customize the reading process.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - obiiter.IBioSequence: An iterator of bio sequences.
|
||||||
|
// - error: An error if any occurred during the reading process.
|
||||||
func ReadSequencesFromFile(filename string,
|
func ReadSequencesFromFile(filename string,
|
||||||
options ...WithOption) (obiiter.IBioSequence, error) {
|
options ...WithOption) (obiiter.IBioSequence, error) {
|
||||||
var file *os.File
|
var file *os.File
|
||||||
@ -71,35 +127,28 @@ func ReadSequencesFromFile(filename string,
|
|||||||
reader = greader
|
reader = greader
|
||||||
}
|
}
|
||||||
|
|
||||||
breader := bufio.NewReader(reader)
|
mime, reader, err := OBIMimeTypeGuesser(reader)
|
||||||
|
|
||||||
tag, _ := breader.Peek(30)
|
if err != nil {
|
||||||
|
return obiiter.NilIBioSequence, err
|
||||||
if len(tag) < 30 {
|
|
||||||
newIter := obiiter.MakeIBioSequence()
|
|
||||||
newIter.Close()
|
|
||||||
return newIter, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
filetype := GuessSeqFileType(string(tag))
|
reader = bufio.NewReader(reader)
|
||||||
log.Debugf("File guessed format : %s (tag: %s)",
|
|
||||||
filetype, (strings.Split(string(tag), "\n"))[0])
|
|
||||||
reader = breader
|
|
||||||
|
|
||||||
switch filetype {
|
switch mime.String() {
|
||||||
case "fastq", "fasta":
|
case "text/fasta", "text/fastq":
|
||||||
file.Close()
|
file.Close()
|
||||||
is, err := ReadFastSeqFromFile(filename, options...)
|
is, err := ReadFastSeqFromFile(filename, options...)
|
||||||
return is, err
|
return is, err
|
||||||
case "ecopcr":
|
case "text/ecopcr2":
|
||||||
return ReadEcoPCR(reader, options...), nil
|
return ReadEcoPCR(reader, options...), nil
|
||||||
case "embl":
|
case "text/embl":
|
||||||
return ReadEMBL(reader, options...), nil
|
return ReadEMBL(reader, options...), nil
|
||||||
case "genbank":
|
case "text/genbank":
|
||||||
return ReadGenbank(reader, options...), nil
|
return ReadGenbank(reader, options...), nil
|
||||||
default:
|
default:
|
||||||
log.Fatalf("File %s has guessed format %s which is not yet implemented",
|
log.Fatalf("File %s has guessed format %s which is not yet implemented",
|
||||||
filename, filetype)
|
filename, mime.String())
|
||||||
}
|
}
|
||||||
|
|
||||||
return obiiter.NilIBioSequence, nil
|
return obiiter.NilIBioSequence, nil
|
||||||
|
@ -226,13 +226,13 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error)
|
|||||||
s := g.DecodePath(path)
|
s := g.DecodePath(path)
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
seq := obiseq.MakeBioSequence(
|
seq := obiseq.NewBioSequence(
|
||||||
id,
|
id,
|
||||||
[]byte(s),
|
[]byte(s),
|
||||||
"",
|
"",
|
||||||
)
|
)
|
||||||
|
|
||||||
return &seq, nil
|
return seq, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("cannot identify optimum path")
|
return nil, fmt.Errorf("cannot identify optimum path")
|
||||||
@ -295,13 +295,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) {
|
|||||||
s := g.DecodePath(path)
|
s := g.DecodePath(path)
|
||||||
|
|
||||||
if len(s) > 0 {
|
if len(s) > 0 {
|
||||||
seq := obiseq.MakeBioSequence(
|
seq := obiseq.NewBioSequence(
|
||||||
id,
|
id,
|
||||||
[]byte(s),
|
[]byte(s),
|
||||||
"",
|
"",
|
||||||
)
|
)
|
||||||
|
|
||||||
return &seq, nil
|
return seq, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("cannot identify optimum path")
|
return nil, fmt.Errorf("cannot identify optimum path")
|
||||||
@ -366,7 +366,7 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
|||||||
|
|
||||||
for _, idx := range init {
|
for _, idx := range init {
|
||||||
graph.append(s[graph.kmersize:], idx)
|
graph.append(s[graph.kmersize:], idx)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,6 +34,10 @@ type Quality []uint8
|
|||||||
|
|
||||||
var __default_qualities__ = make(Quality, 0, 500)
|
var __default_qualities__ = make(Quality, 0, 500)
|
||||||
|
|
||||||
|
// __make_default_qualities__ generates a default quality slice of the given length.
|
||||||
|
//
|
||||||
|
// It takes an integer parameter 'length' which specifies the desired length of the quality slice.
|
||||||
|
// It returns a Quality slice which is a subset of the '__default_qualities__' slice.
|
||||||
func __make_default_qualities__(length int) Quality {
|
func __make_default_qualities__(length int) Quality {
|
||||||
cl := len(__default_qualities__)
|
cl := len(__default_qualities__)
|
||||||
if cl < length {
|
if cl < length {
|
||||||
@ -59,11 +63,14 @@ type BioSequence struct {
|
|||||||
feature []byte
|
feature []byte
|
||||||
paired *BioSequence // A pointer to the paired sequence
|
paired *BioSequence // A pointer to the paired sequence
|
||||||
annotations Annotation
|
annotations Annotation
|
||||||
annot_lock *sync.Mutex
|
annot_lock sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// MakeEmptyBioSequence() creates a new BioSequence object with no data
|
// NewEmptyBioSequence creates a new BioSequence object with an empty sequence.
|
||||||
func MakeEmptyBioSequence(preallocate int) BioSequence {
|
//
|
||||||
|
// The preallocate parameter specifies the number of bytes to preallocate for the sequence. If preallocate is greater than 0, the sequence will be preallocated with the specified number of bytes. If preallocate is 0, the sequence will not be preallocated.
|
||||||
|
// The function returns a pointer to the newly created BioSequence object.
|
||||||
|
func NewEmptyBioSequence(preallocate int) *BioSequence {
|
||||||
atomic.AddInt32(&_NewSeq, 1)
|
atomic.AddInt32(&_NewSeq, 1)
|
||||||
atomic.AddInt32(&_InMemSeq, 1)
|
atomic.AddInt32(&_InMemSeq, 1)
|
||||||
|
|
||||||
@ -72,7 +79,7 @@ func MakeEmptyBioSequence(preallocate int) BioSequence {
|
|||||||
seq = GetSlice(preallocate)
|
seq = GetSlice(preallocate)
|
||||||
}
|
}
|
||||||
|
|
||||||
return BioSequence{
|
return &BioSequence{
|
||||||
id: "",
|
id: "",
|
||||||
definition: "",
|
definition: "",
|
||||||
source: "",
|
source: "",
|
||||||
@ -81,36 +88,33 @@ func MakeEmptyBioSequence(preallocate int) BioSequence {
|
|||||||
feature: nil,
|
feature: nil,
|
||||||
paired: nil,
|
paired: nil,
|
||||||
annotations: nil,
|
annotations: nil,
|
||||||
annot_lock: &sync.Mutex{},
|
annot_lock: sync.Mutex{},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// `NewEmptyBioSequence()` returns a pointer to a new empty BioSequence
|
// NewBioSequence creates a new BioSequence object with the given ID, sequence, and definition.
|
||||||
func NewEmptyBioSequence(preallocate int) *BioSequence {
|
//
|
||||||
s := MakeEmptyBioSequence(preallocate)
|
// Parameters:
|
||||||
return &s
|
// - id: the ID of the BioSequence.
|
||||||
}
|
// - sequence: the sequence data of the BioSequence.
|
||||||
|
// - definition: the definition of the BioSequence.
|
||||||
// `MakeBioSequence` creates a new `BioSequence` with the given `id`, `sequence`, and `definition`
|
//
|
||||||
func MakeBioSequence(id string,
|
// Returns:
|
||||||
|
// - *BioSequence: the newly created BioSequence object.
|
||||||
|
func NewBioSequence(id string,
|
||||||
sequence []byte,
|
sequence []byte,
|
||||||
definition string) BioSequence {
|
definition string) *BioSequence {
|
||||||
bs := MakeEmptyBioSequence(0)
|
bs := NewEmptyBioSequence(0)
|
||||||
bs.SetId(id)
|
bs.SetId(id)
|
||||||
bs.SetSequence(sequence)
|
bs.SetSequence(sequence)
|
||||||
bs.SetDefinition(definition)
|
bs.SetDefinition(definition)
|
||||||
return bs
|
return bs
|
||||||
}
|
}
|
||||||
|
|
||||||
// `NewBioSequence` creates a new `BioSequence` struct and returns a pointer to it
|
// Recycle recycles the BioSequence object.
|
||||||
func NewBioSequence(id string,
|
//
|
||||||
sequence []byte,
|
// It decreases the count of in-memory sequences and increases the count of recycled sequences.
|
||||||
definition string) *BioSequence {
|
// It also recycles the various slices and annotations of the BioSequence object.
|
||||||
s := MakeBioSequence(id, sequence, definition)
|
|
||||||
return &s
|
|
||||||
}
|
|
||||||
|
|
||||||
// A method that is called when the sequence is no longer needed.
|
|
||||||
func (sequence *BioSequence) Recycle() {
|
func (sequence *BioSequence) Recycle() {
|
||||||
|
|
||||||
atomic.AddInt32(&_RecycleSeq, 1)
|
atomic.AddInt32(&_RecycleSeq, 1)
|
||||||
@ -133,9 +137,15 @@ func (sequence *BioSequence) Recycle() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copying the BioSequence.
|
// Copy returns a new BioSequence that is a copy of the original BioSequence.
|
||||||
|
//
|
||||||
|
// It copies the id and definition fields of the original BioSequence to the new BioSequence.
|
||||||
|
// It also creates new slices and copies the values from the original BioSequence's sequence, qualities, and feature fields to the new BioSequence.
|
||||||
|
// If the original BioSequence has annotations, it locks the annot_lock and copies the annotations to the new BioSequence.
|
||||||
|
//
|
||||||
|
// The function returns the new BioSequence.
|
||||||
func (s *BioSequence) Copy() *BioSequence {
|
func (s *BioSequence) Copy() *BioSequence {
|
||||||
newSeq := MakeEmptyBioSequence(0)
|
newSeq := NewEmptyBioSequence(0)
|
||||||
|
|
||||||
newSeq.id = s.id
|
newSeq.id = s.id
|
||||||
newSeq.definition = s.definition
|
newSeq.definition = s.definition
|
||||||
@ -150,30 +160,45 @@ func (s *BioSequence) Copy() *BioSequence {
|
|||||||
newSeq.annotations = GetAnnotation(s.annotations)
|
newSeq.annotations = GetAnnotation(s.annotations)
|
||||||
}
|
}
|
||||||
|
|
||||||
return &newSeq
|
return newSeq
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the id of the sequence.
|
// Id returns the ID of the BioSequence.
|
||||||
|
//
|
||||||
|
// No parameters.
|
||||||
|
// Returns a string.
|
||||||
func (s *BioSequence) Id() string {
|
func (s *BioSequence) Id() string {
|
||||||
return s.id
|
return s.id
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the definition of the sequence.
|
// Definition returns the definition of the BioSequence.
|
||||||
|
//
|
||||||
|
// No parameters.
|
||||||
|
// Returns a string.
|
||||||
func (s *BioSequence) Definition() string {
|
func (s *BioSequence) Definition() string {
|
||||||
return s.definition
|
return s.definition
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the sequence as a byte slice.
|
// Sequence returns the sequence of the BioSequence.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - []byte: The sequence of the BioSequence.
|
||||||
func (s *BioSequence) Sequence() []byte {
|
func (s *BioSequence) Sequence() []byte {
|
||||||
return s.sequence
|
return s.sequence
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the sequence as a string.
|
// String returns the string representation of the Sequence.
|
||||||
|
//
|
||||||
|
// No parameters.
|
||||||
|
// Returns a string.
|
||||||
func (s *BioSequence) String() string {
|
func (s *BioSequence) String() string {
|
||||||
return string(s.sequence)
|
return string(s.sequence)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returning the length of the sequence.
|
// Len returns the length of the BioSequence.
|
||||||
|
//
|
||||||
|
// It does not take any parameters.
|
||||||
|
// It returns an integer representing the length of the sequence.
|
||||||
func (s *BioSequence) Len() int {
|
func (s *BioSequence) Len() int {
|
||||||
return len(s.sequence)
|
return len(s.sequence)
|
||||||
}
|
}
|
||||||
@ -301,39 +326,47 @@ func (s *BioSequence) WriteString(data string) (int, error) {
|
|||||||
return s.Write(bdata)
|
return s.Write(bdata)
|
||||||
}
|
}
|
||||||
|
|
||||||
// A method that appends a byte to the sequence.
|
// WriteByte appends a byte to the BioSequence's sequence.
|
||||||
|
//
|
||||||
|
// data: the byte to append to the sequence.
|
||||||
|
// error: an error if the append operation fails.
|
||||||
func (s *BioSequence) WriteByte(data byte) error {
|
func (s *BioSequence) WriteByte(data byte) error {
|
||||||
s.sequence = append(s.sequence, data)
|
s.sequence = append(s.sequence, data)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Clearing the sequence.
|
// Clear clears the BioSequence by resetting the sequence to an empty slice.
|
||||||
|
//
|
||||||
|
// No parameters.
|
||||||
|
// No return values.
|
||||||
func (s *BioSequence) Clear() {
|
func (s *BioSequence) Clear() {
|
||||||
s.sequence = s.sequence[0:0]
|
s.sequence = s.sequence[0:0]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Composition calculates the composition of the BioSequence.
|
||||||
|
//
|
||||||
|
// It counts the occurrences of each nucleotide (a, c, g, t) in the sequence
|
||||||
|
// and returns a map with the counts.
|
||||||
|
//
|
||||||
|
// No parameters.
|
||||||
|
// Returns a map of byte to int, with the counts of each nucleotide.
|
||||||
func (s *BioSequence) Composition() map[byte]int {
|
func (s *BioSequence) Composition() map[byte]int {
|
||||||
|
counts := map[byte]int{
|
||||||
|
'a': 0,
|
||||||
|
'c': 0,
|
||||||
|
'g': 0,
|
||||||
|
't': 0,
|
||||||
|
'o': 0,
|
||||||
|
}
|
||||||
|
|
||||||
a := 0
|
|
||||||
c := 0
|
|
||||||
g := 0
|
|
||||||
t := 0
|
|
||||||
other := 0
|
|
||||||
for _, char := range s.sequence {
|
for _, char := range s.sequence {
|
||||||
switch char {
|
switch char | byte(32) {
|
||||||
case 'a':
|
case 'a', 'c', 'g', 't':
|
||||||
a++
|
counts[char]++
|
||||||
case 'c':
|
|
||||||
c++
|
|
||||||
case 'g':
|
|
||||||
g++
|
|
||||||
case 't':
|
|
||||||
t++
|
|
||||||
default:
|
default:
|
||||||
other++
|
counts['o']++
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return map[byte]int{'a': a, 'c': c, 'g': g, 't': t, 'o': other}
|
return counts
|
||||||
}
|
}
|
||||||
|
366
pkg/obiseq/biosequence_test.go
Normal file
366
pkg/obiseq/biosequence_test.go
Normal file
@ -0,0 +1,366 @@
|
|||||||
|
package obiseq
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"reflect"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestNewEmptyBioSequence tests the NewEmptyBioSequence function.
|
||||||
|
//
|
||||||
|
// It checks the behavior of the function by creating different BioSequence instances with different preallocate values.
|
||||||
|
// The function verifies that the sequence is correctly preallocated or not preallocated based on the input value.
|
||||||
|
// It also checks that the length and capacity of the sequence are set correctly.
|
||||||
|
// The test fails if the function returns nil or if the sequence length or capacity is not as expected.
|
||||||
|
func TestNewEmptyBioSequence(t *testing.T) {
|
||||||
|
// Test case: preallocate is 0, sequence should not be preallocated
|
||||||
|
seq := NewEmptyBioSequence(0)
|
||||||
|
if seq == nil {
|
||||||
|
t.Errorf("NewEmptyBioSequence(0) returned nil")
|
||||||
|
} else if len(seq.sequence) != 0 {
|
||||||
|
t.Errorf("Expected sequence length to be 0, got %d", len(seq.sequence))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case: preallocate is greater than 0, sequence should be preallocated
|
||||||
|
seq = NewEmptyBioSequence(100)
|
||||||
|
if seq == nil {
|
||||||
|
t.Errorf("NewEmptyBioSequence(100) returned nil")
|
||||||
|
} else if cap(seq.sequence) < 100 {
|
||||||
|
t.Errorf("Expected sequence capacity to be at least 100, got %d", cap(seq.sequence))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case: preallocate is negative, sequence should not be preallocated
|
||||||
|
seq = NewEmptyBioSequence(-100)
|
||||||
|
if seq == nil {
|
||||||
|
t.Errorf("NewEmptyBioSequence(-100) returned nil")
|
||||||
|
} else if len(seq.sequence) != 0 {
|
||||||
|
t.Errorf("Expected sequence length to be 0, got %d", len(seq.sequence))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestNewBioSequence tests the NewBioSequence function.
|
||||||
|
//
|
||||||
|
// It checks the correctness of the NewBioSequence function by validating that the BioSequence object
|
||||||
|
// created has the correct ID, sequence, and definition.
|
||||||
|
// The function performs two test cases:
|
||||||
|
// 1. Test case 1 checks if the BioSequence object created using the NewBioSequence function has
|
||||||
|
// the expected ID, sequence, and definition when provided with valid inputs.
|
||||||
|
// 2. Test case 2 checks if the BioSequence object created using the NewBioSequence function has
|
||||||
|
// the expected ID, sequence, and definition when provided with different valid inputs.
|
||||||
|
func TestNewBioSequence(t *testing.T) {
|
||||||
|
// Test case 1:
|
||||||
|
id := "seq1"
|
||||||
|
sequence := []byte("ACGT")
|
||||||
|
definition := "DNA sequence"
|
||||||
|
expectedID := "seq1"
|
||||||
|
expectedSequence := []byte("acgt")
|
||||||
|
expectedDefinition := "DNA sequence"
|
||||||
|
|
||||||
|
bs := NewBioSequence(id, sequence, definition)
|
||||||
|
|
||||||
|
if bs.Id() != expectedID {
|
||||||
|
t.Errorf("Expected ID to be %s, but got %s", expectedID, bs.Id())
|
||||||
|
}
|
||||||
|
|
||||||
|
if !bytes.Equal(bs.Sequence(), expectedSequence) {
|
||||||
|
t.Errorf("Expected sequence to be %v, but got %v", expectedSequence, bs.Sequence())
|
||||||
|
}
|
||||||
|
|
||||||
|
if bs.Definition() != expectedDefinition {
|
||||||
|
t.Errorf("Expected definition to be %s, but got %s", expectedDefinition, bs.Definition())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case 2:
|
||||||
|
id = "seq2"
|
||||||
|
sequence = []byte("ATCG")
|
||||||
|
definition = "RNA sequence"
|
||||||
|
expectedID = "seq2"
|
||||||
|
expectedSequence = []byte("atcg")
|
||||||
|
expectedDefinition = "RNA sequence"
|
||||||
|
|
||||||
|
bs = NewBioSequence(id, sequence, definition)
|
||||||
|
|
||||||
|
if bs.Id() != expectedID {
|
||||||
|
t.Errorf("Expected ID to be %s, but got %s", expectedID, bs.Id())
|
||||||
|
}
|
||||||
|
|
||||||
|
if !bytes.Equal(bs.Sequence(), expectedSequence) {
|
||||||
|
t.Errorf("Expected sequence to be %v, but got %v", expectedSequence, bs.Sequence())
|
||||||
|
}
|
||||||
|
|
||||||
|
if bs.Definition() != expectedDefinition {
|
||||||
|
t.Errorf("Expected definition to be %s, but got %s", expectedDefinition, bs.Definition())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequence_Recycle tests the Recycle method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// Test case 1: Recycle a BioSequence object with non-nil slices and annotations.
|
||||||
|
// Test case 2: Recycle a nil BioSequence object.
|
||||||
|
// Test case 3: Recycle a BioSequence object with nil slices and annotations.
|
||||||
|
func TestBioSequence_Recycle(t *testing.T) {
|
||||||
|
// Test case 1: Recycle a BioSequence object with non-nil slices and annotations
|
||||||
|
sequence := &BioSequence{
|
||||||
|
sequence: []byte{'A', 'C', 'G', 'T'},
|
||||||
|
feature: []byte("..."),
|
||||||
|
qualities: []byte{30, 30, 30, 30},
|
||||||
|
annotations: Annotation{"description": "Test"},
|
||||||
|
}
|
||||||
|
sequence.Recycle()
|
||||||
|
|
||||||
|
if len(sequence.sequence) != 0 {
|
||||||
|
t.Errorf("Expected sequence to be empty, got %v", sequence.sequence)
|
||||||
|
}
|
||||||
|
if len(sequence.feature) != 0 {
|
||||||
|
t.Errorf("Expected feature to be empty, got %v", sequence.feature)
|
||||||
|
}
|
||||||
|
if len(sequence.qualities) != 0 {
|
||||||
|
t.Errorf("Expected qualities to be empty, got %v", sequence.qualities)
|
||||||
|
}
|
||||||
|
if sequence.annotations != nil {
|
||||||
|
t.Errorf("Expected annotations to be nil, got %v", sequence.annotations)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case 2: Recycle a nil BioSequence object
|
||||||
|
var nilSequence *BioSequence
|
||||||
|
nilSequence.Recycle() // No panic expected
|
||||||
|
|
||||||
|
// Test case 3: Recycle a BioSequence object with nil slices and annotations
|
||||||
|
emptySequence := &BioSequence{}
|
||||||
|
emptySequence.Recycle() // No panic expected
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestCopy tests the Copy function of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It creates a new BioSequence and copies the fields from the original sequence
|
||||||
|
// to the new one. It then performs various tests to check if the fields were
|
||||||
|
// copied correctly.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - t: The testing.T object used for reporting test failures.
|
||||||
|
//
|
||||||
|
// Returns: None.
|
||||||
|
func TestCopy(t *testing.T) {
|
||||||
|
seq := &BioSequence{
|
||||||
|
id: "test",
|
||||||
|
definition: "test sequence",
|
||||||
|
sequence: []byte("ATCG"),
|
||||||
|
qualities: []byte("1234"),
|
||||||
|
feature: []byte("feature1...feature2"),
|
||||||
|
annotations: Annotation{
|
||||||
|
"annotation1": "value1",
|
||||||
|
"annotation2": "value2",
|
||||||
|
},
|
||||||
|
annot_lock: sync.Mutex{},
|
||||||
|
}
|
||||||
|
|
||||||
|
newSeq := seq.Copy()
|
||||||
|
|
||||||
|
// Test if the id and definition fields are copied correctly
|
||||||
|
if newSeq.id != seq.id {
|
||||||
|
t.Errorf("Expected id to be %v, got %v", seq.id, newSeq.id)
|
||||||
|
}
|
||||||
|
if newSeq.definition != seq.definition {
|
||||||
|
t.Errorf("Expected definition to be %v, got %v", seq.definition, newSeq.definition)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test if the sequence, qualities, and feature fields are copied correctly
|
||||||
|
if !reflect.DeepEqual(newSeq.sequence, seq.sequence) {
|
||||||
|
t.Errorf("Expected sequence to be %v, got %v", seq.sequence, newSeq.sequence)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(newSeq.qualities, seq.qualities) {
|
||||||
|
t.Errorf("Expected qualities to be %v, got %v", seq.qualities, newSeq.qualities)
|
||||||
|
}
|
||||||
|
if !reflect.DeepEqual(newSeq.feature, seq.feature) {
|
||||||
|
t.Errorf("Expected feature to be %v, got %v", seq.feature, newSeq.feature)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test if the annotations are copied correctly
|
||||||
|
if !reflect.DeepEqual(newSeq.annotations, seq.annotations) {
|
||||||
|
t.Errorf("Expected annotations to be %v, got %v", seq.annotations, newSeq.annotations)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequence_Id tests the Id method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It initializes a BioSequence with an ID using the constructor and then
|
||||||
|
// verifies that the Id method returns the expected ID.
|
||||||
|
// The expected ID is "ABC123".
|
||||||
|
// The method takes no parameters and returns a string.
|
||||||
|
func TestBioSequence_Id(t *testing.T) {
|
||||||
|
// Initialize BioSequence with an ID using the constructor
|
||||||
|
bioSeq := NewBioSequence("ABC123", []byte(""), "")
|
||||||
|
|
||||||
|
// Test case: ID is returned correctly
|
||||||
|
expected := "ABC123"
|
||||||
|
result := bioSeq.Id()
|
||||||
|
if result != expected {
|
||||||
|
t.Errorf("Expected ID to be %s, but got %s", expected, result)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequenceDefinition tests the Definition() method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// This function verifies the behavior of the Definition() method in two test cases:
|
||||||
|
// 1. Empty definition: It creates a BioSequence object with an empty definition and verifies that the Definition() method returns an empty string.
|
||||||
|
// 2. Non-empty definition: It creates a BioSequence object with a non-empty definition and verifies that the Definition() method returns the expected definition.
|
||||||
|
func TestBioSequenceDefinition(t *testing.T) {
|
||||||
|
// Test case 1: Empty definition
|
||||||
|
seq1 := NewBioSequence("", []byte{}, "")
|
||||||
|
expected1 := ""
|
||||||
|
if got1 := seq1.Definition(); got1 != expected1 {
|
||||||
|
t.Errorf("Expected %q, but got %q", expected1, got1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case 2: Non-empty definition
|
||||||
|
seq2 := NewBioSequence("", []byte{}, "This is a definition")
|
||||||
|
expected2 := "This is a definition"
|
||||||
|
if got2 := seq2.Definition(); got2 != expected2 {
|
||||||
|
t.Errorf("Expected %q, but got %q", expected2, got2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequenceSequence tests the Sequence() method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It verifies the behavior of the Sequence() method under two scenarios:
|
||||||
|
// - Test case 1: Empty sequence
|
||||||
|
// - Test case 2: Non-empty sequence
|
||||||
|
//
|
||||||
|
// Parameter(s):
|
||||||
|
// - t: The testing object provided by the testing framework.
|
||||||
|
// It is used to report errors if the test fails.
|
||||||
|
//
|
||||||
|
// Return type(s):
|
||||||
|
// None.
|
||||||
|
func TestBioSequenceSequence(t *testing.T) {
|
||||||
|
// Test case 1: Empty sequence
|
||||||
|
seq := NewBioSequence("", []byte{}, "")
|
||||||
|
expected := []byte{}
|
||||||
|
actual := seq.Sequence()
|
||||||
|
if !bytes.EqualFold(actual, expected) {
|
||||||
|
t.Errorf("Expected %v, but got %v", expected, actual)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case 2: Non-empty sequence
|
||||||
|
seq = NewBioSequence("", []byte("atcg"), "")
|
||||||
|
expected = []byte("atcg")
|
||||||
|
actual = seq.Sequence()
|
||||||
|
if !bytes.EqualFold(actual, expected) {
|
||||||
|
t.Errorf("Expected %v, but got %v", expected, actual)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequence_String tests the String method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It includes two test cases:
|
||||||
|
//
|
||||||
|
// 1. Test case 1: Empty sequence
|
||||||
|
// - Creates an empty BioSequence instance.
|
||||||
|
// - Expects an empty string as the result of calling the String method on the BioSequence instance.
|
||||||
|
//
|
||||||
|
// 2. Test case 2: Non-empty sequence
|
||||||
|
// - Creates a BioSequence instance with the sequence "acgt".
|
||||||
|
// - Expects the sequence "acgt" as the result of calling the String method on the BioSequence instance.
|
||||||
|
//
|
||||||
|
// No parameters are required.
|
||||||
|
// No return types are specified.
|
||||||
|
func TestBioSequence_String(t *testing.T) {
|
||||||
|
// Test case 1: Empty sequence
|
||||||
|
seq1 := &BioSequence{}
|
||||||
|
expected1 := ""
|
||||||
|
if got1 := seq1.String(); got1 != expected1 {
|
||||||
|
t.Errorf("Test case 1 failed: expected %s, got %s", expected1, got1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case 2: Non-empty sequence
|
||||||
|
seq2 := &BioSequence{sequence: []byte("acgt")}
|
||||||
|
expected2 := "acgt"
|
||||||
|
if got2 := seq2.String(); got2 != expected2 {
|
||||||
|
t.Errorf("Test case 2 failed: expected %s, got %s", expected2, got2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequence_Len tests the Len method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It verifies the behavior of the method by performing multiple test cases.
|
||||||
|
// Each test case creates a BioSequence instance with a specific sequence and
|
||||||
|
// compares the actual length returned by the Len method with the expected
|
||||||
|
// length.
|
||||||
|
//
|
||||||
|
// Test 1: Empty sequence
|
||||||
|
// - Create a BioSequence instance with an empty sequence.
|
||||||
|
// - The expected length is 0.
|
||||||
|
// - Check if the actual length returned by the Len method matches the expected
|
||||||
|
// length. If not, report an error.
|
||||||
|
//
|
||||||
|
// Test 2: Sequence with 5 characters
|
||||||
|
// - Create a BioSequence instance with a sequence of 5 characters.
|
||||||
|
// - The expected length is 5.
|
||||||
|
// - Check if the actual length returned by the Len method matches the expected
|
||||||
|
// length. If not, report an error.
|
||||||
|
//
|
||||||
|
// Test 3: Sequence with 10 characters
|
||||||
|
// - Create a BioSequence instance with a sequence of 10 characters.
|
||||||
|
// - The expected length is 10.
|
||||||
|
// - Check if the actual length returned by the Len method matches the expected
|
||||||
|
// length. If not, report an error.
|
||||||
|
func TestBioSequence_Len(t *testing.T) {
|
||||||
|
// Test 1: Empty sequence
|
||||||
|
s1 := NewBioSequence("", nil, "")
|
||||||
|
expected1 := 0
|
||||||
|
if len := s1.Len(); len != expected1 {
|
||||||
|
t.Errorf("Expected length: %d, but got: %d", expected1, len)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 2: Sequence with 5 characters
|
||||||
|
s2 := NewBioSequence("", []byte("ATCGT"), "")
|
||||||
|
expected2 := 5
|
||||||
|
if len := s2.Len(); len != expected2 {
|
||||||
|
t.Errorf("Expected length: %d, but got: %d", expected2, len)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test 3: Sequence with 10 characters
|
||||||
|
s3 := NewBioSequence("", []byte("AGCTAGCTAG"), "")
|
||||||
|
expected3 := 10
|
||||||
|
if len := s3.Len(); len != expected3 {
|
||||||
|
t.Errorf("Expected length: %d, but got: %d", expected3, len)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestBioSequence_Composition tests the Composition method of the BioSequence struct.
|
||||||
|
//
|
||||||
|
// It tests the method with three different test cases:
|
||||||
|
// 1. Empty sequence: It checks if the Composition method returns the expected composition when the sequence is empty.
|
||||||
|
// 2. Sequence with valid nucleotides: It checks if the Composition method returns the expected composition when the sequence contains valid nucleotides.
|
||||||
|
// 3. Sequence with invalid nucleotides: It checks if the Composition method returns the expected composition when the sequence contains invalid nucleotides.
|
||||||
|
//
|
||||||
|
// The expected composition for each test case is defined in a map where the keys are the nucleotides and the values are the expected counts.
|
||||||
|
// The Composition method is expected to return a map with the actual nucleotide counts.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - t: The testing.T object used for reporting test failures and logging.
|
||||||
|
//
|
||||||
|
// Return type: void.
|
||||||
|
func TestBioSequence_Composition(t *testing.T) {
|
||||||
|
// Test case: Empty sequence
|
||||||
|
seq1 := NewBioSequence("", []byte(""), "")
|
||||||
|
expected1 := map[byte]int{'a': 0, 'c': 0, 'g': 0, 't': 0, 'o': 0}
|
||||||
|
if result1 := seq1.Composition(); !reflect.DeepEqual(result1, expected1) {
|
||||||
|
t.Errorf("Composition() returned incorrect result for empty sequence. Got %v, expected %v", result1, expected1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case: Sequence with valid nucleotides
|
||||||
|
seq2 := NewBioSequence("", []byte("acgtACGT"), "")
|
||||||
|
expected2 := map[byte]int{'a': 2, 'c': 2, 'g': 2, 't': 2, 'o': 0}
|
||||||
|
if result2 := seq2.Composition(); !reflect.DeepEqual(result2, expected2) {
|
||||||
|
t.Errorf("Composition() returned incorrect result for sequence with valid nucleotides. Got %v, expected %v", result2, expected2)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test case: Sequence with invalid nucleotides
|
||||||
|
seq3 := NewBioSequence("", []byte("acgtACGT1234"), "")
|
||||||
|
expected3 := map[byte]int{'a': 2, 'c': 2, 'g': 2, 't': 2, 'o': 4}
|
||||||
|
if result3 := seq3.Composition(); !reflect.DeepEqual(result3, expected3) {
|
||||||
|
t.Errorf("Composition() returned incorrect result for sequence with invalid nucleotides. Got %v, expected %v", result3, expected3)
|
||||||
|
}
|
||||||
|
}
|
@ -38,7 +38,7 @@ func GetSlice(capacity int) []byte {
|
|||||||
if p == nil || *p == nil || cap(*p) < capacity {
|
if p == nil || *p == nil || cap(*p) < capacity {
|
||||||
return make([]byte, 0, capacity)
|
return make([]byte, 0, capacity)
|
||||||
}
|
}
|
||||||
|
|
||||||
s := *p
|
s := *p
|
||||||
|
|
||||||
if cap(s) < capacity {
|
if cap(s) < capacity {
|
||||||
@ -73,7 +73,12 @@ func RecycleAnnotation(a *Annotation) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// It returns a new Annotation object, initialized with the values from the first argument
|
// GetAnnotation returns an Annotation from the BioSequenceAnnotationPool.
|
||||||
|
//
|
||||||
|
// It takes as argument 0 or 1 Annotation object.
|
||||||
|
// If an annotation object is passed, it is copied into the new Annotation.
|
||||||
|
//
|
||||||
|
// It returns an Annotation.
|
||||||
func GetAnnotation(values ...Annotation) Annotation {
|
func GetAnnotation(values ...Annotation) Annotation {
|
||||||
a := Annotation(nil)
|
a := Annotation(nil)
|
||||||
|
|
||||||
|
@ -271,10 +271,14 @@ func (m *NotABoolean) Error() string {
|
|||||||
return m.message
|
return m.message
|
||||||
}
|
}
|
||||||
|
|
||||||
// > It copies the contents of the `src` map into the `dest` map, but if the value is a map, slice, or
|
// MustFillMap fills the destination map with the values from the source map.
|
||||||
// array, it makes a deep copy of it
|
//
|
||||||
|
// The function takes in two parameters:
|
||||||
|
// - dest: a map[string]interface{} representing the destination map.
|
||||||
|
// - src: a map[string]interface{} representing the source map.
|
||||||
|
//
|
||||||
|
// There is no return value.
|
||||||
func MustFillMap(dest, src map[string]interface{}) {
|
func MustFillMap(dest, src map[string]interface{}) {
|
||||||
|
|
||||||
for k, v := range src {
|
for k, v := range src {
|
||||||
if IsAMap(v) || IsASlice(v) || IsAnArray(v) {
|
if IsAMap(v) || IsASlice(v) || IsAnArray(v) {
|
||||||
v = deepcopy.MustAnything(v)
|
v = deepcopy.MustAnything(v)
|
||||||
|
Reference in New Issue
Block a user