From 3f8c0d6a2f2928ed37fa674289a3bafc2634eb91 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 30 Aug 2023 19:59:46 +0200 Subject: [PATCH] Replace MakeBioSequence call by NewBioSequence call, Implements a new file format guesser Adds some more API doc Former-commit-id: 9837bf1c28beca6ddb599b367f93548950ba83c1 --- pkg/obialign/alignment.go | 27 ++- pkg/obiformats/universal_read.go | 141 ++++++++---- pkg/obikmer/debruijn.go | 10 +- pkg/obiseq/biosequence.go | 135 +++++++----- pkg/obiseq/biosequence_test.go | 366 +++++++++++++++++++++++++++++++ pkg/obiseq/pool.go | 9 +- pkg/obiutils/goutils.go | 10 +- 7 files changed, 582 insertions(+), 116 deletions(-) create mode 100644 pkg/obiseq/biosequence_test.go diff --git a/pkg/obialign/alignment.go b/pkg/obialign/alignment.go index c43dd00..59dc3f9 100644 --- a/pkg/obialign/alignment.go +++ b/pkg/obialign/alignment.go @@ -8,21 +8,30 @@ import ( "fmt" "math" "strings" - "sync" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" ) -// A pool of byte slices. -var _BuildAlignArenaPool = sync.Pool{ - New: func() interface{} { - bs := make([]byte, 0, 300) - return &bs - }, -} +// // A pool of byte slices. +// var _BuildAlignArenaPool = sync.Pool{ +// New: func() interface{} { +// bs := make([]byte, 0, 300) +// return &bs +// }, +// } +// _BuildAlignment builds the alignment between two sequences. +// // It takes two sequences, a path, a gap character, and two buffers, and it builds the alignment by -// walking the path and copying the sequences into the buffers +// walking the path and copying the sequences into the buffers. +// +// Parameters: +// - seqA: a byte slice representing the first sequence. +// - seqB: a byte slice representing the second sequence. +// - path: a slice of integers representing the alignment path. +// - gap: a byte representing the gap character. +// - bufferA: a pointer to a byte slice for storing the aligned sequence A. +// - bufferB: a pointer to a byte slice for storing the aligned sequence B. func _BuildAlignment(seqA, seqB []byte, path []int, gap byte, bufferA, bufferB *[]byte) { *bufferA = (*bufferA)[:0] diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index f091ecb..f63b3fb 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -2,11 +2,14 @@ package obiformats import ( "bufio" - gzip "github.com/klauspost/pgzip" + "bytes" "io" "os" "path" - "strings" + "regexp" + + "github.com/gabriel-vasile/mimetype" + gzip "github.com/klauspost/pgzip" log "github.com/sirupsen/logrus" @@ -14,36 +17,89 @@ import ( "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" ) -func GuessSeqFileType(firstline string) string { - switch { - case strings.HasPrefix(firstline, "#@ecopcr-v2"): - return "ecopcr" - - case strings.HasPrefix(firstline, "#"): - return "ecopcr" - - case strings.HasPrefix(firstline, ">"): - return "fasta" - - case strings.HasPrefix(firstline, "@"): - return "fastq" - - case strings.HasPrefix(firstline, "ID "): - return "embl" - - case strings.HasPrefix(firstline, "LOCUS "): - return "genbank" - - // Special case for genbank release files - // I hope it is enougth stringeant - case strings.HasSuffix(firstline, " Genetic Se"): - return "genbank" - - default: - return "unknown" +// OBIMimeTypeGuesser is a function that takes an io.Reader as input and guesses the MIME type of the data. +// It uses several detectors to identify specific file formats, such as FASTA, FASTQ, ecoPCR2, GenBank, and EMBL. +// The function reads data from the input stream and analyzes it using the mimetype library. +// It then returns the detected MIME type, a modified reader with the read data, and any error encountered during the process. +// +// The following file types are recognized: +// - "text/ecopcr": if the first line starts with "#@ecopcr-v2". +// - "text/fasta": if the first line starts with ">". +// - "text/fastq": if the first line starts with "@". +// - "text/embl": if the first line starts with "ID ". +// - "text/genbank": if the first line starts with "LOCUS ". +// - "text/genbank" (special case): if the first line "Genetic Sequence Data Bank" (for genbank release files). +// - "text/csv" +// +// Parameters: +// - stream: An io.Reader representing the input stream to read data from. +// +// Returns: +// - *mimetype.MIME: The detected MIME type of the data. +// - io.Reader: A modified reader with the read data. +// - error: Any error encountered during the process. +func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { + fastaDetector := func(raw []byte, limit uint32) bool { + ok, err := regexp.Match("^>[^ ]", raw) + return ok && err == nil } + + fastqDetector := func(raw []byte, limit uint32) bool { + ok, err := regexp.Match("^@[^ ]", raw) + return ok && err == nil + } + + ecoPCR2Detector := func(raw []byte, limit uint32) bool { + ok := bytes.HasPrefix(raw, []byte("#@ecopcr-v2")) + return ok + } + + genbankDetector := func(raw []byte, limit uint32) bool { + ok2 := bytes.HasPrefix(raw, []byte("LOCUS ")) + ok1, err := regexp.Match("^[^ ]* +Genetic Sequence Data Bank *\n", raw) + return ok2 || (ok1 && err == nil) + } + + emblDetector := func(raw []byte, limit uint32) bool { + ok := bytes.HasPrefix(raw, []byte("ID ")) + return ok + } + + mimetype.Lookup("text/plain").Extend(fastaDetector, "text/fasta", ".fasta") + mimetype.Lookup("text/plain").Extend(fastqDetector, "text/fastq", ".fastq") + mimetype.Lookup("text/plain").Extend(ecoPCR2Detector, "text/ecopcr2", ".ecopcr") + mimetype.Lookup("text/plain").Extend(genbankDetector, "text/genbank", ".seq") + mimetype.Lookup("text/plain").Extend(emblDetector, "text/embl", ".dat") + + // Create a buffer to store the read data + buf := make([]byte, 1024*128) + n, err := stream.Read(buf) + + if err != nil && err != io.EOF { + return nil, nil, err + } + + // Detect the MIME type using the mimetype library + mimeType := mimetype.Detect(buf) + if mimeType == nil { + return nil, nil, err + } + + // Create a new reader based on the read data + newReader := io.MultiReader(bytes.NewReader(buf[:n]), stream) + + return mimeType, newReader, nil } +// ReadSequencesFromFile reads sequences from a file and returns an iterator of bio sequences and an error. +// +// Parameters: +// - filename: The name of the file to read the sequences from. +// - options: Optional parameters to customize the reading process. +// +// Returns: +// - obiiter.IBioSequence: An iterator of bio sequences. +// - error: An error if any occurred during the reading process. func ReadSequencesFromFile(filename string, options ...WithOption) (obiiter.IBioSequence, error) { var file *os.File @@ -71,35 +127,28 @@ func ReadSequencesFromFile(filename string, reader = greader } - breader := bufio.NewReader(reader) + mime, reader, err := OBIMimeTypeGuesser(reader) - tag, _ := breader.Peek(30) - - if len(tag) < 30 { - newIter := obiiter.MakeIBioSequence() - newIter.Close() - return newIter, nil + if err != nil { + return obiiter.NilIBioSequence, err } - filetype := GuessSeqFileType(string(tag)) - log.Debugf("File guessed format : %s (tag: %s)", - filetype, (strings.Split(string(tag), "\n"))[0]) - reader = breader + reader = bufio.NewReader(reader) - switch filetype { - case "fastq", "fasta": + switch mime.String() { + case "text/fasta", "text/fastq": file.Close() is, err := ReadFastSeqFromFile(filename, options...) return is, err - case "ecopcr": + case "text/ecopcr2": return ReadEcoPCR(reader, options...), nil - case "embl": + case "text/embl": return ReadEMBL(reader, options...), nil - case "genbank": + case "text/genbank": return ReadGenbank(reader, options...), nil default: log.Fatalf("File %s has guessed format %s which is not yet implemented", - filename, filetype) + filename, mime.String()) } return obiiter.NilIBioSequence, nil diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 6042a9b..5d7db7c 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -226,13 +226,13 @@ func (g *DeBruijnGraph) LongestConsensus(id string) (*obiseq.BioSequence, error) s := g.DecodePath(path) if len(s) > 0 { - seq := obiseq.MakeBioSequence( + seq := obiseq.NewBioSequence( id, []byte(s), "", ) - return &seq, nil + return seq, nil } return nil, fmt.Errorf("cannot identify optimum path") @@ -295,13 +295,13 @@ func (g *DeBruijnGraph) BestConsensus(id string) (*obiseq.BioSequence, error) { s := g.DecodePath(path) if len(s) > 0 { - seq := obiseq.MakeBioSequence( + seq := obiseq.NewBioSequence( id, []byte(s), "", ) - return &seq, nil + return seq, nil } return nil, fmt.Errorf("cannot identify optimum path") @@ -366,7 +366,7 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) { for _, idx := range init { graph.append(s[graph.kmersize:], idx) - } + } } } diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index f9a1c0a..e25f3d4 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -34,6 +34,10 @@ type Quality []uint8 var __default_qualities__ = make(Quality, 0, 500) +// __make_default_qualities__ generates a default quality slice of the given length. +// +// It takes an integer parameter 'length' which specifies the desired length of the quality slice. +// It returns a Quality slice which is a subset of the '__default_qualities__' slice. func __make_default_qualities__(length int) Quality { cl := len(__default_qualities__) if cl < length { @@ -59,11 +63,14 @@ type BioSequence struct { feature []byte paired *BioSequence // A pointer to the paired sequence annotations Annotation - annot_lock *sync.Mutex + annot_lock sync.Mutex } -// MakeEmptyBioSequence() creates a new BioSequence object with no data -func MakeEmptyBioSequence(preallocate int) BioSequence { +// NewEmptyBioSequence creates a new BioSequence object with an empty sequence. +// +// The preallocate parameter specifies the number of bytes to preallocate for the sequence. If preallocate is greater than 0, the sequence will be preallocated with the specified number of bytes. If preallocate is 0, the sequence will not be preallocated. +// The function returns a pointer to the newly created BioSequence object. +func NewEmptyBioSequence(preallocate int) *BioSequence { atomic.AddInt32(&_NewSeq, 1) atomic.AddInt32(&_InMemSeq, 1) @@ -72,7 +79,7 @@ func MakeEmptyBioSequence(preallocate int) BioSequence { seq = GetSlice(preallocate) } - return BioSequence{ + return &BioSequence{ id: "", definition: "", source: "", @@ -81,36 +88,33 @@ func MakeEmptyBioSequence(preallocate int) BioSequence { feature: nil, paired: nil, annotations: nil, - annot_lock: &sync.Mutex{}, + annot_lock: sync.Mutex{}, } } -// `NewEmptyBioSequence()` returns a pointer to a new empty BioSequence -func NewEmptyBioSequence(preallocate int) *BioSequence { - s := MakeEmptyBioSequence(preallocate) - return &s -} - -// `MakeBioSequence` creates a new `BioSequence` with the given `id`, `sequence`, and `definition` -func MakeBioSequence(id string, +// NewBioSequence creates a new BioSequence object with the given ID, sequence, and definition. +// +// Parameters: +// - id: the ID of the BioSequence. +// - sequence: the sequence data of the BioSequence. +// - definition: the definition of the BioSequence. +// +// Returns: +// - *BioSequence: the newly created BioSequence object. +func NewBioSequence(id string, sequence []byte, - definition string) BioSequence { - bs := MakeEmptyBioSequence(0) + definition string) *BioSequence { + bs := NewEmptyBioSequence(0) bs.SetId(id) bs.SetSequence(sequence) bs.SetDefinition(definition) return bs } -// `NewBioSequence` creates a new `BioSequence` struct and returns a pointer to it -func NewBioSequence(id string, - sequence []byte, - definition string) *BioSequence { - s := MakeBioSequence(id, sequence, definition) - return &s -} - -// A method that is called when the sequence is no longer needed. +// Recycle recycles the BioSequence object. +// +// It decreases the count of in-memory sequences and increases the count of recycled sequences. +// It also recycles the various slices and annotations of the BioSequence object. func (sequence *BioSequence) Recycle() { atomic.AddInt32(&_RecycleSeq, 1) @@ -133,9 +137,15 @@ func (sequence *BioSequence) Recycle() { } } -// Copying the BioSequence. +// Copy returns a new BioSequence that is a copy of the original BioSequence. +// +// It copies the id and definition fields of the original BioSequence to the new BioSequence. +// It also creates new slices and copies the values from the original BioSequence's sequence, qualities, and feature fields to the new BioSequence. +// If the original BioSequence has annotations, it locks the annot_lock and copies the annotations to the new BioSequence. +// +// The function returns the new BioSequence. func (s *BioSequence) Copy() *BioSequence { - newSeq := MakeEmptyBioSequence(0) + newSeq := NewEmptyBioSequence(0) newSeq.id = s.id newSeq.definition = s.definition @@ -150,30 +160,45 @@ func (s *BioSequence) Copy() *BioSequence { newSeq.annotations = GetAnnotation(s.annotations) } - return &newSeq + return newSeq } -// A method that returns the id of the sequence. +// Id returns the ID of the BioSequence. +// +// No parameters. +// Returns a string. func (s *BioSequence) Id() string { return s.id } -// A method that returns the definition of the sequence. +// Definition returns the definition of the BioSequence. +// +// No parameters. +// Returns a string. func (s *BioSequence) Definition() string { return s.definition } -// A method that returns the sequence as a byte slice. +// Sequence returns the sequence of the BioSequence. +// +// Returns: +// - []byte: The sequence of the BioSequence. func (s *BioSequence) Sequence() []byte { return s.sequence } -// A method that returns the sequence as a string. +// String returns the string representation of the Sequence. +// +// No parameters. +// Returns a string. func (s *BioSequence) String() string { return string(s.sequence) } -// Returning the length of the sequence. +// Len returns the length of the BioSequence. +// +// It does not take any parameters. +// It returns an integer representing the length of the sequence. func (s *BioSequence) Len() int { return len(s.sequence) } @@ -301,39 +326,47 @@ func (s *BioSequence) WriteString(data string) (int, error) { return s.Write(bdata) } -// A method that appends a byte to the sequence. +// WriteByte appends a byte to the BioSequence's sequence. +// +// data: the byte to append to the sequence. +// error: an error if the append operation fails. func (s *BioSequence) WriteByte(data byte) error { s.sequence = append(s.sequence, data) return nil } -// Clearing the sequence. +// Clear clears the BioSequence by resetting the sequence to an empty slice. +// +// No parameters. +// No return values. func (s *BioSequence) Clear() { s.sequence = s.sequence[0:0] } +// Composition calculates the composition of the BioSequence. +// +// It counts the occurrences of each nucleotide (a, c, g, t) in the sequence +// and returns a map with the counts. +// +// No parameters. +// Returns a map of byte to int, with the counts of each nucleotide. func (s *BioSequence) Composition() map[byte]int { + counts := map[byte]int{ + 'a': 0, + 'c': 0, + 'g': 0, + 't': 0, + 'o': 0, + } - a := 0 - c := 0 - g := 0 - t := 0 - other := 0 for _, char := range s.sequence { - switch char { - case 'a': - a++ - case 'c': - c++ - case 'g': - g++ - case 't': - t++ + switch char | byte(32) { + case 'a', 'c', 'g', 't': + counts[char]++ default: - other++ - + counts['o']++ } } - return map[byte]int{'a': a, 'c': c, 'g': g, 't': t, 'o': other} + return counts } diff --git a/pkg/obiseq/biosequence_test.go b/pkg/obiseq/biosequence_test.go new file mode 100644 index 0000000..9311b7e --- /dev/null +++ b/pkg/obiseq/biosequence_test.go @@ -0,0 +1,366 @@ +package obiseq + +import ( + "bytes" + "reflect" + "sync" + "testing" +) + +// TestNewEmptyBioSequence tests the NewEmptyBioSequence function. +// +// It checks the behavior of the function by creating different BioSequence instances with different preallocate values. +// The function verifies that the sequence is correctly preallocated or not preallocated based on the input value. +// It also checks that the length and capacity of the sequence are set correctly. +// The test fails if the function returns nil or if the sequence length or capacity is not as expected. +func TestNewEmptyBioSequence(t *testing.T) { + // Test case: preallocate is 0, sequence should not be preallocated + seq := NewEmptyBioSequence(0) + if seq == nil { + t.Errorf("NewEmptyBioSequence(0) returned nil") + } else if len(seq.sequence) != 0 { + t.Errorf("Expected sequence length to be 0, got %d", len(seq.sequence)) + } + + // Test case: preallocate is greater than 0, sequence should be preallocated + seq = NewEmptyBioSequence(100) + if seq == nil { + t.Errorf("NewEmptyBioSequence(100) returned nil") + } else if cap(seq.sequence) < 100 { + t.Errorf("Expected sequence capacity to be at least 100, got %d", cap(seq.sequence)) + } + + // Test case: preallocate is negative, sequence should not be preallocated + seq = NewEmptyBioSequence(-100) + if seq == nil { + t.Errorf("NewEmptyBioSequence(-100) returned nil") + } else if len(seq.sequence) != 0 { + t.Errorf("Expected sequence length to be 0, got %d", len(seq.sequence)) + } +} + +// TestNewBioSequence tests the NewBioSequence function. +// +// It checks the correctness of the NewBioSequence function by validating that the BioSequence object +// created has the correct ID, sequence, and definition. +// The function performs two test cases: +// 1. Test case 1 checks if the BioSequence object created using the NewBioSequence function has +// the expected ID, sequence, and definition when provided with valid inputs. +// 2. Test case 2 checks if the BioSequence object created using the NewBioSequence function has +// the expected ID, sequence, and definition when provided with different valid inputs. +func TestNewBioSequence(t *testing.T) { + // Test case 1: + id := "seq1" + sequence := []byte("ACGT") + definition := "DNA sequence" + expectedID := "seq1" + expectedSequence := []byte("acgt") + expectedDefinition := "DNA sequence" + + bs := NewBioSequence(id, sequence, definition) + + if bs.Id() != expectedID { + t.Errorf("Expected ID to be %s, but got %s", expectedID, bs.Id()) + } + + if !bytes.Equal(bs.Sequence(), expectedSequence) { + t.Errorf("Expected sequence to be %v, but got %v", expectedSequence, bs.Sequence()) + } + + if bs.Definition() != expectedDefinition { + t.Errorf("Expected definition to be %s, but got %s", expectedDefinition, bs.Definition()) + } + + // Test case 2: + id = "seq2" + sequence = []byte("ATCG") + definition = "RNA sequence" + expectedID = "seq2" + expectedSequence = []byte("atcg") + expectedDefinition = "RNA sequence" + + bs = NewBioSequence(id, sequence, definition) + + if bs.Id() != expectedID { + t.Errorf("Expected ID to be %s, but got %s", expectedID, bs.Id()) + } + + if !bytes.Equal(bs.Sequence(), expectedSequence) { + t.Errorf("Expected sequence to be %v, but got %v", expectedSequence, bs.Sequence()) + } + + if bs.Definition() != expectedDefinition { + t.Errorf("Expected definition to be %s, but got %s", expectedDefinition, bs.Definition()) + } +} + +// TestBioSequence_Recycle tests the Recycle method of the BioSequence struct. +// +// Test case 1: Recycle a BioSequence object with non-nil slices and annotations. +// Test case 2: Recycle a nil BioSequence object. +// Test case 3: Recycle a BioSequence object with nil slices and annotations. +func TestBioSequence_Recycle(t *testing.T) { + // Test case 1: Recycle a BioSequence object with non-nil slices and annotations + sequence := &BioSequence{ + sequence: []byte{'A', 'C', 'G', 'T'}, + feature: []byte("..."), + qualities: []byte{30, 30, 30, 30}, + annotations: Annotation{"description": "Test"}, + } + sequence.Recycle() + + if len(sequence.sequence) != 0 { + t.Errorf("Expected sequence to be empty, got %v", sequence.sequence) + } + if len(sequence.feature) != 0 { + t.Errorf("Expected feature to be empty, got %v", sequence.feature) + } + if len(sequence.qualities) != 0 { + t.Errorf("Expected qualities to be empty, got %v", sequence.qualities) + } + if sequence.annotations != nil { + t.Errorf("Expected annotations to be nil, got %v", sequence.annotations) + } + + // Test case 2: Recycle a nil BioSequence object + var nilSequence *BioSequence + nilSequence.Recycle() // No panic expected + + // Test case 3: Recycle a BioSequence object with nil slices and annotations + emptySequence := &BioSequence{} + emptySequence.Recycle() // No panic expected +} + +// TestCopy tests the Copy function of the BioSequence struct. +// +// It creates a new BioSequence and copies the fields from the original sequence +// to the new one. It then performs various tests to check if the fields were +// copied correctly. +// +// Parameters: +// - t: The testing.T object used for reporting test failures. +// +// Returns: None. +func TestCopy(t *testing.T) { + seq := &BioSequence{ + id: "test", + definition: "test sequence", + sequence: []byte("ATCG"), + qualities: []byte("1234"), + feature: []byte("feature1...feature2"), + annotations: Annotation{ + "annotation1": "value1", + "annotation2": "value2", + }, + annot_lock: sync.Mutex{}, + } + + newSeq := seq.Copy() + + // Test if the id and definition fields are copied correctly + if newSeq.id != seq.id { + t.Errorf("Expected id to be %v, got %v", seq.id, newSeq.id) + } + if newSeq.definition != seq.definition { + t.Errorf("Expected definition to be %v, got %v", seq.definition, newSeq.definition) + } + + // Test if the sequence, qualities, and feature fields are copied correctly + if !reflect.DeepEqual(newSeq.sequence, seq.sequence) { + t.Errorf("Expected sequence to be %v, got %v", seq.sequence, newSeq.sequence) + } + if !reflect.DeepEqual(newSeq.qualities, seq.qualities) { + t.Errorf("Expected qualities to be %v, got %v", seq.qualities, newSeq.qualities) + } + if !reflect.DeepEqual(newSeq.feature, seq.feature) { + t.Errorf("Expected feature to be %v, got %v", seq.feature, newSeq.feature) + } + + // Test if the annotations are copied correctly + if !reflect.DeepEqual(newSeq.annotations, seq.annotations) { + t.Errorf("Expected annotations to be %v, got %v", seq.annotations, newSeq.annotations) + } +} + +// TestBioSequence_Id tests the Id method of the BioSequence struct. +// +// It initializes a BioSequence with an ID using the constructor and then +// verifies that the Id method returns the expected ID. +// The expected ID is "ABC123". +// The method takes no parameters and returns a string. +func TestBioSequence_Id(t *testing.T) { + // Initialize BioSequence with an ID using the constructor + bioSeq := NewBioSequence("ABC123", []byte(""), "") + + // Test case: ID is returned correctly + expected := "ABC123" + result := bioSeq.Id() + if result != expected { + t.Errorf("Expected ID to be %s, but got %s", expected, result) + } +} + +// TestBioSequenceDefinition tests the Definition() method of the BioSequence struct. +// +// This function verifies the behavior of the Definition() method in two test cases: +// 1. Empty definition: It creates a BioSequence object with an empty definition and verifies that the Definition() method returns an empty string. +// 2. Non-empty definition: It creates a BioSequence object with a non-empty definition and verifies that the Definition() method returns the expected definition. +func TestBioSequenceDefinition(t *testing.T) { + // Test case 1: Empty definition + seq1 := NewBioSequence("", []byte{}, "") + expected1 := "" + if got1 := seq1.Definition(); got1 != expected1 { + t.Errorf("Expected %q, but got %q", expected1, got1) + } + + // Test case 2: Non-empty definition + seq2 := NewBioSequence("", []byte{}, "This is a definition") + expected2 := "This is a definition" + if got2 := seq2.Definition(); got2 != expected2 { + t.Errorf("Expected %q, but got %q", expected2, got2) + } +} + +// TestBioSequenceSequence tests the Sequence() method of the BioSequence struct. +// +// It verifies the behavior of the Sequence() method under two scenarios: +// - Test case 1: Empty sequence +// - Test case 2: Non-empty sequence +// +// Parameter(s): +// - t: The testing object provided by the testing framework. +// It is used to report errors if the test fails. +// +// Return type(s): +// None. +func TestBioSequenceSequence(t *testing.T) { + // Test case 1: Empty sequence + seq := NewBioSequence("", []byte{}, "") + expected := []byte{} + actual := seq.Sequence() + if !bytes.EqualFold(actual, expected) { + t.Errorf("Expected %v, but got %v", expected, actual) + } + + // Test case 2: Non-empty sequence + seq = NewBioSequence("", []byte("atcg"), "") + expected = []byte("atcg") + actual = seq.Sequence() + if !bytes.EqualFold(actual, expected) { + t.Errorf("Expected %v, but got %v", expected, actual) + } +} + +// TestBioSequence_String tests the String method of the BioSequence struct. +// +// It includes two test cases: +// +// 1. Test case 1: Empty sequence +// - Creates an empty BioSequence instance. +// - Expects an empty string as the result of calling the String method on the BioSequence instance. +// +// 2. Test case 2: Non-empty sequence +// - Creates a BioSequence instance with the sequence "acgt". +// - Expects the sequence "acgt" as the result of calling the String method on the BioSequence instance. +// +// No parameters are required. +// No return types are specified. +func TestBioSequence_String(t *testing.T) { + // Test case 1: Empty sequence + seq1 := &BioSequence{} + expected1 := "" + if got1 := seq1.String(); got1 != expected1 { + t.Errorf("Test case 1 failed: expected %s, got %s", expected1, got1) + } + + // Test case 2: Non-empty sequence + seq2 := &BioSequence{sequence: []byte("acgt")} + expected2 := "acgt" + if got2 := seq2.String(); got2 != expected2 { + t.Errorf("Test case 2 failed: expected %s, got %s", expected2, got2) + } +} + +// TestBioSequence_Len tests the Len method of the BioSequence struct. +// +// It verifies the behavior of the method by performing multiple test cases. +// Each test case creates a BioSequence instance with a specific sequence and +// compares the actual length returned by the Len method with the expected +// length. +// +// Test 1: Empty sequence +// - Create a BioSequence instance with an empty sequence. +// - The expected length is 0. +// - Check if the actual length returned by the Len method matches the expected +// length. If not, report an error. +// +// Test 2: Sequence with 5 characters +// - Create a BioSequence instance with a sequence of 5 characters. +// - The expected length is 5. +// - Check if the actual length returned by the Len method matches the expected +// length. If not, report an error. +// +// Test 3: Sequence with 10 characters +// - Create a BioSequence instance with a sequence of 10 characters. +// - The expected length is 10. +// - Check if the actual length returned by the Len method matches the expected +// length. If not, report an error. +func TestBioSequence_Len(t *testing.T) { + // Test 1: Empty sequence + s1 := NewBioSequence("", nil, "") + expected1 := 0 + if len := s1.Len(); len != expected1 { + t.Errorf("Expected length: %d, but got: %d", expected1, len) + } + + // Test 2: Sequence with 5 characters + s2 := NewBioSequence("", []byte("ATCGT"), "") + expected2 := 5 + if len := s2.Len(); len != expected2 { + t.Errorf("Expected length: %d, but got: %d", expected2, len) + } + + // Test 3: Sequence with 10 characters + s3 := NewBioSequence("", []byte("AGCTAGCTAG"), "") + expected3 := 10 + if len := s3.Len(); len != expected3 { + t.Errorf("Expected length: %d, but got: %d", expected3, len) + } +} + +// TestBioSequence_Composition tests the Composition method of the BioSequence struct. +// +// It tests the method with three different test cases: +// 1. Empty sequence: It checks if the Composition method returns the expected composition when the sequence is empty. +// 2. Sequence with valid nucleotides: It checks if the Composition method returns the expected composition when the sequence contains valid nucleotides. +// 3. Sequence with invalid nucleotides: It checks if the Composition method returns the expected composition when the sequence contains invalid nucleotides. +// +// The expected composition for each test case is defined in a map where the keys are the nucleotides and the values are the expected counts. +// The Composition method is expected to return a map with the actual nucleotide counts. +// +// Parameters: +// - t: The testing.T object used for reporting test failures and logging. +// +// Return type: void. +func TestBioSequence_Composition(t *testing.T) { + // Test case: Empty sequence + seq1 := NewBioSequence("", []byte(""), "") + expected1 := map[byte]int{'a': 0, 'c': 0, 'g': 0, 't': 0, 'o': 0} + if result1 := seq1.Composition(); !reflect.DeepEqual(result1, expected1) { + t.Errorf("Composition() returned incorrect result for empty sequence. Got %v, expected %v", result1, expected1) + } + + // Test case: Sequence with valid nucleotides + seq2 := NewBioSequence("", []byte("acgtACGT"), "") + expected2 := map[byte]int{'a': 2, 'c': 2, 'g': 2, 't': 2, 'o': 0} + if result2 := seq2.Composition(); !reflect.DeepEqual(result2, expected2) { + t.Errorf("Composition() returned incorrect result for sequence with valid nucleotides. Got %v, expected %v", result2, expected2) + } + + // Test case: Sequence with invalid nucleotides + seq3 := NewBioSequence("", []byte("acgtACGT1234"), "") + expected3 := map[byte]int{'a': 2, 'c': 2, 'g': 2, 't': 2, 'o': 4} + if result3 := seq3.Composition(); !reflect.DeepEqual(result3, expected3) { + t.Errorf("Composition() returned incorrect result for sequence with invalid nucleotides. Got %v, expected %v", result3, expected3) + } +} diff --git a/pkg/obiseq/pool.go b/pkg/obiseq/pool.go index dcf98c1..096f838 100644 --- a/pkg/obiseq/pool.go +++ b/pkg/obiseq/pool.go @@ -38,7 +38,7 @@ func GetSlice(capacity int) []byte { if p == nil || *p == nil || cap(*p) < capacity { return make([]byte, 0, capacity) } - + s := *p if cap(s) < capacity { @@ -73,7 +73,12 @@ func RecycleAnnotation(a *Annotation) { } } -// It returns a new Annotation object, initialized with the values from the first argument +// GetAnnotation returns an Annotation from the BioSequenceAnnotationPool. +// +// It takes as argument O or 1 Annotation annotation object. +// If an annotation object is passed, it is copied into the new Annotation. +// +// It returns an Annotation. func GetAnnotation(values ...Annotation) Annotation { a := Annotation(nil) diff --git a/pkg/obiutils/goutils.go b/pkg/obiutils/goutils.go index 10e5790..09d328e 100644 --- a/pkg/obiutils/goutils.go +++ b/pkg/obiutils/goutils.go @@ -271,10 +271,14 @@ func (m *NotABoolean) Error() string { return m.message } -// > It copies the contents of the `src` map into the `dest` map, but if the value is a map, slice, or -// array, it makes a deep copy of it +// MustFillMap fills the destination map with the values from the source map. +// +// The function takes in two parameters: +// - dest: a map[string]interface{} representing the destination map. +// - src: a map[string]interface{} representing the source map. +// +// There is no return value. func MustFillMap(dest, src map[string]interface{}) { - for k, v := range src { if IsAMap(v) || IsASlice(v) || IsAnArray(v) { v = deepcopy.MustAnything(v)