From d23a911080f5d07bc69823698531e824d0d494fe Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 5 Oct 2023 07:21:12 +0200 Subject: [PATCH] Change the way sequence definition are managed. They are now when present stored as an attribute Former-commit-id: 6e618377c05b42937d2eace3c9668390980ab68c --- pkg/obiapat/pattern.go | 18 ++++++++++++++++++ pkg/obiformats/empty_file.go | 9 +++++++++ pkg/obiformats/fastaseq_read.go | 10 ++++++++++ pkg/obiformats/fastqseq_read.go | 10 ++++++++++ pkg/obiformats/ngsfilter_read.go | 19 ++++++++++++++++--- pkg/obingslibrary/match.go | 19 +++++++++++++++++++ pkg/obiseq/biosequence.go | 23 ++++++++++++++++------- pkg/obiseq/biosequence_test.go | 13 ++++--------- pkg/obiseq/subseq.go | 2 +- pkg/obiutils/cast_interface.go | 12 ++++++++++++ pkg/obiutils/goutils.go | 10 ---------- 11 files changed, 115 insertions(+), 30 deletions(-) create mode 100644 pkg/obiformats/empty_file.go diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 55590c2..9d2f176 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -298,6 +298,24 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int return loc } +// BestMatch finds the best match of a given pattern in a sequence. +// +// The function identifies the first occurrence of the pattern in the sequence. +// The search can be limited to a portion of the sequence using the begin and +// length parameters to find the next occurrences. +// +// The BestMatch method +// takes the following parameters: +// - pattern: the pattern to search for (ApatPattern). +// - sequence: the sequence to search in (ApatSequence). +// - begin: the starting index of the search (int). +// - length: the length of the search (int). +// +// It returns the following values: +// - start: the starting index of the best match (int). +// - end: the ending index of the best match (int). +// - nerr: the number of errors in the best match (int). 
+// - matched: a boolean indicating whether a match was found (bool). func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) (start int, end int, nerr int, matched bool) { res := pattern.FindAllIndex(sequence, begin, length) diff --git a/pkg/obiformats/empty_file.go b/pkg/obiformats/empty_file.go new file mode 100644 index 0000000..8774631 --- /dev/null +++ b/pkg/obiformats/empty_file.go @@ -0,0 +1,9 @@ +package obiformats + +import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + +func ReadEmptyFile(options ...WithOption) (obiiter.IBioSequence, error) { + out := obiiter.MakeIBioSequence() + out.Close() + return out, nil +} diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index a61414c..3ba8248 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -294,6 +294,11 @@ func ReadFastaFromFile(filename string, options ...WithOption) (obiiter.IBioSequ file, err := Ropen(filename) + if err == ErrNoContent { + log.Infof("file %s is empty", filename) + return ReadEmptyFile(options...) + } + if err != nil { return obiiter.NilIBioSequence, err } @@ -305,6 +310,11 @@ func ReadFastaFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSe options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) input, err := Buf(os.Stdin) + if err == ErrNoContent { + log.Infof("stdin is empty") + return ReadEmptyFile(options...) + } + if err != nil { log.Fatalf("open file error: %v", err) return obiiter.NilIBioSequence, err diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 62b559d..dbbe4c1 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -339,6 +339,11 @@ func ReadFastqFromFile(filename string, options ...WithOption) (obiiter.IBioSequ file, err := Ropen(filename) + if err == ErrNoContent { + log.Infof("file %s is empty", filename) + return ReadEmptyFile(options...) 
+ } + if err != nil { return obiiter.NilIBioSequence, err } @@ -350,6 +355,11 @@ func ReadFastqFromStdin(reader io.Reader, options ...WithOption) (obiiter.IBioSe options = append(options, OptionsSource(obiutils.RemoveAllExt("stdin"))) input, err := Buf(os.Stdin) + if err == ErrNoContent { + log.Infof("stdin is empty") + return ReadEmptyFile(options...) + } + if err != nil { log.Fatalf("open file error: %v", err) return obiiter.NilIBioSequence, err diff --git a/pkg/obiformats/ngsfilter_read.go b/pkg/obiformats/ngsfilter_read.go index f6dfd37..3cd0dd0 100644 --- a/pkg/obiformats/ngsfilter_read.go +++ b/pkg/obiformats/ngsfilter_read.go @@ -59,9 +59,13 @@ func _parseMainNGSFilterTags(text string) obingslibrary.TagPair { } } -func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.TagPair, string, string, bool) { +func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.TagPair, string, string, bool, bool) { fields := strings.Fields(text) + if len(fields) < 6 { + return obingslibrary.PrimerPair{}, obingslibrary.TagPair{}, "", "", false, false + } + tags := _parseMainNGSFilterTags(fields[2]) partial := fields[5] == "T" || fields[5] == "t" @@ -72,7 +76,8 @@ func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.T tags, fields[0], fields[1], - partial + partial, + true } func ReadNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { @@ -89,7 +94,15 @@ func ReadNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { split := strings.SplitN(line, "@", 2) - primers, tags, experiment, sample, partial := _parseMainNGSFilter(split[0]) + if len(split) < 1 { + return nil, fmt.Errorf("line %d : invalid format", i+1) + } + + primers, tags, experiment, sample, partial, ok := _parseMainNGSFilter(split[0]) + + if !ok { + return nil, fmt.Errorf("line %d : invalid format", i+1) + } marker, _ := ngsfilter.GetMarker(primers.Forward, primers.Reverse) pcr, ok := marker.GetPCR(tags.Forward, tags.Reverse) 
diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index dac3a92..995ef00 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -104,6 +104,16 @@ func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel return nil } +// Match finds the best matching demultiplex for a given sequence. +// +// Parameters: +// +// marker - a pointer to a Marker struct that contains the forward and reverse primers. +// sequence - a pointer to a BioSequence struct that represents the input sequence. +// +// Returns: +// +// A pointer to a DemultiplexMatch struct that contains the best matching demultiplex. func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { aseq, _ := obiapat.MakeApatSequence(sequence, false) @@ -223,6 +233,15 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { return nil } +// ExtractBarcode extracts the barcode from the given biosequence. +// +// Parameters: +// - sequence: The biosequence from which to extract the barcode. +// - inplace: A boolean indicating whether the barcode should be extracted in-place or not. +// +// Returns: +// - The biosequence with the extracted barcode. +// - An error indicating any issues encountered during the extraction process. func (match *DemultiplexMatch) ExtractBarcode(sequence *obiseq.BioSequence, inplace bool) (*obiseq.BioSequence, error) { if !inplace { sequence = sequence.Copy() diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 69b4f20..f147549 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -55,8 +55,8 @@ type Annotation map[string]interface{} // A BioSequence is a sequence of bytes with an identifier, a definition, a sequence, qualities, // features and annotations. 
It aims to represent a biological sequence type BioSequence struct { - id string // The identidier of the sequence (private accessible through the method Id) - definition string // The documentation of the sequence (private accessible through the method Definition) + id string // The identidier of the sequence (private accessible through the method Id) + //definition string // The documentation of the sequence (private accessible through the method Definition) source string // The filename without directory name and extension from where the sequence was read. sequence []byte // The sequence itself, it is accessible by the methode Sequence qualities []byte // The quality scores of the sequence. @@ -80,8 +80,8 @@ func NewEmptyBioSequence(preallocate int) *BioSequence { } return &BioSequence{ - id: "", - definition: "", + id: "", + //definition: "", source: "", sequence: seq, qualities: nil, @@ -148,7 +148,7 @@ func (s *BioSequence) Copy() *BioSequence { newSeq := NewEmptyBioSequence(0) newSeq.id = s.id - newSeq.definition = s.definition + //newSeq.definition = s.definition newSeq.sequence = CopySlice(s.sequence) newSeq.qualities = CopySlice(s.qualities) @@ -176,7 +176,16 @@ func (s *BioSequence) Id() string { // No parameters. // Returns a string. func (s *BioSequence) Definition() string { - return s.definition + definition := "" + var err error + def, ok := s.GetAttribute("definition") + if ok { + definition, err = obiutils.InterfaceToString(def) + if err != nil { + definition = "" + } + } + return definition } // Sequence returns the sequence of the BioSequence. @@ -315,7 +324,7 @@ func (s *BioSequence) SetId(id string) { // // It takes a string parameter 'definition' and assigns it to the 'definition' field of the BioSequence struct. func (s *BioSequence) SetDefinition(definition string) { - s.definition = definition + s.SetAttribute("definition", definition) } // SetSource sets the source of the BioSequence. 
diff --git a/pkg/obiseq/biosequence_test.go b/pkg/obiseq/biosequence_test.go index 4089a18..c9ce5f1 100644 --- a/pkg/obiseq/biosequence_test.go +++ b/pkg/obiseq/biosequence_test.go @@ -143,11 +143,10 @@ func TestBioSequence_Recycle(t *testing.T) { // Returns: None. func TestCopy(t *testing.T) { seq := &BioSequence{ - id: "test", - definition: "test sequence", - sequence: []byte("ATCG"), - qualities: []byte("1234"), - feature: []byte("feature1...feature2"), + id: "test", + sequence: []byte("ATCG"), + qualities: []byte("1234"), + feature: []byte("feature1...feature2"), annotations: Annotation{ "annotation1": "value1", "annotation2": "value2", @@ -161,10 +160,6 @@ func TestCopy(t *testing.T) { if newSeq.id != seq.id { t.Errorf("Expected id to be %v, got %v", seq.id, newSeq.id) } - if newSeq.definition != seq.definition { - t.Errorf("Expected definition to be %v, got %v", seq.definition, newSeq.definition) - } - // Test if the sequence, qualities, and feature fields are copied correctly if !reflect.DeepEqual(newSeq.sequence, seq.sequence) { t.Errorf("Expected sequence to be %v, got %v", seq.sequence, newSeq.sequence) diff --git a/pkg/obiseq/subseq.go b/pkg/obiseq/subseq.go index 0e49da3..e78047c 100644 --- a/pkg/obiseq/subseq.go +++ b/pkg/obiseq/subseq.go @@ -32,7 +32,7 @@ func (sequence *BioSequence) Subsequence(from, to int, circular bool) (*BioSeque } newSeq.id = fmt.Sprintf("%s_sub[%d..%d]", sequence.Id(), from+1, to) - newSeq.definition = sequence.definition + // newSeq.definition = sequence.definition } else { newSeq, _ = sequence.Subsequence(from, sequence.Len(), false) newSeq.Write(sequence.Sequence()[0:to]) diff --git a/pkg/obiutils/cast_interface.go b/pkg/obiutils/cast_interface.go index f7ba7bd..9244f6a 100644 --- a/pkg/obiutils/cast_interface.go +++ b/pkg/obiutils/cast_interface.go @@ -1,5 +1,17 @@ package obiutils +import "fmt" + +// InterfaceToString converts an interface value to a string. 
+// +// The function takes an interface{} value as a parameter and returns a string representation of that value. +// It returns the string representation and an error if any occurred during the conversion process. +func InterfaceToString(i interface{}) (val string, err error) { + err = nil + val = fmt.Sprintf("%v", i) + return +} + // CastableToInt checks if the given input can be casted to an integer. // // i: the value to check for castability. diff --git a/pkg/obiutils/goutils.go b/pkg/obiutils/goutils.go index 09d328e..f88c54c 100644 --- a/pkg/obiutils/goutils.go +++ b/pkg/obiutils/goutils.go @@ -4,7 +4,6 @@ import ( "bufio" "bytes" "encoding/json" - "fmt" "io" "os" "reflect" @@ -13,15 +12,6 @@ import ( "github.com/barkimedes/go-deepcopy" ) -// InterfaceToInt converts a interface{} to an integer value if possible. -// If not a "NotAnInteger" error is returned via the err -// return value and val is set to 0. -func InterfaceToString(i interface{}) (val string, err error) { - err = nil - val = fmt.Sprintf("%v", i) - return -} - // NotAnInteger defines a new type of Error : "NotAnInteger" type NotAnInteger struct { message string