From 9e63013bc241aba407ea4e18cddf776652b5d0b0 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 7 May 2024 10:54:12 +0200 Subject: [PATCH] Correction on obiformat of bug leading to partial parsing and add godocs Former-commit-id: b27105355f1a330eedf6eaa72c8ac94f06806c28 --- pkg/obialign/is_d0_or_d1.go | 22 ++++++++- pkg/obiformats/fastaseq_read.go | 2 + pkg/obiformats/fastqseq_read.go | 6 +++ pkg/obilua/lua.go | 82 ++++++++++++++++++++++-------- pkg/obiseq/merge.go | 73 ++++++++++++++++++++------- pkg/obiseq/worker.go | 88 +++++++++++++++++++++++++++++++++ 6 files changed, 231 insertions(+), 42 deletions(-) diff --git a/pkg/obialign/is_d0_or_d1.go b/pkg/obialign/is_d0_or_d1.go index 702e012..cdb4917 100644 --- a/pkg/obialign/is_d0_or_d1.go +++ b/pkg/obialign/is_d0_or_d1.go @@ -1,8 +1,15 @@ package obialign -import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "golang.org/x/exp/constraints" +) -func abs(x int) int { +// abs computes the absolute value of a given float or integer. +// +// x: the input value of type k (any float or integer type). +// It returns the absolute value of x, with the same type k as the input. +func abs[k constraints.Float | constraints.Integer](x k) k { if x < 0 { return -x } @@ -10,6 +17,17 @@ func abs(x int) int { return x } +// D1Or0 checks if two sequences are identical or differ by one position. 
+// +// Parameters: +// - seq1: a pointer to the first sequence +// - seq2: a pointer to the second sequence +// +// Returns: +// - int: 0 if the sequences are identical, 1 if they differ by one position, -1 otherwise +// - int: the position where the sequences differ, -1 if they are identical +// - byte: the character in the first sequence at the differing position, '-' if it's a deletion +// - byte: the character in the second sequence at the differing position, '-' if it's a deletion func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) { pos := -1 diff --git a/pkg/obiformats/fastaseq_read.go b/pkg/obiformats/fastaseq_read.go index 95fa162..028cefd 100644 --- a/pkg/obiformats/fastaseq_read.go +++ b/pkg/obiformats/fastaseq_read.go @@ -108,6 +108,8 @@ func _ParseFastaFile(source string, if is_end_of_line { definition = defBytes.String() state = 5 + } else { + defBytes.WriteByte(C) } case 5: if !is_end_of_line { diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 1c6c22c..1bb2f24 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -229,6 +229,8 @@ func _ParseFastqFile(source string, // End of identifier identifier = idBytes.String() state = 3 + } else { + idBytes.WriteByte(C) } if is_end_of_line { // Definition empty @@ -250,6 +252,8 @@ func _ParseFastqFile(source string, if is_end_of_line { definition = defBytes.String() state = 5 + } else { + defBytes.WriteByte(C) } case 5: // Beginning of sequence if !is_end_of_line { @@ -308,6 +312,8 @@ func _ParseFastqFile(source string, } sequences[len(sequences)-1].SetQualities(q) state = 11 + } else { + qualBytes.WriteByte(C) } case 11: if is_end_of_line { diff --git a/pkg/obilua/lua.go b/pkg/obilua/lua.go index 354f9a0..9cdf82c 100644 --- a/pkg/obilua/lua.go +++ b/pkg/obilua/lua.go @@ -13,6 +13,10 @@ import ( "github.com/yuin/gopher-lua/parse" ) +// NewInterpreter creates a new Lua interpreter and registers the Obilib and ObiContext modules. 
+// +// No parameters. +// Returns a pointer to a Lua state. func NewInterpreter() *lua.LState { lua := lua.NewState() @@ -22,6 +26,10 @@ func NewInterpreter() *lua.LState { return lua } +// Compile compiles a Lua program into a Lua function proto. +// +// It takes a byte slice containing the Lua program and a string representing the name of the program. +// It returns a pointer to a Lua function proto and an error if any. func Compile(program []byte, name string) (*lua.FunctionProto, error) { reader := bytes.NewReader(program) @@ -37,6 +45,12 @@ func Compile(program []byte, name string) (*lua.FunctionProto, error) { return proto, nil } +// CompileScript compiles a Lua script from a file. +// +// It takes a file path as input and returns a pointer to a Lua function proto and an error if any. +// The function reads the contents of the file specified by the file path and compiles it into a Lua function proto using the Compile function. +// If there is an error reading the file, the function returns nil and the error. +// Otherwise, it returns the compiled Lua function proto and nil error. func CompileScript(filePath string) (*lua.FunctionProto, error) { program, err := os.ReadFile(filePath) @@ -47,6 +61,25 @@ func CompileScript(filePath string) (*lua.FunctionProto, error) { return Compile(program, filePath) } +// LuaWorker creates a Go function that executes a Lua script and returns a SeqWorker. +// +// The function takes a Lua function prototype as input and creates a new interpreter. +// It then creates a new Lua function from the prototype and pushes it onto the interpreter's stack. +// The interpreter calls the Lua function and checks for any errors. +// It retrieves the global variable "worker" from the interpreter and checks if it is a Lua function. +// If it is a Lua function, it defines a Go function that takes a BioSequence as input. +// Inside the Go function, it calls the Lua function with the BioSequence as an argument. 
+// It retrieves the result from the interpreter and checks its type. +// If the result is a BioSequence or a BioSequenceSlice, it returns it along with any error. +// If the result is not of the expected type, it returns an error. +// If the global variable "worker" is not a Lua function, it logs a fatal error. +// The Go function is returned as a SeqWorker. +// +// Parameters: +// - proto: The Lua function prototype. +// +// Return type: +// - obiseq.SeqWorker: The Go function that executes the Lua script and returns a SeqWorker. func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker { interpreter := NewInterpreter() lfunc := interpreter.NewFunctionFromProto(proto) @@ -94,30 +127,20 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker { return f } - log.Fatalf("THe worker object is not a function") + log.Fatalf("The worker object is not a function") return nil - // f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { - // interpreter.SetGlobal("sequence", obiseq2Lua(interpreter, sequence)) - // interpreter.Push(lfunc) - // err := interpreter.PCall(0, lua.MultRet, nil) - // result := interpreter.GetGlobal("result") - - // if result != lua.LNil { - // log.Info("youpi ", result) - // } - - // rep := interpreter.GetGlobal("sequence") - - // if rep.Type() == lua.LTUserData { - // ud := rep.(*lua.LUserData) - // sequence = ud.Value.(*obiseq.BioSequence) - // } - - // return obiseq.BioSequenceSlice{sequence}, err - // } - } +// LuaProcessor processes a Lua script on a sequence iterator and returns a new iterator. +// +// Parameters: +// - iterator: The IBioSequence iterator to process. +// - name: The name of the Lua script. +// - program: The Lua script program as a string. +// - breakOnError: A boolean indicating whether to stop processing if an error occurs. +// - nworkers: An integer representing the number of workers to use for processing. 
+// Returns: +// - obiiter.IBioSequence: The new IBioSequence iterator after processing the Lua script. func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnError bool, nworkers int) obiiter.IBioSequence { newIter := obiiter.MakeIBioSequence() @@ -223,6 +246,15 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr } +// LuaPipe creates a pipeable function that applies a Lua script to an input sequence. +// +// Parameters: +// - name: The name of the Lua script. +// - program: The Lua script program as a string. +// - breakOnError: A boolean indicating whether to stop processing if an error occurs. +// - nworkers: An integer representing the number of workers to use for processing. +// Returns: +// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence. func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipeable { f := func(input obiiter.IBioSequence) obiiter.IBioSequence { @@ -232,6 +264,14 @@ func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipe return f } +// LuaScriptPipe creates a pipeable function that applies a Lua script to an input sequence. +// +// Parameters: +// - filename: The name of the Lua script file. +// - breakOnError: A boolean indicating whether to stop processing if an error occurs. +// - nworkers: An integer representing the number of workers to use for processing. +// Returns: +// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence. func LuaScriptPipe(filename string, breakOnError bool, nworkers int) obiiter.Pipeable { program, err := os.ReadFile(filename) diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index e62e6e2..e32f6c0 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -11,14 +11,24 @@ import ( type StatsOnValues map[string]int +// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute. 
+// +// Parameters: +// - key: the attribute key (string) +// +// Return type: +// - string func StatsOnSlotName(key string) string { return "merged_" + key } -/* - Tests if the sequence has already a slot summarizing statistics - of occurrence for a given attribute. -*/ +// HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute. +// +// Parameters: +// - key: the attribute key (string) +// +// Return type: +// - bool func (sequence *BioSequence) HasStatsOn(key string) bool { if !sequence.HasAnnotation() { return false @@ -31,7 +41,14 @@ func (sequence *BioSequence) HasStatsOn(key string) bool { return ok } -// A function that takes a BioSequence and a key and returns a StatsOnValues. +// StatsOn returns the slot summarizing statistics of occurrence for a given attribute. +// +// Parameters: +// - key: the attribute key (string) to be summarized +// - na: the value to be used if the attribute is not present +// +// Return type: +// - StatsOnValues func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { mkey := StatsOnSlotName(key) annotations := sequence.Annotations() @@ -77,7 +94,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { return stats } -// Adding the count of the sequence to the count of the key in the stats. +// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats. 
+// +// Parameters: +// - key: the attribute key (string) to be summarized +// - toAdd: the BioSequence to add to the stats +// - na: the value to be used if the attribute is not present +// Return type: +// - bool func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool { sval := na annotations := sequence.Annotations() @@ -109,10 +133,14 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str old = 0 } stats[sval] = old + toAdd.Count() - annotations[StatsOnSlotName(key)] = stats + annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary return retval } +// Merge merges the given StatsOnValues with the current StatsOnValues. +// +// It takes a parameter `toMerged` of type StatsOnValues, which represents the StatsOnValues to be merged. +// It returns a value of type StatsOnValues, which represents the merged StatsOnValues. func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { for k, val := range toMerged { old, ok := stats[k] @@ -125,7 +153,16 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { return stats } -// Merging two sequences. +// Merge merges two sequences into a single sequence. +// +// Parameters: +// - tomerge: the sequence to be merged (BioSequence) +// - na: the value to be used if the attribute is not present (string) +// - inplace: a boolean indicating whether to merge in place or not (bool) +// - statsOn: a variadic string parameter representing the attributes to be summarized (string) +// +// Return type: +// - *BioSequence: the merged sequence (BioSequence) func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence { if !inplace { sequence = sequence.Copy() @@ -184,17 +221,15 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool return sequence } -/* -* - - Merges a set of sequence into a single sequence. 
- - The function assumes that every sequence in the batch is - identical in term of sequence. Actually the function only - aggregates the annotations of the different sequences to be merged - - Quality information is lost during the merge procedure. -*/ +// Merge merges the given sequences into a single sequence. It assumes that every sequence in the batch is identical in terms of sequence and only aggregates the annotations of the sequences to be merged; quality information is lost during the merge procedure. +// +// Parameters: +// - sequences: a slice of BioSequence objects to be merged (BioSequenceSlice) +// - na: the value to be used if the attribute is not present (string) +// - statsOn: a slice of strings representing the attributes to be summarized ([]string) +// +// Return type: +// - *BioSequence: the merged sequence (BioSequence) func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence { seq := sequences[0] //sequences[0] = nil diff --git a/pkg/obiseq/worker.go b/pkg/obiseq/worker.go index 442650d..fcafd22 100644 --- a/pkg/obiseq/worker.go +++ b/pkg/obiseq/worker.go @@ -12,10 +12,46 @@ type SeqAnnotator func(*BioSequence) type SeqWorker func(*BioSequence) (BioSequenceSlice, error) type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error) +// NilSeqWorker returns a BioSequenceSlice containing the input sequence and a nil error value. +// This function is typically used as a placeholder or default worker in SeqToSliceWorker when no specific worker is needed. +// +// Parameters: +// +// seq *BioSequence: A pointer to a BioSequence struct that needs processing. +// +// Returns: +// +// BioSequenceSlice, error: This function returns a slice containing only the input sequence, unchanged, and an error value that is always nil. func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) { return BioSequenceSlice{seq}, nil } +// AnnotatorToSeqWorker is a higher-order function that takes a SeqAnnotator +// function and returns a SeqWorker function. 
It is used to wrap a sequence +// annotation function and convert it into a worker function that can be used +// in a pipeline or workflow for processing biological sequences. +// +// Parameters: +// +// function SeqAnnotator: A function that takes a pointer to a BioSequence +// struct and performs some annotation or processing on the sequence data. +// The SeqAnnotator type is expected to be a function with the following +// signature: +// func(seq *BioSequence) +// This function should modify the input BioSequence struct in-place by adding +// annotations, metadata, or performing any other desired operations. +// +// Returns: +// +// SeqWorker: A function that takes a pointer to a BioSequence struct and +// returns a BioSequenceSlice containing the input BioSequence, along with +// an error value. The SeqWorker type is expected to be a function with the +// following signature: +// func(seq *BioSequence) (BioSequenceSlice, error) +// The returned SeqWorker function wraps the provided SeqAnnotator function +// and applies it to the input BioSequence before returning the modified +// BioSequence in a BioSequenceSlice. The error value is always nil, as the +// function does not perform any operations that could potentially fail. func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { f := func(seq *BioSequence) (BioSequenceSlice, error) { function(seq) @@ -24,6 +60,35 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { return f } +// SeqToSliceWorker is a higher-order function that takes a SeqWorker and a +// boolean value indicating whether to break on error and returns a SeqSliceWorker. +// It can be used in a pipeline or workflow for processing biological sequences, +// applying the provided worker to each element of a BioSequenceSlice and returning +// a new slice. +// +// Parameters: +// +// worker SeqWorker: A function that takes a pointer to a BioSequence struct and +// performs some processing on it. 
+// The signature for this function is func(seq *BioSequence) (BioSequenceSlice, error). +// This function should return a modified BioSequence in a BioSequenceSlice along with +// an error value indicating whether the operation was successful or not. +// breakOnError bool: A boolean flag that determines whether to stop processing further +// elements in case of an error. If set to true and an error is encountered while +// processing any element, it stops processing remaining elements and returns the processed +// slice so far along with the encountered error. If set to false, it logs the error and +// continues processing remaining elements. +// +// Returns: +// +// SeqSliceWorker: A function that takes a BioSequenceSlice (a slice of pointers to +// BioSequence structs) as input and returns a processed BioSequenceSlice along with +// an error value indicating whether the operation was successful or not. +// The signature for this function is func(input BioSequenceSlice) (BioSequenceSlice, error). +// If breakOnError is set to true and any element processing results in an error, it stops +// further processing and returns the processed slice so far along with the encountered error. +// Otherwise, it processes all elements and returns them as a single BioSequenceSlice along with +// a nil error value. func SeqToSliceWorker(worker SeqWorker, breakOnError bool) SeqSliceWorker { var f SeqSliceWorker @@ -68,6 +133,18 @@ func SeqToSliceWorker(worker SeqWorker, return f } +// SeqToSliceConditionalWorker creates a new SeqSliceWorker that processes each sequence in a slice based on a condition. It takes a SequencePredicate and a worker function as arguments. The worker function is only applied to sequences that satisfy the condition. +// If `condition` is nil, this function just behaves like SeqToSliceWorker with the provided `worker`. +// If `breakOnError` is true, the pipeline will stop and return an error if any sequence processing fails. 
Otherwise, it will log a warning message for each failed sequence. +// +// Parameters: +// - condition SequencePredicate: A predicate function that determines which sequences should be processed by the worker. +// - worker SeqWorker: The worker to be applied to the sequences that satisfy the condition. +// - breakOnError bool: If true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence. +// +// Returns: +// +// SeqSliceWorker: A new SeqSliceWorker that processes sequences based on a condition. This function returns a single SeqSliceWorker that can be used to process BioSequences in a workflow or pipeline. func SeqToSliceConditionalWorker( condition SequencePredicate, worker SeqWorker, breakOnError bool) SeqSliceWorker { @@ -112,6 +189,17 @@ func SeqToSliceConditionalWorker( return f } +// ChainWorkers chains two workers together and returns a new SeqWorker. It takes an existing worker (`worker`) and a next worker as arguments, combines them into a pipeline and applies it to each BioSequence in the sequence slice. +// If `next` is nil, this function just returns the input worker. +// If `worker` is nil, this function just returns the next worker. +// +// Parameters: +// - worker SeqWorker: The initial worker to be chained. This worker will be executed first on each sequence. +// - next SeqWorker: The next worker in the pipeline. This worker will be applied to the output of `worker` for each sequence. +// +// Returns: +// +// SeqWorker: A new SeqWorker that chains the input workers together into a pipeline. This function returns a single SeqWorker that can be used to process BioSequences in a workflow or pipeline. func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker { if worker == nil { return next