Fix an obiformat bug leading to partial parsing and add godocs

Former-commit-id: b27105355f1a330eedf6eaa72c8ac94f06806c28
This commit is contained in:
Eric Coissac
2024-05-07 10:54:12 +02:00
parent 5b98393a68
commit 9e63013bc2
6 changed files with 231 additions and 42 deletions

View File

@ -1,8 +1,15 @@
package obialign package obialign
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"golang.org/x/exp/constraints"
)
func abs(x int) int { // abs computes the absolute value of a given float or integer.
//
// x: the input value of type k (float or integer).
// Returns the absolute value of x, with the same type k as the input.
func abs[k constraints.Float | constraints.Integer](x k) k {
if x < 0 { if x < 0 {
return -x return -x
} }
@ -10,6 +17,17 @@ func abs(x int) int {
return x return x
} }
// D1Or0 checks if two sequences are identical or differ by one position.
//
// Parameters:
// - seq1: a pointer to the first sequence
// - seq2: a pointer to the second sequence
//
// Returns:
// - int: 0 if the sequences are identical, 1 if they differ by one position, -1 otherwise
// - int: the position where the sequences differ, -1 if they are identical
// - byte: the character in the first sequence at the differing position, '-' if it's a deletion
// - byte: the character in the second sequence at the differing position, '-' if it's a deletion
func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) { func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) {
pos := -1 pos := -1

View File

@ -108,6 +108,8 @@ func _ParseFastaFile(source string,
if is_end_of_line { if is_end_of_line {
definition = defBytes.String() definition = defBytes.String()
state = 5 state = 5
} else {
defBytes.WriteByte(C)
} }
case 5: case 5:
if !is_end_of_line { if !is_end_of_line {

View File

@ -229,6 +229,8 @@ func _ParseFastqFile(source string,
// End of identifier // End of identifier
identifier = idBytes.String() identifier = idBytes.String()
state = 3 state = 3
} else {
idBytes.WriteByte(C)
} }
if is_end_of_line { if is_end_of_line {
// Definition empty // Definition empty
@ -250,6 +252,8 @@ func _ParseFastqFile(source string,
if is_end_of_line { if is_end_of_line {
definition = defBytes.String() definition = defBytes.String()
state = 5 state = 5
} else {
defBytes.WriteByte(C)
} }
case 5: // Beginning of sequence case 5: // Beginning of sequence
if !is_end_of_line { if !is_end_of_line {
@ -308,6 +312,8 @@ func _ParseFastqFile(source string,
} }
sequences[len(sequences)-1].SetQualities(q) sequences[len(sequences)-1].SetQualities(q)
state = 11 state = 11
} else {
qualBytes.WriteByte(C)
} }
case 11: case 11:
if is_end_of_line { if is_end_of_line {

View File

@ -13,6 +13,10 @@ import (
"github.com/yuin/gopher-lua/parse" "github.com/yuin/gopher-lua/parse"
) )
// NewInterpreter creates a new Lua interpreter and registers the Obilib and ObiContext modules.
//
// No parameters.
// Returns a pointer to a Lua state.
func NewInterpreter() *lua.LState { func NewInterpreter() *lua.LState {
lua := lua.NewState() lua := lua.NewState()
@ -22,6 +26,10 @@ func NewInterpreter() *lua.LState {
return lua return lua
} }
// Compile compiles a Lua program into a Lua function proto.
//
// It takes a byte slice containing the Lua program and a string representing the name of the program.
// It returns a pointer to a Lua function proto and an error if any.
func Compile(program []byte, name string) (*lua.FunctionProto, error) { func Compile(program []byte, name string) (*lua.FunctionProto, error) {
reader := bytes.NewReader(program) reader := bytes.NewReader(program)
@ -37,6 +45,12 @@ func Compile(program []byte, name string) (*lua.FunctionProto, error) {
return proto, nil return proto, nil
} }
// CompileScript compiles a Lua script from a file.
//
// It takes a file path as input and returns a pointer to a Lua function proto and an error if any.
// The function reads the contents of the file specified by the file path and compiles it into a Lua function proto using the Compile function.
// If there is an error reading the file, the function returns nil and the error.
// Otherwise, it returns the compiled Lua function proto and nil error.
func CompileScript(filePath string) (*lua.FunctionProto, error) { func CompileScript(filePath string) (*lua.FunctionProto, error) {
program, err := os.ReadFile(filePath) program, err := os.ReadFile(filePath)
@ -47,6 +61,25 @@ func CompileScript(filePath string) (*lua.FunctionProto, error) {
return Compile(program, filePath) return Compile(program, filePath)
} }
// LuaWorker wraps a compiled Lua script into a Go function and returns it as a SeqWorker.
//
// The function takes a Lua function prototype as input and creates a new interpreter.
// It then creates a new Lua function from the prototype and pushes it onto the interpreter's stack.
// The interpreter calls the Lua function and checks for any errors.
// It retrieves the global variable "worker" from the interpreter and checks if it is a Lua function.
// If it is a Lua function, it defines a Go function that takes a BioSequence as input.
// Inside the Go function, it calls the Lua function with the BioSequence as an argument.
// It retrieves the result from the interpreter and checks its type.
// If the result is a BioSequence or a BioSequenceSlice, it returns it along with any error.
// If the result is not of the expected type, it returns an error.
// If the global variable "worker" is not a Lua function, it logs a fatal error.
// The Go function is returned as a SeqWorker.
//
// Parameters:
// - proto: The Lua function prototype.
//
// Return type:
// - obiseq.SeqWorker: The Go function that calls the Lua "worker" function on a BioSequence.
func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker { func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
interpreter := NewInterpreter() interpreter := NewInterpreter()
lfunc := interpreter.NewFunctionFromProto(proto) lfunc := interpreter.NewFunctionFromProto(proto)
@ -94,30 +127,20 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
return f return f
} }
log.Fatalf("THe worker object is not a function") log.Fatalf("The worker object is not a function")
return nil return nil
// f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
// interpreter.SetGlobal("sequence", obiseq2Lua(interpreter, sequence))
// interpreter.Push(lfunc)
// err := interpreter.PCall(0, lua.MultRet, nil)
// result := interpreter.GetGlobal("result")
// if result != lua.LNil {
// log.Info("youpi ", result)
// }
// rep := interpreter.GetGlobal("sequence")
// if rep.Type() == lua.LTUserData {
// ud := rep.(*lua.LUserData)
// sequence = ud.Value.(*obiseq.BioSequence)
// }
// return obiseq.BioSequenceSlice{sequence}, err
// }
} }
// LuaProcessor processes a Lua script on a sequence iterator and returns a new iterator.
//
// Parameters:
// - iterator: The IBioSequence iterator to process.
// - name: The name of the Lua script.
// - program: The Lua script program as a string.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.IBioSequence: The new IBioSequence iterator after processing the Lua script.
func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnError bool, nworkers int) obiiter.IBioSequence { func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnError bool, nworkers int) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence() newIter := obiiter.MakeIBioSequence()
@ -223,6 +246,15 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
} }
// LuaPipe creates a pipeable function that applies a Lua script to an input sequence.
//
// Parameters:
// - name: The name of the Lua script.
// - program: The Lua script program as a string.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence.
func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipeable { func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipeable {
f := func(input obiiter.IBioSequence) obiiter.IBioSequence { f := func(input obiiter.IBioSequence) obiiter.IBioSequence {
@ -232,6 +264,14 @@ func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipe
return f return f
} }
// LuaScriptPipe creates a pipeable function that applies a Lua script, read from a file, to an input sequence.
//
// Parameters:
// - filename: The name of the Lua script file.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence.
func LuaScriptPipe(filename string, breakOnError bool, nworkers int) obiiter.Pipeable { func LuaScriptPipe(filename string, breakOnError bool, nworkers int) obiiter.Pipeable {
program, err := os.ReadFile(filename) program, err := os.ReadFile(filename)

View File

@ -11,14 +11,24 @@ import (
type StatsOnValues map[string]int type StatsOnValues map[string]int
// StatsOnSlotName returns the name of the slot that summarizes statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string)
//
// Return type:
// - string
func StatsOnSlotName(key string) string { func StatsOnSlotName(key string) string {
return "merged_" + key return "merged_" + key
} }
/* // HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute.
Tests if the sequence has already a slot summarizing statistics //
of occurrence for a given attribute. // Parameters:
*/ // - key: the attribute key (string)
//
// Return type:
// - bool
func (sequence *BioSequence) HasStatsOn(key string) bool { func (sequence *BioSequence) HasStatsOn(key string) bool {
if !sequence.HasAnnotation() { if !sequence.HasAnnotation() {
return false return false
@ -31,7 +41,14 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
return ok return ok
} }
// A function that takes a BioSequence and a key and returns a StatsOnValues. // StatsOn returns the slot summarizing statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - na: the value to be used if the attribute is not present
//
// Return type:
// - StatsOnValues
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
mkey := StatsOnSlotName(key) mkey := StatsOnSlotName(key)
annotations := sequence.Annotations() annotations := sequence.Annotations()
@ -77,7 +94,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
return stats return stats
} }
// Adding the count of the sequence to the count of the key in the stats. // StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - toAdd: the BioSequence to add to the stats
// - na: the value to be used if the attribute is not present
// Return type:
// - bool
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool { func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
sval := na sval := na
annotations := sequence.Annotations() annotations := sequence.Annotations()
@ -109,10 +133,14 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
old = 0 old = 0
} }
stats[sval] = old + toAdd.Count() stats[sval] = old + toAdd.Count()
annotations[StatsOnSlotName(key)] = stats annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
return retval return retval
} }
// Merge merges the given StatsOnValues with the current StatsOnValues.
//
// It takes a parameter `toMerged` of type StatsOnValues, which represents the StatsOnValues to be merged.
// It returns a value of type StatsOnValues, which represents the merged StatsOnValues.
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues { func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
for k, val := range toMerged { for k, val := range toMerged {
old, ok := stats[k] old, ok := stats[k]
@ -125,7 +153,16 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
return stats return stats
} }
// Merging two sequences. // Merge merges two sequences into a single sequence.
//
// Parameters:
// - tomerge: the sequence to be merged (BioSequence)
// - na: the value to be used if the attribute is not present (string)
// - inplace: a boolean indicating whether to merge in place or not (bool)
// - statsOn: a variadic string parameter representing the attributes to be summarized (string)
//
// Return type:
// - *BioSequence: the merged sequence (BioSequence)
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence { func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
if !inplace { if !inplace {
sequence = sequence.Copy() sequence = sequence.Copy()
@ -184,17 +221,15 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
return sequence return sequence
} }
/* // Merge merges the given sequences into a single sequence.
* //
// Parameters:
Merges a set of sequence into a single sequence. // - sequences: a slice of BioSequence objects to be merged (BioSequenceSlice)
// - na: the value to be used if the attribute is not present (string)
The function assumes that every sequence in the batch is // - statsOn: a slice of strings representing the attributes to be summarized ([]string)
identical in term of sequence. Actually the function only //
aggregates the annotations of the different sequences to be merged // Return type:
// - *BioSequence: the merged sequence (BioSequence)
Quality information is lost during the merge procedure.
*/
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence { func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
seq := sequences[0] seq := sequences[0]
//sequences[0] = nil //sequences[0] = nil

View File

@ -12,10 +12,46 @@ type SeqAnnotator func(*BioSequence)
type SeqWorker func(*BioSequence) (BioSequenceSlice, error) type SeqWorker func(*BioSequence) (BioSequenceSlice, error)
type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error) type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error)
// NilSeqWorker returns a BioSequenceSlice containing the input sequence and a nil error value.
// This function is typically used as a placeholder or default worker in SeqToSliceWorker when no specific worker is needed.
//
// Parameters:
//
// seq *BioSequence: A pointer to a BioSequence struct that needs processing.
//
// Returns:
//
// BioSequenceSlice, error: A slice containing only the input sequence, and an error value that is always nil.
func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) { func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) {
return BioSequenceSlice{seq}, nil return BioSequenceSlice{seq}, nil
} }
// AnnotatorToSeqWorker is a higher-order function that takes a SeqAnnotator
// function and returns a SeqWorker function. It is used to wrap a sequence
// annotation function and convert it into a worker function that can be used
// in a pipeline or workflow for processing biological sequences.
//
// Parameters:
//
// function SeqAnnotator: A function that takes a pointer to a BioSequence
// struct and performs some annotation or processing on the sequence data.
// The SeqAnnotator type is expected to be a function with the following
// signature:
// func(seq *BioSequence)
// This function should modify the input BioSequence struct in-place by adding
// annotations, metadata, or performing any other desired operations.
//
// Returns:
//
// SeqWorker: A function that takes a pointer to a BioSequence struct and
// returns a BioSequenceSlice containing the input BioSequence, along with
// an error value. The SeqWorker type is expected to be a function with the
// following signature:
// func(seq *BioSequence) (BioSequenceSlice, error)
// The returned SeqWorker function wraps the provided SeqAnnotator function
// and applies it to the input BioSequence before returning the modified
// BioSequence in a BioSequenceSlice. The error value is always nil, as the
// function does not perform any operations that could potentially fail.
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker { func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq *BioSequence) (BioSequenceSlice, error) { f := func(seq *BioSequence) (BioSequenceSlice, error) {
function(seq) function(seq)
@ -24,6 +60,35 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
return f return f
} }
// SeqToSliceWorker is a higher-order function that takes a SeqWorker and a
// boolean value indicating whether to break on error and returns a SeqSliceWorker.
// It can be used in a pipeline or workflow for processing biological sequences,
// applying the provided worker to each element of a BioSequenceSlice and returning
// a new slice.
//
// Parameters:
//
// worker SeqWorker: A function that takes a pointer to a BioSequence struct and
// performs some processing on it.
// The signature for this function is func(seq *BioSequence) (BioSequenceSlice, error).
// This function should return a modified BioSequence in a BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// breakOnError bool: A boolean flag that determines whether to stop processing further
// elements in case of an error. If set to true and an error is encountered while
// processing any element, it stops processing remaining elements and returns the processed
// slice so far along with the encountered error. If set to false, it logs the error and
// continues processing remaining elements.
//
// Returns:
//
// SeqSliceWorker: A function that takes a BioSequenceSlice (a slice of pointers to
// BioSequence structs) as input and returns a processed BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// The signature for this function is func(input BioSequenceSlice) (BioSequenceSlice, error).
// If breakOnError is set to true and any element processing results in an error, it stops
// further processing and returns the processed slice so far along with the encountered error.
// Otherwise, it processes all elements and returns them as a single BioSequenceSlice along with
// a nil error value.
func SeqToSliceWorker(worker SeqWorker, func SeqToSliceWorker(worker SeqWorker,
breakOnError bool) SeqSliceWorker { breakOnError bool) SeqSliceWorker {
var f SeqSliceWorker var f SeqSliceWorker
@ -68,6 +133,18 @@ func SeqToSliceWorker(worker SeqWorker,
return f return f
} }
// SeqToSliceConditionalWorker creates a new SeqSliceWorker that processes each sequence in a slice based on a condition. It takes a SequencePredicate and a worker function as arguments. The worker function is only applied to sequences that satisfy the condition.
// If `condition` is nil, this function just behaves like SeqToSliceWorker with the provided `worker`.
// If `breakOnError` is true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Parameters:
// - condition SequencePredicate: A predicate function that determines which sequences should be processed by the worker.
// - worker SeqWorker: The worker to be applied to the sequences that satisfy the condition.
// - breakOnError bool: If true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Returns:
//
// SeqSliceWorker: A new SeqSliceWorker that processes sequences based on a condition. This function returns a single SeqSliceWorker that can be used to process BioSequences in a workflow or pipeline.
func SeqToSliceConditionalWorker( func SeqToSliceConditionalWorker(
condition SequencePredicate, condition SequencePredicate,
worker SeqWorker, breakOnError bool) SeqSliceWorker { worker SeqWorker, breakOnError bool) SeqSliceWorker {
@ -112,6 +189,17 @@ func SeqToSliceConditionalWorker(
return f return f
} }
// ChainWorkers chains two workers together and returns a new SeqWorker. It takes an existing worker (`worker`) and a next worker as arguments, combines them into a pipeline and applies it to each BioSequence in the sequence slice.
// If `next` is nil, this function just returns the input worker.
// If `worker` is nil, this function just returns the next worker.
//
// Parameters:
// - worker SeqWorker: The initial worker to be chained. This worker will be executed first on each sequence.
// - next SeqWorker: The next worker in the pipeline. This worker will be applied to the output of `worker` for each sequence.
//
// Returns:
//
// SeqWorker: A new SeqWorker that chains the input workers together into a pipeline. This function returns a single SeqWorker that can be used to process BioSequences in a workflow or pipeline.
func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker { func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
if worker == nil { if worker == nil {
return next return next