Correction on obiformat of bug leading to partial parsing and add godocs

Former-commit-id: b27105355f1a330eedf6eaa72c8ac94f06806c28
This commit is contained in:
Eric Coissac
2024-05-07 10:54:12 +02:00
parent 5b98393a68
commit 9e63013bc2
6 changed files with 231 additions and 42 deletions

View File

@ -1,8 +1,15 @@
package obialign
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"golang.org/x/exp/constraints"
)
func abs(x int) int {
// abs computes the absolute value of a given float or integer.
//
// x: the input value of type k (float or integer).
// The result has the same type k as x and holds the absolute value of x.
func abs[k constraints.Float | constraints.Integer](x k) k {
if x < 0 {
return -x
}
@ -10,6 +17,17 @@ func abs(x int) int {
return x
}
// D1Or0 checks if two sequences are identical or differ by one position.
//
// Parameters:
// - seq1: a pointer to the first sequence
// - seq2: a pointer to the second sequence
//
// Returns:
// - int: 0 if the sequences are identical, 1 if they differ by one position (TODO: confirm the value against the function body), -1 otherwise
// - int: the position where the sequences differ, -1 if they are identical
// - byte: the character in the first sequence at the differing position, '-' if it's a deletion
// - byte: the character in the second sequence at the differing position, '-' if it's a deletion
func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) {
pos := -1

View File

@ -108,6 +108,8 @@ func _ParseFastaFile(source string,
if is_end_of_line {
definition = defBytes.String()
state = 5
} else {
defBytes.WriteByte(C)
}
case 5:
if !is_end_of_line {

View File

@ -229,6 +229,8 @@ func _ParseFastqFile(source string,
// End of identifier
identifier = idBytes.String()
state = 3
} else {
idBytes.WriteByte(C)
}
if is_end_of_line {
// Definition empty
@ -250,6 +252,8 @@ func _ParseFastqFile(source string,
if is_end_of_line {
definition = defBytes.String()
state = 5
} else {
defBytes.WriteByte(C)
}
case 5: // Beginning of sequence
if !is_end_of_line {
@ -308,6 +312,8 @@ func _ParseFastqFile(source string,
}
sequences[len(sequences)-1].SetQualities(q)
state = 11
} else {
qualBytes.WriteByte(C)
}
case 11:
if is_end_of_line {

View File

@ -13,6 +13,10 @@ import (
"github.com/yuin/gopher-lua/parse"
)
// NewInterpreter creates a new Lua interpreter and registers the Obilib and ObiContext modules.
//
// No parameters.
// Returns a pointer to a Lua state.
func NewInterpreter() *lua.LState {
lua := lua.NewState()
@ -22,6 +26,10 @@ func NewInterpreter() *lua.LState {
return lua
}
// Compile compiles a Lua program into a Lua function proto.
//
// It takes a byte slice containing the Lua program and a string representing the name of the program.
// It returns a pointer to a Lua function proto and an error if any.
func Compile(program []byte, name string) (*lua.FunctionProto, error) {
reader := bytes.NewReader(program)
@ -37,6 +45,12 @@ func Compile(program []byte, name string) (*lua.FunctionProto, error) {
return proto, nil
}
// CompileScript compiles a Lua script from a file.
//
// It takes a file path as input and returns a pointer to a Lua function proto and an error if any.
// The function reads the contents of the file specified by the file path and compiles it into a Lua function proto using the Compile function.
// If there is an error reading the file, the function returns nil and the error.
// Otherwise, it returns the compiled Lua function proto and nil error.
func CompileScript(filePath string) (*lua.FunctionProto, error) {
program, err := os.ReadFile(filePath)
@ -47,6 +61,25 @@ func CompileScript(filePath string) (*lua.FunctionProto, error) {
return Compile(program, filePath)
}
// LuaWorker creates a Go function that executes a Lua script and returns a SeqWorker.
//
// The function takes a Lua function prototype as input and creates a new interpreter.
// It then creates a new Lua function from the prototype and pushes it onto the interpreter's stack.
// The interpreter calls the Lua function and checks for any errors.
// It retrieves the global variable "worker" from the interpreter and checks if it is a Lua function.
// If it is a Lua function, it defines a Go function that takes a BioSequence as input.
// Inside the Go function, it calls the Lua function with the BioSequence as an argument.
// It retrieves the result from the interpreter and checks its type.
// If the result is a BioSequence or a BioSequenceSlice, it returns it along with any error.
// If the result is not of the expected type, it returns an error.
// If the global variable "worker" is not a Lua function, it logs a fatal error.
// The Go function is returned as a SeqWorker.
//
// Parameters:
// - proto: The Lua function prototype.
//
// Return type:
// - obiseq.SeqWorker: a Go function that runs the Lua `worker` function on a BioSequence and returns the resulting sequences.
func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
interpreter := NewInterpreter()
lfunc := interpreter.NewFunctionFromProto(proto)
@ -94,30 +127,20 @@ func LuaWorker(proto *lua.FunctionProto) obiseq.SeqWorker {
return f
}
log.Fatalf("THe worker object is not a function")
log.Fatalf("The worker object is not a function")
return nil
// f := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
// interpreter.SetGlobal("sequence", obiseq2Lua(interpreter, sequence))
// interpreter.Push(lfunc)
// err := interpreter.PCall(0, lua.MultRet, nil)
// result := interpreter.GetGlobal("result")
// if result != lua.LNil {
// log.Info("youpi ", result)
// }
// rep := interpreter.GetGlobal("sequence")
// if rep.Type() == lua.LTUserData {
// ud := rep.(*lua.LUserData)
// sequence = ud.Value.(*obiseq.BioSequence)
// }
// return obiseq.BioSequenceSlice{sequence}, err
// }
}
// LuaProcessor processes a Lua script on a sequence iterator and returns a new iterator.
//
// Parameters:
// - iterator: The IBioSequence iterator to process.
// - name: The name of the Lua script.
// - program: The Lua script program as a string.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.IBioSequence: The new IBioSequence iterator after processing the Lua script.
func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnError bool, nworkers int) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence()
@ -223,6 +246,15 @@ func LuaProcessor(iterator obiiter.IBioSequence, name, program string, breakOnEr
}
// LuaPipe creates a pipeable function that applies a Lua script to an input sequence.
//
// Parameters:
// - name: The name of the Lua script.
// - program: The Lua script program as a string.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence.
func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipeable {
f := func(input obiiter.IBioSequence) obiiter.IBioSequence {
@ -232,6 +264,14 @@ func LuaPipe(name, program string, breakOnError bool, nworkers int) obiiter.Pipe
return f
}
// LuaScriptPipe creates a pipeable function that applies a Lua script to an input sequence.
//
// Parameters:
// - filename: The name of the Lua script file.
// - breakOnError: A boolean indicating whether to stop processing if an error occurs.
// - nworkers: An integer representing the number of workers to use for processing.
// Returns:
// - obiiter.Pipeable: A pipeable function that applies the Lua script to the input sequence.
func LuaScriptPipe(filename string, breakOnError bool, nworkers int) obiiter.Pipeable {
program, err := os.ReadFile(filename)

View File

@ -11,14 +11,24 @@ import (
type StatsOnValues map[string]int
// StatsOnSlotName builds the name of the annotation slot that holds the
// occurrence statistics of a given attribute.
//
// Parameters:
//   - key: the attribute key (string)
//
// Returns:
//   - string: the slot name, i.e. key prefixed with "merged_"
func StatsOnSlotName(key string) string {
	const slotPrefix = "merged_"

	return slotPrefix + key
}
/*
Tests if the sequence has already a slot summarizing statistics
of occurrence for a given attribute.
*/
// HasStatsOn tests if the sequence has already a slot summarizing statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string)
//
// Return type:
// - bool
func (sequence *BioSequence) HasStatsOn(key string) bool {
if !sequence.HasAnnotation() {
return false
@ -31,7 +41,14 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
return ok
}
// A function that takes a BioSequence and a key and returns a StatsOnValues.
// StatsOn returns the slot summarizing statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - na: the value to be used if the attribute is not present
//
// Return type:
// - StatsOnValues
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
mkey := StatsOnSlotName(key)
annotations := sequence.Annotations()
@ -77,7 +94,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
return stats
}
// Adding the count of the sequence to the count of the key in the stats.
// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - toAdd: the BioSequence to add to the stats
// - na: the value to be used if the attribute is not present
// Return type:
// - bool
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
sval := na
annotations := sequence.Annotations()
@ -109,10 +133,14 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
old = 0
}
stats[sval] = old + toAdd.Count()
annotations[StatsOnSlotName(key)] = stats
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
return retval
}
// Merge merges the given StatsOnValues with the current StatsOnValues.
//
// It takes a parameter `toMerged` of type StatsOnValues, which represents the StatsOnValues to be merged.
// It returns a value of type StatsOnValues, which represents the merged StatsOnValues.
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
for k, val := range toMerged {
old, ok := stats[k]
@ -125,7 +153,16 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
return stats
}
// Merging two sequences.
// Merge merges two sequences into a single sequence.
//
// Parameters:
// - tomerge: the sequence to be merged (BioSequence)
// - na: the value to be used if the attribute is not present (string)
// - inplace: a boolean indicating whether to merge in place or not (bool)
// - statsOn: a variadic string parameter representing the attributes to be summarized (string)
//
// Return type:
// - *BioSequence: the merged sequence (BioSequence)
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
if !inplace {
sequence = sequence.Copy()
@ -184,17 +221,15 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
return sequence
}
/*
*
Merges a set of sequence into a single sequence.
The function assumes that every sequence in the batch is
identical in term of sequence. Actually the function only
aggregates the annotations of the different sequences to be merged
Quality information is lost during the merge procedure.
*/
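// Merge merges the given sequences into a single sequence.
//
// The function assumes that every sequence in the batch is identical in terms
// of sequence; it only aggregates the annotations of the sequences being merged.
// Quality information is lost during the merge procedure.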
//
// Parameters:
// - sequences: a slice of BioSequence objects to be merged (BioSequenceSlice)
// - na: the value to be used if the attribute is not present (string)
// - statsOn: a slice of strings representing the attributes to be summarized ([]string)
//
// Return type:
// - *BioSequence: the merged sequence (BioSequence)
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
seq := sequences[0]
//sequences[0] = nil

View File

@ -12,10 +12,46 @@ type SeqAnnotator func(*BioSequence)
type SeqWorker func(*BioSequence) (BioSequenceSlice, error)
type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error)
// NilSeqWorker has the SeqWorker signature but performs no processing: it
// wraps the sequence it receives in a BioSequenceSlice and hands it back.
//
// Parameters:
//
//	seq *BioSequence: the sequence to pass through; it is not modified.
//
// Returns:
//
//	BioSequenceSlice, error: a one-element slice containing seq, and an
//	error that is always nil.
func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) {
	passThrough := BioSequenceSlice{seq}

	return passThrough, nil
}
// AnnotatorToSeqWorker is a higher-order function that takes a SeqAnnotator
// function and returns a SeqWorker function. It is used to wrap a sequence
// annotation function and convert it into a worker function that can be used
// in a pipeline or workflow for processing biological sequences.
//
// Parameters:
//
// function SeqAnnotator: A function that takes a pointer to a BioSequence
// struct and performs some annotation or processing on the sequence data.
// The SeqAnnotator type is expected to be a function with the following
// signature:
// func(seq *BioSequence)
// This function should modify the input BioSequence struct in-place by adding
// annotations, metadata, or performing any other desired operations.
//
// Returns:
//
// SeqWorker: A function that takes a pointer to a BioSequence struct and
// returns a BioSequenceSlice containing the input BioSequence, along with
// an error value. The SeqWorker type is expected to be a function with the
// following signature:
// func(seq *BioSequence) (BioSequenceSlice, error)
// The returned SeqWorker function wraps the provided SeqAnnotator function
// and applies it to the input BioSequence before returning the modified
// BioSequence in a BioSequenceSlice. The error value is always nil, as the
// function does not perform any operations that could potentially fail.
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq *BioSequence) (BioSequenceSlice, error) {
function(seq)
@ -24,6 +60,35 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
return f
}
// SeqToSliceWorker is a higher-order function that takes a SeqWorker and a
// boolean value indicating whether to break on error and returns a SeqSliceWorker.
// It can be used in a pipeline or workflow for processing biological sequences,
// applying the provided worker to each element of a BioSequenceSlice and returning
// a new slice.
//
// Parameters:
//
// worker SeqWorker: A function that takes a pointer to a BioSequence struct and
// performs some processing on it.
// The signature for this function is func(seq *BioSequence) (BioSequenceSlice, error).
// This function should return a modified BioSequence in a BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// breakOnError bool: A boolean flag that determines whether to stop processing further
// elements in case of an error. If set to true and an error is encountered while
// processing any element, it stops processing remaining elements and returns the processed
// slice so far along with the encountered error. If set to false, it logs the error and
// continues processing remaining elements.
//
// Returns:
//
// SeqSliceWorker: A function that takes a BioSequenceSlice (a slice of pointers to
// BioSequence structs) as input and returns a processed BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// The signature for this function is func(input BioSequenceSlice) (BioSequenceSlice, error).
// If breakOnError is set to true and any element processing results in an error, it stops
// further processing and returns the processed slice so far along with the encountered error.
// Otherwise, it processes all elements and returns them as a single BioSequenceSlice along with
// a nil error value.
func SeqToSliceWorker(worker SeqWorker,
breakOnError bool) SeqSliceWorker {
var f SeqSliceWorker
@ -68,6 +133,18 @@ func SeqToSliceWorker(worker SeqWorker,
return f
}
// SeqToSliceConditionalWorker creates a new SeqSliceWorker that processes each sequence in a slice based on a condition. It takes a SequencePredicate and a worker function as arguments. The worker function is only applied to sequences that satisfy the condition.
// If `condition` is nil, this function just behaves like SeqToSliceWorker with the provided `worker`.
// If `breakOnError` is true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Parameters:
// - condition SequencePredicate: A predicate function that determines which sequences should be processed by the worker.
// - worker SeqWorker: The worker to be applied to the sequences that satisfy the condition.
// - breakOnError bool: If true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Returns:
//
// SeqSliceWorker: A new SeqSliceWorker that processes sequences based on a condition. This function returns a single SeqSliceWorker that can be used to process BioSequences in a workflow or pipeline.
func SeqToSliceConditionalWorker(
condition SequencePredicate,
worker SeqWorker, breakOnError bool) SeqSliceWorker {
@ -112,6 +189,17 @@ func SeqToSliceConditionalWorker(
return f
}
// ChainWorkers chains two workers together and returns a new SeqWorker. It takes an existing worker (`worker`) and a next worker as arguments, combines them into a pipeline and applies it to each BioSequence in the sequence slice.
// If `next` is nil, this function just returns the input worker.
// If `worker` is nil, this function just returns the next worker.
//
// Parameters:
// - worker SeqWorker: The initial worker to be chained. This worker will be executed first on each sequence.
// - next SeqWorker: The next worker in the pipeline. This worker will be applied to the output of `worker` for each sequence.
//
// Returns:
//
// SeqWorker: A new SeqWorker that chains the input workers together into a pipeline. This function returns a single SeqWorker that can be used to process BioSequences in a workflow or pipeline.
func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
if worker == nil {
return next