2023-01-22 22:39:13 +01:00
package obiseq
2024-03-02 16:03:46 -04:00
import (
"fmt"
"slices"
log "github.com/sirupsen/logrus"
)
2023-11-22 09:46:30 +01:00
2023-01-22 22:39:13 +01:00
type SeqAnnotator func ( * BioSequence )
2024-03-02 16:03:46 -04:00
type SeqWorker func ( * BioSequence ) ( BioSequenceSlice , error )
type SeqSliceWorker func ( BioSequenceSlice ) ( BioSequenceSlice , error )
2023-01-22 22:39:13 +01:00
2024-05-07 10:54:12 +02:00
// NilSeqWorker returns a BioSequenceSlice containing the input sequence and a nil error value.
// This function is typically used as a placeholder or default worker in SeqToSliceWorker when no specific worker is needed.
//
// Parameters:
//
// seq *BioSequence: A pointer to a BioSequence struct that needs processing.
//
// Returns:
//
// BioSequenceSlice, error: This function returns a slice containing the input sequence and an error value. If no error occurred during the operation, it will be nil; otherwise, it will contain details about the error.
2024-03-02 16:03:46 -04:00
func NilSeqWorker ( seq * BioSequence ) ( BioSequenceSlice , error ) {
return BioSequenceSlice { seq } , nil
2023-01-25 13:22:56 +01:00
}
2024-05-07 10:54:12 +02:00
// AnnotatorToSeqWorker is a higher-order function that takes a SeqAnnotator
// function and returns a SeqWorker function. It is used to wrap a sequence
// annotation function and convert it into a worker function that can be used
// in a pipeline or workflow for processing biological sequences.
//
// Parameters:
//
// function SeqAnnotator: A function that takes a pointer to a BioSequence
// struct and performs some annotation or processing on the sequence data.
// The SeqAnnotator type is expected to be a function with the following
// signature:
// func(seq *BioSequence)
// This function should modify the input BioSequence struct in-place by adding
// annotations, metadata, or performing any other desired operations.
//
// Returns:
//
// SeqWorker: A function that takes a pointer to a BioSequence struct and
// returns a BioSequenceSlice containing the input BioSequence, along with
// an error value. The SeqWorker type is expected to be a function with the
// following signature:
// func(seq *BioSequence) (BioSequenceSlice, error)
// The returned SeqWorker function wraps the provided SeqAnnotator function
// and applies it to the input BioSequence before returning the modified
// BioSequence in a BioSequenceSlice. The error value is always nil, as the
// function does not perform any operations that could potentially fail.
2023-01-22 22:39:13 +01:00
func AnnotatorToSeqWorker ( function SeqAnnotator ) SeqWorker {
2024-03-02 16:03:46 -04:00
f := func ( seq * BioSequence ) ( BioSequenceSlice , error ) {
2023-01-22 22:39:13 +01:00
function ( seq )
2024-03-02 16:03:46 -04:00
return BioSequenceSlice { seq } , nil
2023-01-22 22:39:13 +01:00
}
return f
}
2024-05-07 10:54:12 +02:00
// SeqToSliceWorker is a higher-order function that takes a SeqWorker and a
// boolean value indicating whether to break on error and returns a SeqSliceWorker.
// It can be used in a pipeline or workflow for processing biological sequences,
// applying the provided worker to each element of a BioSequenceSlice and returning
// a new slice.
//
// Parameters:
//
// worker SeqWorker: A function that takes a pointer to a BioSequence struct and
// performs some processing on it.
// The signature for this function is func(seq *BioSequence) (BioSequenceSlice, error).
// This function should return a modified BioSequence in a BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// breakOnError bool: A boolean flag that determines whether to stop processing further
// elements in case of an error. If set to true and an error is encountered while
// processing any element, it stops processing remaining elements and returns the processed
// slice so far along with the encountered error. If set to false, it logs the error and
// continues processing remaining elements.
//
// Returns:
//
// SeqSliceWorker: A function that takes a BioSequenceSlice (a slice of pointers to
// BioSequence structs) as input and returns a processed BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// The signature for this function is func(input BioSequenceSlice) (BioSequenceSlice, error).
// If breakOnError is set to true and any element processing results in an error, it stops
// further processing and returns the processed slice so far along with the encountered error.
// Otherwise, it processes all elements and returns them as a single BioSequenceSlice along with
// a nil error value.
2023-11-22 09:46:30 +01:00
func SeqToSliceWorker ( worker SeqWorker ,
2024-03-03 11:16:24 -04:00
breakOnError bool ) SeqSliceWorker {
2023-01-25 13:22:56 +01:00
var f SeqSliceWorker
if worker == nil {
2024-03-03 11:16:24 -04:00
f = func ( input BioSequenceSlice ) ( BioSequenceSlice , error ) {
return input , nil
2023-01-25 13:22:56 +01:00
}
} else {
2024-03-02 16:03:46 -04:00
f = func ( input BioSequenceSlice ) ( BioSequenceSlice , error ) {
2024-03-03 11:16:24 -04:00
output := MakeBioSequenceSlice ( len ( input ) )
2023-11-22 09:46:30 +01:00
i := 0
for _ , s := range input {
2024-03-02 16:03:46 -04:00
r , err := worker ( s )
if err == nil {
for _ , rs := range r {
2024-03-03 11:16:24 -04:00
if i == len ( output ) {
output = slices . Grow ( output , cap ( output ) )
output = output [ : cap ( output ) ]
}
2024-03-02 16:03:46 -04:00
output [ i ] = rs
i ++
}
} else {
if breakOnError {
err = fmt . Errorf ( "got an error on sequence %s processing : %v" ,
s . Id ( ) , err )
return BioSequenceSlice { } , err
} else {
2024-03-03 11:16:24 -04:00
log . Warnf ( "got an error on sequence %s processing : %v" ,
s . Id ( ) , err )
2024-03-02 16:03:46 -04:00
}
2023-11-22 09:46:30 +01:00
}
2023-01-25 13:22:56 +01:00
}
2023-11-22 09:46:30 +01:00
2024-03-02 16:03:46 -04:00
return output [ 0 : i ] , nil
2023-11-22 09:46:30 +01:00
}
2023-01-25 13:22:56 +01:00
}
return f
}
2025-03-11 16:35:38 +01:00
func SeqToSliceFilterOnWorker ( condition SequencePredicate ,
breakOnError bool ) SeqSliceWorker {
if condition == nil {
return func ( slice BioSequenceSlice ) ( BioSequenceSlice , error ) {
return slice , nil
}
}
f := func ( input BioSequenceSlice ) ( BioSequenceSlice , error ) {
output := MakeBioSequenceSlice ( len ( input ) )
i := 0
for _ , s := range input {
if condition ( s ) {
output [ i ] = s
i ++
}
}
return output [ 0 : i ] , nil
}
return f
}
2024-05-07 10:54:12 +02:00
// SeqToSliceConditionalWorker creates a new SeqSliceWorker that processes each sequence in a slice based on a condition. It takes a SequencePredicate and a worker function as arguments. The worker function is only applied to sequences that satisfy the condition.
// If `condition` is nil, this function just behaves like SeqToSliceWorker with the provided `worker`.
// If `breakOnError` is true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Parameters:
// - condition SequencePredicate: A predicate function that determines which sequences should be processed by the worker.
// - worker SeqWorker: The worker to be applied to the sequences that satisfy the condition.
// - breakOnError bool: If true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Returns:
//
// SeqSliceWorker: A new SeqSliceWorker that processes sequences based on a condition. This function returns a single SeqSliceWorker that can be used to process BioSequences in a workflow or pipeline.
2024-03-02 16:03:46 -04:00
func SeqToSliceConditionalWorker (
2023-01-25 13:22:56 +01:00
condition SequencePredicate ,
2024-03-03 11:16:24 -04:00
worker SeqWorker , breakOnError bool ) SeqSliceWorker {
2023-01-25 13:22:56 +01:00
if condition == nil {
2024-03-03 11:16:24 -04:00
return SeqToSliceWorker ( worker , breakOnError )
2023-01-25 13:22:56 +01:00
}
2025-03-11 16:35:38 +01:00
if worker == nil {
return nil
}
2024-03-02 16:03:46 -04:00
f := func ( input BioSequenceSlice ) ( BioSequenceSlice , error ) {
2024-03-03 11:16:24 -04:00
output := MakeBioSequenceSlice ( len ( input ) )
2023-11-22 09:46:30 +01:00
i := 0
for _ , s := range input {
2023-01-25 13:22:56 +01:00
if condition ( s ) {
2024-03-02 16:03:46 -04:00
r , err := worker ( s )
if err == nil {
for _ , rs := range r {
2024-03-03 11:16:24 -04:00
if i == len ( output ) {
output = slices . Grow ( output , cap ( output ) )
output = output [ : cap ( output ) ]
}
2024-03-02 16:03:46 -04:00
output [ i ] = rs
i ++
}
} else {
if breakOnError {
err = fmt . Errorf ( "got an error on sequence %s processing : %v" ,
s . Id ( ) , err )
return BioSequenceSlice { } , err
} else {
2024-03-03 11:16:24 -04:00
log . Warnf ( "got an error on sequence %s processing : %v" ,
s . Id ( ) , err )
2024-03-02 16:03:46 -04:00
}
2023-11-22 09:46:30 +01:00
}
2025-03-11 16:35:38 +01:00
} else {
output [ i ] = s
i ++
2023-01-22 22:39:13 +01:00
}
2023-01-25 13:22:56 +01:00
}
2023-01-22 22:39:13 +01:00
2024-03-02 16:03:46 -04:00
return output [ 0 : i ] , nil
2023-01-22 22:39:13 +01:00
}
return f
}
2024-05-07 10:54:12 +02:00
// ChainWorkers chains two workers together and returns a new SeqWorker. It takes an existing worker (`worker`) and a next worker as arguments, combines them into a pipeline and applies it to each BioSequence in the sequence slice.
// If `next` is nil, this function just returns the input worker.
// If `worker` is nil, this function just returns the next worker.
//
// Parameters:
// - worker SeqWorker: The initial worker to be chained. This worker will be executed first on each sequence.
// - next SeqWorker: The next worker in the pipeline. This worker will be applied to the output of `worker` for each sequence.
//
// Returns:
//
// SeqWorker: A new SeqWorker that chains the input workers together into a pipeline. This function returns a single SeqWorker that can be used to process BioSequences in a workflow or pipeline.
2023-01-25 13:22:56 +01:00
func ( worker SeqWorker ) ChainWorkers ( next SeqWorker ) SeqWorker {
if worker == nil {
return next
} else {
if next == nil {
return worker
}
2023-11-22 09:46:30 +01:00
}
2023-01-25 13:22:56 +01:00
2024-03-03 11:16:24 -04:00
sw := SeqToSliceWorker ( next , false )
2024-03-02 16:03:46 -04:00
f := func ( seq * BioSequence ) ( BioSequenceSlice , error ) {
2023-11-22 09:46:30 +01:00
if seq == nil {
2024-03-02 16:03:46 -04:00
return BioSequenceSlice { } , nil
}
slice , err := worker ( seq )
if err == nil {
slice , err = sw ( slice )
2023-11-22 09:46:30 +01:00
}
2024-03-02 16:03:46 -04:00
return slice , err
2023-01-25 13:22:56 +01:00
}
return f
}