Fix a bug in obiformat that led to partial parsing, and add godoc comments

Former-commit-id: b27105355f1a330eedf6eaa72c8ac94f06806c28
This commit is contained in:
Eric Coissac
2024-05-07 10:54:12 +02:00
parent 5b98393a68
commit 9e63013bc2
6 changed files with 231 additions and 42 deletions

View File

@ -11,14 +11,24 @@ import (
// StatsOnValues associates string keys with integer counts.
// It is the type returned by StatsOn and combined by Merge.
type StatsOnValues map[string]int
// StatsOnSlotName builds the name of the annotation slot that holds the
// occurrence statistics of an attribute.
//
// The slot name is the attribute key prefixed with "merged_"; for example,
// the key "sample" yields "merged_sample". An empty key yields "merged_".
//
// Parameters:
//   - key: the attribute key
//
// Returns the slot name as a string.
func StatsOnSlotName(key string) string {
	const slotPrefix = "merged_"
	return slotPrefix + key
}
/*
Tests if the sequence already has a slot summarizing statistics
of occurrence for a given attribute.
*/
// HasStatsOn tests if the sequence already has a slot summarizing statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string)
//
// Return type:
// - bool
func (sequence *BioSequence) HasStatsOn(key string) bool {
if !sequence.HasAnnotation() {
return false
@ -31,7 +41,14 @@ func (sequence *BioSequence) HasStatsOn(key string) bool {
return ok
}
// A function that takes a BioSequence and a key and returns a StatsOnValues.
// StatsOn returns the slot summarizing statistics of occurrence for a given attribute.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - na: the value to be used if the attribute is not present
//
// Return type:
// - StatsOnValues
func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
mkey := StatsOnSlotName(key)
annotations := sequence.Annotations()
@ -77,7 +94,14 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
return stats
}
// Adding the count of the sequence to the count of the key in the stats.
// StatsPlusOne adds the count of the sequence toAdd to the count of the key in the stats.
//
// Parameters:
// - key: the attribute key (string) to be summarized
// - toAdd: the BioSequence to add to the stats
// - na: the value to be used if the attribute is not present
//
// Return type:
// - bool
func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na string) bool {
sval := na
annotations := sequence.Annotations()
@ -109,10 +133,14 @@ func (sequence *BioSequence) StatsPlusOne(key string, toAdd *BioSequence, na str
old = 0
}
stats[sval] = old + toAdd.Count()
annotations[StatsOnSlotName(key)] = stats
annotations[StatsOnSlotName(key)] = stats // TODO: check if this is necessary
return retval
}
// Merge merges the given StatsOnValues with the current StatsOnValues.
//
// It takes a parameter `toMerged` of type StatsOnValues, which represents the StatsOnValues to be merged.
// It returns a value of type StatsOnValues, which represents the merged StatsOnValues.
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
for k, val := range toMerged {
old, ok := stats[k]
@ -125,7 +153,16 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
return stats
}
// Merging two sequences.
// Merge merges two sequences into a single sequence.
//
// Parameters:
// - tomerge: the sequence to be merged (BioSequence)
// - na: the value to be used if the attribute is not present (string)
// - inplace: a boolean indicating whether to merge in place or not (bool)
// - statsOn: a variadic string parameter representing the attributes to be summarized (string)
//
// Return type:
// - *BioSequence: the merged sequence (BioSequence)
func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool, statsOn ...string) *BioSequence {
if !inplace {
sequence = sequence.Copy()
@ -184,17 +221,15 @@ func (sequence *BioSequence) Merge(tomerge *BioSequence, na string, inplace bool
return sequence
}
/*
*
Merges a set of sequences into a single sequence.
The function assumes that every sequence in the batch is
identical in terms of sequence. In fact, the function only
aggregates the annotations of the different sequences to be merged.
Quality information is lost during the merge procedure.
*/
// Merge merges the given sequences into a single sequence.
//
// Parameters:
// - sequences: a slice of BioSequence objects to be merged (BioSequenceSlice)
// - na: the value to be used if the attribute is not present (string)
// - statsOn: a slice of strings representing the attributes to be summarized ([]string)
//
// Return type:
// - *BioSequence: the merged sequence (BioSequence)
func (sequences BioSequenceSlice) Merge(na string, statsOn []string) *BioSequence {
seq := sequences[0]
//sequences[0] = nil

View File

@ -12,10 +12,46 @@ type SeqAnnotator func(*BioSequence)
// SeqWorker is a function that takes a pointer to a BioSequence and returns
// a BioSequenceSlice together with an error.
type SeqWorker func(*BioSequence) (BioSequenceSlice, error)
// SeqSliceWorker is a function that takes a BioSequenceSlice and returns
// a BioSequenceSlice together with an error.
type SeqSliceWorker func(BioSequenceSlice) (BioSequenceSlice, error)
// NilSeqWorker is the identity worker: it wraps the sequence it receives in a
// one-element BioSequenceSlice and returns it unchanged.
//
// It has the SeqWorker signature, so it can serve as a placeholder or default
// worker wherever a SeqWorker is required but no processing is needed.
//
// Parameters:
//   - seq: the sequence to pass through; it is neither copied nor modified.
//
// Returns:
//   - a BioSequenceSlice whose single element is seq
//   - an error that is always nil
func NilSeqWorker(seq *BioSequence) (BioSequenceSlice, error) {
	passthrough := BioSequenceSlice{seq}
	return passthrough, nil
}
// AnnotatorToSeqWorker is a higher-order function that takes a SeqAnnotator
// function and returns a SeqWorker function. It is used to wrap a sequence
// annotation function and convert it into a worker function that can be used
// in a pipeline or workflow for processing biological sequences.
//
// Parameters:
//
// function SeqAnnotator: A function that takes a pointer to a BioSequence
// struct and performs some annotation or processing on the sequence data.
// The SeqAnnotator type is expected to be a function with the following
// signature:
// func(seq *BioSequence)
// This function should modify the input BioSequence struct in-place by adding
// annotations, metadata, or performing any other desired operations.
//
// Returns:
//
// SeqWorker: A function that takes a pointer to a BioSequence struct and
// returns a BioSequenceSlice containing the input BioSequence, along with
// an error value. The SeqWorker type is expected to be a function with the
// following signature:
// func(seq *BioSequence) (BioSequenceSlice, error)
// The returned SeqWorker function wraps the provided SeqAnnotator function
// and applies it to the input BioSequence before returning the modified
// BioSequence in a BioSequenceSlice. The error value is always nil, as the
// function does not perform any operations that could potentially fail.
func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
f := func(seq *BioSequence) (BioSequenceSlice, error) {
function(seq)
@ -24,6 +60,35 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
return f
}
// SeqToSliceWorker is a higher-order function that takes a SeqWorker and a
// boolean value indicating whether to break on error and returns a SeqSliceWorker.
// It can be used in a pipeline or workflow for processing biological sequences,
// applying the provided worker to each element of a BioSequenceSlice and returning
// a new slice.
//
// Parameters:
//
// worker SeqWorker: A function that takes a pointer to a BioSequence struct and
// performs some processing on it.
// The signature for this function is func(seq *BioSequence) (BioSequenceSlice, error).
// This function should return a modified BioSequence in a BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// breakOnError bool: A boolean flag that determines whether to stop processing further
// elements in case of an error. If set to true and an error is encountered while
// processing any element, it stops processing remaining elements and returns the processed
// slice so far along with the encountered error. If set to false, it logs the error and
// continues processing remaining elements.
//
// Returns:
//
// SeqSliceWorker: A function that takes a BioSequenceSlice (a slice of pointers to
// BioSequence structs) as input and returns a processed BioSequenceSlice along with
// an error value indicating whether the operation was successful or not.
// The signature for this function is func(input BioSequenceSlice) (BioSequenceSlice, error).
// If breakOnError is set to true and any element processing results in an error, it stops
// further processing and returns the processed slice so far along with the encountered error.
// Otherwise, it processes all elements and returns them as a single BioSequenceSlice along with
// a nil error value.
func SeqToSliceWorker(worker SeqWorker,
breakOnError bool) SeqSliceWorker {
var f SeqSliceWorker
@ -68,6 +133,18 @@ func SeqToSliceWorker(worker SeqWorker,
return f
}
// SeqToSliceConditionalWorker creates a new SeqSliceWorker that processes each sequence in a slice based on a condition. It takes a SequencePredicate and a worker function as arguments. The worker function is only applied to sequences that satisfy the condition.
// If `condition` is nil, this function just behaves like SeqToSliceWorker with the provided `worker`.
// If `breakOnError` is true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Parameters:
// - condition SequencePredicate: A predicate function that determines which sequences should be processed by the worker.
// - worker SeqWorker: The worker to be applied to the sequences that satisfy the condition.
// - breakOnError bool: If true, the pipeline will stop and return an error if any sequence processing fails. Otherwise, it will log a warning message for each failed sequence.
//
// Returns:
//
// SeqSliceWorker: A new SeqSliceWorker that processes sequences based on a condition. This function returns a single SeqSliceWorker that can be used to process BioSequences in a workflow or pipeline.
func SeqToSliceConditionalWorker(
condition SequencePredicate,
worker SeqWorker, breakOnError bool) SeqSliceWorker {
@ -112,6 +189,17 @@ func SeqToSliceConditionalWorker(
return f
}
// ChainWorkers chains two workers together and returns a new SeqWorker. It takes an existing worker (`worker`) and a next worker as arguments, combines them into a pipeline and applies it to each BioSequence in the sequence slice.
// If `next` is nil, this function just returns the input worker.
// If `worker` is nil, this function just returns the next worker.
//
// Parameters:
// - worker SeqWorker: The initial worker to be chained. This worker will be executed first on each sequence.
// - next SeqWorker: The next worker in the pipeline. This worker will be applied to the output of `worker` for each sequence.
//
// Returns:
//
// SeqWorker: A new SeqWorker that chains the input workers together into a pipeline. This function returns a single SeqWorker that can be used to process BioSequences in a workflow or pipeline.
func (worker SeqWorker) ChainWorkers(next SeqWorker) SeqWorker {
if worker == nil {
return next