From dd9307a4cd277011e44c16c7ea4856b593565e1a Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 30 May 2024 08:27:24 +0200 Subject: [PATCH] Switch to the system min and max functions and remove the version from obiutils Former-commit-id: 8c4558921b0d0c266b070f16e83813de6e6d4a0f --- pkg/obialign/fastlcsegf.go | 10 ++-- pkg/obiapat/pattern.go | 53 +++++++++++++------ pkg/obiformats/fastqseq_read.go | 81 ----------------------------- pkg/obiiter/fragment.go | 3 +- pkg/obikmer/counting.go | 9 ++-- pkg/obikmer/debruijn.go | 3 +- pkg/obingslibrary/match.go | 9 ++-- pkg/obisuffix/suffix_array.go | 5 +- pkg/obitools/obipcr/pcr.go | 5 +- pkg/obitools/obirefidx/obirefidx.go | 4 +- pkg/obitools/obitag/obitag.go | 6 +-- pkg/obiutils/minmax.go | 15 ------ 12 files changed, 58 insertions(+), 145 deletions(-) diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go index 152238b..c80414d 100644 --- a/pkg/obialign/fastlcsegf.go +++ b/pkg/obialign/fastlcsegf.go @@ -2,7 +2,6 @@ package obialign import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) var _iupac = [26]byte{ @@ -130,11 +129,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ // in_matrix := false x1 := y - lB + extra x2 := extra - y - xs := obiutils.Max(obiutils.Max(x1, x2), 0) + xs := max(x1, x2, 0) x1 = y + extra x2 = lA + extra - y - xf := obiutils.Min(obiutils.Min(x1, x2), even-1) + 1 + xf := min(x1, x2, even-1) + 1 for x := xs; x < xf; x++ { @@ -222,11 +221,11 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ // . 
9 10 + 2 - 1 x1 = y - lB + extra + even x2 = extra - y + even - 1 - xs = obiutils.Max(obiutils.Max(x1, x2), even) + xs = max(x1, x2, even) x1 = y + extra + even x2 = lA + extra - y + even - 1 - xf = obiutils.Min(obiutils.Min(x1, x2), width-1) + 1 + xf = min(x1, x2, width-1) + 1 for x := xs; x < xf; x++ { @@ -383,4 +382,3 @@ func FastLCSEGFScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uin func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64) (int, int) { return FastLCSEGFScoreByte(seqA.Sequence(), seqB.Sequence(), maxError, false, buffer) } - diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 7ee9555..fd5a050 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -15,7 +15,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) var _MaxPatLen = int(C.MAX_PAT_LEN) @@ -53,26 +52,24 @@ var NilApatPattern = ApatPattern{nil} // type. var NilApatSequence = ApatSequence{nil} -// MakeApatPattern creates an ApatPattern object based on the given pattern, error maximum and allowsIndel flag. +// MakeApatPattern creates an ApatPattern object based on the given pattern, error maximum, and allowsIndel flag. // -// The pattern is a short DNA sequence (up to 64 symboles). +// Parameters: +// The pattern is a short DNA sequence (up to 64 symbols). // Ambiguities can be represented or using UIPAC symboles, // or using the [...] classical in regular pattern grammar. // For example, the ambiguity A/T can be indicated using W // or [AT]. A nucleotide can be negated by preceding it with -// a '!'. The APAT algorithm allows for error during the -// matching process. The maximum number of tolerated error -// is indicated at the construction of the pattern using -// the errormax parameter. Some positions can be marked as not +// a '!'. 
The pattern is converted to uppercase. +// Some positions can be marked as not // allowed for mismatches. They have to be signaled using a '#' // sign after the corresponding nucleotide. // -// Parameters: -// pattern: The input pattern string. -// errormax: The maximum number of errors allowed. -// allowsIndel: A flag indicating whether indels are allowed or not. +// errormax is the maximum number of errors allowed in the pattern. // -// Returns an ApatPattern object and an error. +// allowsIndel is a flag indicating whether indels are allowed in the pattern. +// +// Returns an ApatPattern object and an error if the pattern is invalid. func MakeApatPattern(pattern string, errormax int, allowsIndel bool) (ApatPattern, error) { cpattern := C.CString(pattern) defer C.free(unsafe.Pointer(cpattern)) @@ -264,6 +261,7 @@ func (sequence ApatSequence) Free() { // values of the [3]int indicate respectively the start and the end position of // the match. Following the GO convention the end position is not included in the // match. The third value indicates the number of error detected for this occurrence. + func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int) (loc [][3]int) { if begin < 0 { begin = 0 @@ -348,8 +346,8 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( start = best[0] - nerr end = best[0] + int(pattern.pointer.pointer.patlen) + nerr - start = obiutils.Max(start, 0) - end = obiutils.Min(end, sequence.Len()) + start = max(start, 0) + end = min(end, sequence.Len()) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) frg := sequence.pointer.reference.Sequence()[start:end] @@ -377,6 +375,22 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( // return int(_AllocatedApaSequences) // } +// AllMatches finds all matches of a given pattern in a sequence. 
+// +// It only works if the pattern is a pure IUPAC sequence without +// supplementary characters normally allowed: ! and #. +// +// It takes the following parameters: +// - pattern: the pattern to search for (ApatPattern). +// - sequence: the sequence to search in (ApatSequence). +// - begin: the starting index of the search (int). +// - length: the length of the search (int). +// +// It returns a slice of [3]int representing the locations of all matches in the sequence. +// The AllMatches method returns a slice of [3]int. The first two +// values of the [3]int indicate respectively the start and the end position of +// the match. Following the Go convention the end position is not included in the +// match. The third value indicates the number of errors detected for this occurrence. func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) (loc [][3]int) { res := pattern.FindAllIndex(sequence, begin, length) @@ -384,12 +398,17 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) buffer := sbuffer[:] for _, m := range res { + // Recompute the start and end position of the match + // when the pattern allows for indels if m[2] > 0 && pattern.pointer.pointer.hasIndel { start := m[0] - m[2] end := m[0] + int(pattern.pointer.pointer.patlen) + m[2] - start = obiutils.Max(start, 0) - end = obiutils.Min(end, sequence.Len()) - + start = max(start, 0) + end = min(end, sequence.Len()) + // 1 << 30 = 1,073,741,824 = 1 GiB + // It's a virtual array mapping the sequence to the pattern + // in the C code. + // No allocations are done here. 
cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) frg := sequence.pointer.reference.Sequence()[start:end] diff --git a/pkg/obiformats/fastqseq_read.go b/pkg/obiformats/fastqseq_read.go index 6437ad8..67ed182 100644 --- a/pkg/obiformats/fastqseq_read.go +++ b/pkg/obiformats/fastqseq_read.go @@ -100,87 +100,6 @@ func _EndOfLastFastqEntry(buffer []byte) int { return cut } -func lastFastqCut(buffer []byte) ([]byte, []byte) { - imax := len(buffer) - cut := imax - state := 0 - restart := imax - 1 - for i := restart; i >= 0 && state < 7; i-- { - C := buffer[i] - is_end_of_line := C == '\r' || C == '\n' - is_space := C == ' ' || C == '\t' - is_sep := is_space || is_end_of_line - - switch state { - case 0: - if C == '+' { - // Potential start of quality part step 1 - state = 1 - restart = i - } - case 1: - if is_end_of_line { - // Potential start of quality part step 2 - state = 2 - } else { - // it was not the start of quality part - state = 0 - i = restart - } - case 2: - if is_sep { - // Potential start of quality part step 2 (stay in the same state) - state = 2 - } else if (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || C == '-' || C == '.' || C == '[' || C == ']' { - // End of the sequence - state = 3 - } else { - // it was not the start of quality part - state = 0 - i = restart - } - case 3: - if is_end_of_line { - // Entrering in the header line - state = 4 - } else if (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || C == '-' || C == '.' 
|| C == '[' || C == ']' { - // progressing along of the sequence - state = 3 - } else { - // it was not the sequence part - state = 0 - i = restart - } - case 4: - if is_end_of_line { - state = 4 - } else { - state = 5 - } - case 5: - if is_end_of_line { - // It was not the header line - state = 0 - i = restart - } else if C == '@' { - state = 6 - cut = i - } - case 6: - if is_end_of_line { - state = 7 - } else { - state = 0 - i = restart - } - } - } - if state == 7 { - return buffer[:cut], bytes.Clone(buffer[cut:]) - } - return []byte{}, buffer -} - func _ParseFastqFile(source string, input ChannelSeqFileChunk, out obiiter.IBioSequence, diff --git a/pkg/obiiter/fragment.go b/pkg/obiiter/fragment.go index 7abdd6b..bbddd23 100644 --- a/pkg/obiiter/fragment.go +++ b/pkg/obiiter/fragment.go @@ -4,7 +4,6 @@ import ( log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) func IFragments(minsize, length, overlap, size, nworkers int) Pipeable { @@ -30,7 +29,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable { news = append(news, s) } else { for i := 0; i < s.Len(); i += step { - end := obiutils.Min(i+length, s.Len()) + end := min(i+length, s.Len()) fusion := false if (s.Len() - end) < step { end = s.Len() diff --git a/pkg/obikmer/counting.go b/pkg/obikmer/counting.go index 5ce8d31..1c27967 100644 --- a/pkg/obikmer/counting.go +++ b/pkg/obikmer/counting.go @@ -4,13 +4,10 @@ import ( "math" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) type Table4mer [256]uint16 - - func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Table4mer { iternal_buffer := Encode4mer(seq, buffer) // The slice of 4-mer codes @@ -33,7 +30,7 @@ func Count4Mer(seq *obiseq.BioSequence, buffer *[]byte, counts *Table4mer) *Tabl func Common4Mer(count1, count2 
*Table4mer) int { sum := 0 for i := 0; i < 256; i++ { - sum += int(obiutils.Min((*count1)[i], (*count2)[i])) + sum += int(min((*count1)[i], (*count2)[i])) } return sum } @@ -49,7 +46,7 @@ func Sum4Mer(count *Table4mer) int { func LCS4MerBounds(count1, count2 *Table4mer) (int, int) { s1 := Sum4Mer(count1) s2 := Sum4Mer(count2) - smin := obiutils.Min(s1, s2) + smin := min(s1, s2) cw := Common4Mer(count1, count2) @@ -66,7 +63,7 @@ func LCS4MerBounds(count1, count2 *Table4mer) (int, int) { func Error4MerBounds(count1, count2 *Table4mer) (int, int) { s1 := Sum4Mer(count1) s2 := Sum4Mer(count2) - smax := obiutils.Max(s1, s2) + smax := max(s1, s2) cw := Common4Mer(count1, count2) diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 9ef8c6b..3bafddd 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -10,7 +10,6 @@ import ( "slices" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) @@ -472,7 +471,7 @@ func (graph *DeBruijnGraph) Gml() string { n := graph.Nexts(idx) for _, dst := range n { dstid := nodeidx[dst] - weight := obiutils.Min(graph.Weight(dst), weight) + weight := min(graph.Weight(dst), weight) label := decode[dst&3] buffer.WriteString( fmt.Sprintf(`edge [ source "%d" diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index 5621aed..68bd10e 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -9,7 +9,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) type DemultiplexMatch struct { @@ -130,7 +129,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { sseq := sequence.String() direct := sseq[start:end] - tagstart := obiutils.Max(start-marker.taglength, 0) + tagstart := 
max(start-marker.taglength, 0) ftag := strings.ToLower(sseq[tagstart:start]) m := DemultiplexMatch{ @@ -150,7 +149,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { reverse, _ := sequence.Subsequence(start, end, false) defer reverse.Recycle() reverse = reverse.ReverseComplement(true) - endtag := obiutils.Min(end+marker.taglength, sequence.Len()) + endtag := min(end+marker.taglength, sequence.Len()) rtag, err := sequence.Subsequence(end, endtag, false) defer rtag.Recycle() srtag := "" @@ -201,7 +200,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { sseq := sequence.String() reverse := strings.ToLower(sseq[start:end]) - tagstart := obiutils.Max(start-marker.taglength, 0) + tagstart := max(start-marker.taglength, 0) rtag := strings.ToLower(sseq[tagstart:start]) m := DemultiplexMatch{ @@ -221,7 +220,7 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { defer direct.Recycle() direct = direct.ReverseComplement(true) - endtag := obiutils.Min(end+marker.taglength, sequence.Len()) + endtag := min(end+marker.taglength, sequence.Len()) ftag, err := sequence.Subsequence(end, endtag, false) defer ftag.Recycle() sftag := "" diff --git a/pkg/obisuffix/suffix_array.go b/pkg/obisuffix/suffix_array.go index cb28931..87927f7 100644 --- a/pkg/obisuffix/suffix_array.go +++ b/pkg/obisuffix/suffix_array.go @@ -6,7 +6,6 @@ import ( "sort" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) type Suffix struct { @@ -27,7 +26,7 @@ func SuffixLess(suffixarray SuffixArray) func(i, j int) bool { sj := suffixarray.Suffixes[j] bj := (*suffixarray.Sequences)[int(sj.Idx)].Sequence()[sj.Pos:] - l := obiutils.Min(len(bi), len(bj)) + l := min(len(bi), len(bj)) p := 0 for p < l && bi[p] == bj[p] { p++ @@ -92,7 +91,7 @@ func (suffixarray *SuffixArray) CommonSuffix() []int { si := suffixarray.Suffixes[i] bi := 
(*suffixarray.Sequences)[int(si.Idx)].Sequence()[si.Pos:] - l := obiutils.Min(len(bi), len(bp)) + l := min(len(bi), len(bp)) p := 0 for p < l && bi[p] == bp[p] { p++ diff --git a/pkg/obitools/obipcr/pcr.go b/pkg/obitools/obipcr/pcr.go index 3fbf18d..f713b76 100644 --- a/pkg/obitools/obipcr/pcr.go +++ b/pkg/obitools/obipcr/pcr.go @@ -4,7 +4,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" log "github.com/sirupsen/logrus" ) @@ -47,8 +46,8 @@ func CLIPCR(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) { frags := obiiter.IFragments( CLIMaxLength()*1000, CLIMaxLength()*100, - CLIMaxLength()+obiutils.Max(len(CLIForwardPrimer()), - len(CLIReversePrimer()))+obiutils.Min(len(CLIForwardPrimer()), + CLIMaxLength()+max(len(CLIForwardPrimer()), + len(CLIReversePrimer()))+min(len(CLIForwardPrimer()), len(CLIReversePrimer()))/2, 100, obioptions.CLIParallelWorkers(), diff --git a/pkg/obitools/obirefidx/obirefidx.go b/pkg/obitools/obirefidx/obirefidx.go index 6509028..74fcb90 100644 --- a/pkg/obitools/obirefidx/obirefidx.go +++ b/pkg/obitools/obirefidx/obirefidx.go @@ -63,7 +63,7 @@ func IndexSequence(seqidx int, if lca[order] == ancestor { // nseq[i]++ if mini != -1 { - wordmin = obiutils.Max(sequence.Len(), references[order].Len()) - 3 - 4*mini + wordmin = max(sequence.Len(), references[order].Len()) - 3 - 4*mini } if cw[order] < wordmin { @@ -189,7 +189,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence { indexed := obiiter.MakeIBioSequence() go func() { for i := 0; i < len(references); i += 10 { - limits <- [2]int{i, obiutils.Min(i+10, len(references))} + limits <- [2]int{i, min(i+10, len(references))} } close(limits) }() diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 
b4b8e84..5dd4db2 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -110,7 +110,7 @@ func FindClosests(sequence *obiseq.BioSequence, d, _, _, _ := obialign.D1Or0(sequence, references[order]) if d >= 0 { score = d - alilength = obiutils.Max(sequence.Len(), ref.Len()) + alilength = max(sequence.Len(), ref.Len()) lcs = alilength - score } } else { @@ -294,8 +294,8 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence, j++ } else { log.Warnf("Taxid %d is not described in the taxonomy."+ - " Sequence %s is discared from the reference database", - seq.Taxid(), seq.Id()) + " Sequence %s is discared from the reference database", + seq.Taxid(), seq.Id()) } } diff --git a/pkg/obiutils/minmax.go b/pkg/obiutils/minmax.go index fdcc10a..1c48440 100644 --- a/pkg/obiutils/minmax.go +++ b/pkg/obiutils/minmax.go @@ -4,20 +4,6 @@ import ( "golang.org/x/exp/constraints" ) -func Min[T constraints.Ordered](x, y T) T { - if x < y { - return x - } - return y -} - -func Max[T constraints.Ordered](x, y T) T { - if x < y { - return y - } - return x -} - func MinMax[T constraints.Ordered](x, y T) (T, T) { if x < y { return x, y @@ -25,7 +11,6 @@ func MinMax[T constraints.Ordered](x, y T) (T, T) { return y, x } - func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) { if len(vec) == 0 { panic("empty slice")