mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
160 lines
4.1 KiB
Go
160 lines
4.1 KiB
Go
package obiutils
|
|
|
|
import (
|
|
"fmt"
|
|
"unsafe"
|
|
)
|
|
|
|
// AsciiSet is a membership table with one boolean entry for each of the 256
// possible byte values: entry c is true when byte c belongs to the set.
type AsciiSet [256]bool
|
|
|
|
var AsciiSpaceSet = AsciiSetFromString("\t\n\v\f\r ")
|
|
var AsciiUnderScore = AsciiSetFromString("_")
|
|
var AsciiDigitSet = AsciiSetFromString("0123456789")
|
|
var AsciiUpperSet = AsciiSetFromString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|
var AsciiLowerSet = AsciiSetFromString("abcdefghijklmnopqrstuvwxyz")
|
|
var AsciiAlphaSet = AsciiUpperSet.Union(AsciiLowerSet)
|
|
var AsciiAlphaNumSet = AsciiAlphaSet.Union(AsciiDigitSet)
|
|
|
|
// UnsafeStringFromBytes exposes the content of a byte slice as a string
// without copying it: the returned string shares its memory with data.
//
// Because no copy is made, data must not be modified for as long as the
// returned string is in use; Go strings are expected to be immutable.
//
// Parameters:
//   - data: the bytes to view as a string.
//
// Returns:
//
//	A string over the bytes of data, or "" when data is empty or nil.
func UnsafeStringFromBytes(data []byte) string {
	// Nothing to share for an empty slice.
	if len(data) == 0 {
		return ""
	}

	// Reinterpret the slice's backing array as string storage.
	return unsafe.String(unsafe.SliceData(data), len(data))
}
|
|
|
|
func AsciiSetFromString(s string) AsciiSet {
|
|
r := [256]bool{}
|
|
for _, c := range s {
|
|
r[c] = true
|
|
}
|
|
return r
|
|
}
|
|
|
|
func (r *AsciiSet) Contains(c byte) bool {
|
|
return r[c]
|
|
}
|
|
|
|
func (r *AsciiSet) Union(s AsciiSet) AsciiSet {
|
|
for i := 0; i < 256; i++ {
|
|
s[i] = r[i] || s[i]
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
func (r *AsciiSet) Intersect(s AsciiSet) AsciiSet {
|
|
for i := 0; i < 256; i++ {
|
|
s[i] = r[i] && s[i]
|
|
}
|
|
|
|
return s
|
|
}
|
|
|
|
// FirstWord extracts the first word from a given string.
|
|
// A word is defined as a sequence of non-space characters.
|
|
// It ignores leading whitespace and stops at the first whitespace character encountered.
|
|
//
|
|
// Parameters:
|
|
// - s: The input string from which the first word is to be extracted.
|
|
//
|
|
// Returns:
|
|
//
|
|
// A string containing the first word found in the input string. If the input string
|
|
// is empty or contains only whitespace, an empty string is returned.
|
|
func FirstWord(s string) string {
|
|
// Fast path for ASCII: look for the first ASCII non-space byte
|
|
start := 0
|
|
for ; start < len(s); start++ {
|
|
c := s[start]
|
|
if !AsciiSpaceSet.Contains(c) {
|
|
break
|
|
}
|
|
}
|
|
|
|
stop := start
|
|
for ; stop < len(s); stop++ {
|
|
c := s[stop]
|
|
if AsciiSpaceSet.Contains(c) {
|
|
break
|
|
}
|
|
}
|
|
return s[start:stop]
|
|
}
|
|
|
|
// FirstRestrictedWord extracts the first word from a given string while enforcing character restrictions.
|
|
// A word is defined as a sequence of non-space characters. The function checks each character
|
|
// against the provided restriction array, which indicates whether a character is allowed.
|
|
//
|
|
// Parameters:
|
|
// - s: The input string from which the first restricted word is to be extracted.
|
|
// - restriction: A boolean array of size 256 where each index represents a character's ASCII value.
|
|
// If restriction[c] is false, the character c is not allowed in the word.
|
|
//
|
|
// Returns:
|
|
//
|
|
// A string containing the first word found in the input string that does not contain any restricted characters.
|
|
// If a restricted character is found, an error is returned indicating the invalid character.
|
|
// If the input string is empty or contains only whitespace, an empty string is returned with no error.
|
|
func (restriction *AsciiSet) FirstWord(s string) (string, error) {
|
|
// Fast path for ASCII: look for the first ASCII non-space byte
|
|
start := 0
|
|
for ; start < len(s); start++ {
|
|
c := s[start]
|
|
if !AsciiSpaceSet.Contains(c) {
|
|
break
|
|
}
|
|
}
|
|
|
|
stop := start
|
|
for ; stop < len(s); stop++ {
|
|
c := s[stop]
|
|
if AsciiSpaceSet.Contains(c) {
|
|
break
|
|
}
|
|
if !restriction.Contains(c) {
|
|
return "", fmt.Errorf("invalid character '%c' in string: %s", c, s)
|
|
}
|
|
}
|
|
|
|
return s[start:stop], nil
|
|
}
|
|
|
|
func (r *AsciiSet) TrimLeft(s string) string {
|
|
i := 0
|
|
for ; i < len(s); i++ {
|
|
c := s[i]
|
|
if !AsciiSpaceSet.Contains(c) {
|
|
break
|
|
}
|
|
}
|
|
return s[i:]
|
|
}
|
|
|
|
// SplitInTwo cuts s at the first occurrence of the byte sep.
//
// Parameters:
//   - s: the string to split.
//   - sep: the separator byte to look for.
//
// Returns:
//
//	The part of s before the first sep and the part after it; the separator
//	itself is dropped. When sep does not occur in s, s and "" are returned.
func SplitInTwo(s string, sep byte) (string, string) {
	for pos := 0; pos < len(s); pos++ {
		if s[pos] == sep {
			return s[:pos], s[pos+1:]
		}
	}

	// Separator not found: everything goes to the left part.
	return s, ""
}
|