mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Changes to be committed:
modified: cmd/obitools/obitag/main.go modified: cmd/obitools/obitag2/main.go modified: go.mod modified: go.sum modified: pkg/obiformats/ncbitaxdump/read.go modified: pkg/obioptions/version.go modified: pkg/obiseq/attributes.go modified: pkg/obiseq/taxonomy_lca.go modified: pkg/obiseq/taxonomy_methods.go modified: pkg/obiseq/taxonomy_predicate.go modified: pkg/obitax/inner.go modified: pkg/obitax/lca.go new file: pkg/obitax/taxid.go modified: pkg/obitax/taxon.go modified: pkg/obitax/taxonomy.go modified: pkg/obitax/taxonslice.go modified: pkg/obitools/obicleandb/obicleandb.go modified: pkg/obitools/obigrep/options.go modified: pkg/obitools/obilandmark/obilandmark.go modified: pkg/obitools/obilandmark/options.go modified: pkg/obitools/obirefidx/famlilyindexing.go modified: pkg/obitools/obirefidx/geomindexing.go modified: pkg/obitools/obirefidx/obirefidx.go modified: pkg/obitools/obirefidx/options.go modified: pkg/obitools/obitag/obigeomtag.go modified: pkg/obitools/obitag/obitag.go modified: pkg/obitools/obitag/options.go modified: pkg/obiutils/strings.go
This commit is contained in:
@ -1,12 +1,158 @@
|
||||
package obiutils
|
||||
|
||||
import "unsafe"
|
||||
import (
|
||||
"fmt"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// AsciiSet is a membership table with one boolean entry per possible byte
// value: the entry at index c is true when the byte c belongs to the set.
type AsciiSet [256]bool
|
||||
|
||||
var AsciiSpaceSet = AsciiSetFromString("\t\n\v\f\r ")
|
||||
var AsciiDigitSet = AsciiSetFromString("0123456789")
|
||||
var AsciiUpperSet = AsciiSetFromString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
||||
var AsciiLowerSet = AsciiSetFromString("abcdefghijklmnopqrstuvwxyz")
|
||||
var AsciiAlphaSet = AsciiUpperSet.Union(AsciiLowerSet)
|
||||
var AsciiAlphaNumSet = AsciiAlphaSet.Union(AsciiDigitSet)
|
||||
|
||||
// UnsafeStringFromBytes returns a string that shares its memory with data
// instead of holding a copy of it.
//
// The conversion relies on unsafe.String, so the caller must not modify data
// for as long as the returned string is in use: strings are expected to be
// immutable and a later write to data would show through the string.
//
// Parameters:
//   - data: the bytes to expose as a string.
//
// Returns:
//
//	A string viewing the bytes of data, or "" when data is nil or empty.
func UnsafeStringFromBytes(data []byte) string {
	if len(data) == 0 {
		return ""
	}

	return unsafe.String(unsafe.SliceData(data), len(data))
}
|
||||
|
||||
func AsciiSetFromString(s string) AsciiSet {
|
||||
r := [256]bool{}
|
||||
for _, c := range s {
|
||||
r[c] = true
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func (r *AsciiSet) Contains(c byte) bool {
|
||||
return r[c]
|
||||
}
|
||||
|
||||
func (r *AsciiSet) Union(s AsciiSet) AsciiSet {
|
||||
for i := 0; i < 256; i++ {
|
||||
s[i] = r[i] || s[i]
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
func (r *AsciiSet) Intersect(s AsciiSet) AsciiSet {
|
||||
for i := 0; i < 256; i++ {
|
||||
s[i] = r[i] && s[i]
|
||||
}
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// FirstWord extracts the first word from a given string.
|
||||
// A word is defined as a sequence of non-space characters.
|
||||
// It ignores leading whitespace and stops at the first whitespace character encountered.
|
||||
//
|
||||
// Parameters:
|
||||
// - s: The input string from which the first word is to be extracted.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// A string containing the first word found in the input string. If the input string
|
||||
// is empty or contains only whitespace, an empty string is returned.
|
||||
func FirstWord(s string) string {
|
||||
// Fast path for ASCII: look for the first ASCII non-space byte
|
||||
start := 0
|
||||
for ; start < len(s); start++ {
|
||||
c := s[start]
|
||||
if !AsciiSpaceSet.Contains(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
stop := start
|
||||
for ; stop < len(s); stop++ {
|
||||
c := s[stop]
|
||||
if AsciiSpaceSet.Contains(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return s[start:stop]
|
||||
}
|
||||
|
||||
// FirstRestrictedWord extracts the first word from a given string while enforcing character restrictions.
|
||||
// A word is defined as a sequence of non-space characters. The function checks each character
|
||||
// against the provided restriction array, which indicates whether a character is allowed.
|
||||
//
|
||||
// Parameters:
|
||||
// - s: The input string from which the first restricted word is to be extracted.
|
||||
// - restriction: A boolean array of size 256 where each index represents a character's ASCII value.
|
||||
// If restriction[c] is false, the character c is not allowed in the word.
|
||||
//
|
||||
// Returns:
|
||||
//
|
||||
// A string containing the first word found in the input string that does not contain any restricted characters.
|
||||
// If a restricted character is found, an error is returned indicating the invalid character.
|
||||
// If the input string is empty or contains only whitespace, an empty string is returned with no error.
|
||||
func (restriction *AsciiSet) FirstWord(s string) (string, error) {
|
||||
// Fast path for ASCII: look for the first ASCII non-space byte
|
||||
start := 0
|
||||
for ; start < len(s); start++ {
|
||||
c := s[start]
|
||||
if !AsciiSpaceSet.Contains(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
stop := start
|
||||
for ; stop < len(s); stop++ {
|
||||
c := s[stop]
|
||||
if AsciiSpaceSet.Contains(c) {
|
||||
break
|
||||
}
|
||||
if !restriction.Contains(c) {
|
||||
return "", fmt.Errorf("invalid character '%c' in string: %s", c, s)
|
||||
}
|
||||
}
|
||||
|
||||
return s[start:stop], nil
|
||||
}
|
||||
|
||||
func (r *AsciiSet) TrimLeft(s string) string {
|
||||
i := 0
|
||||
for ; i < len(s); i++ {
|
||||
c := s[i]
|
||||
if !AsciiSpaceSet.Contains(c) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return s[i:]
|
||||
}
|
||||
|
||||
// SplitInTwo cuts s around the first occurrence of the byte sep.
//
// Parameters:
//   - s: the string to split.
//   - sep: the separator byte to look for.
//
// Returns:
//
//	The part of s located before the first sep and the part located after it;
//	sep itself belongs to neither. When s does not contain sep, s is returned
//	unchanged as the first value and the second value is empty.
func SplitInTwo(s string, sep byte) (string, string) {
	for pos := 0; pos < len(s); pos++ {
		if s[pos] == sep {
			return s[:pos], s[pos+1:]
		}
	}

	// No separator found: everything belongs to the first part.
	return s, ""
}
|
||||
|
Reference in New Issue
Block a user