Changes to be committed:

modified: cmd/obitools/obitag/main.go modified: cmd/obitools/obitag2/main.go modified: go.mod modified: go.sum modified: pkg/obiformats/ncbitaxdump/read.go modified: pkg/obioptions/version.go modified: pkg/obiseq/attributes.go modified: pkg/obiseq/taxonomy_lca.go modified: pkg/obiseq/taxonomy_methods.go modified: pkg/obiseq/taxonomy_predicate.go modified: pkg/obitax/inner.go modified: pkg/obitax/lca.go new file: pkg/obitax/taxid.go modified: pkg/obitax/taxon.go modified: pkg/obitax/taxonomy.go modified: pkg/obitax/taxonslice.go modified: pkg/obitools/obicleandb/obicleandb.go modified: pkg/obitools/obigrep/options.go modified: pkg/obitools/obilandmark/obilandmark.go modified: pkg/obitools/obilandmark/options.go modified: pkg/obitools/obirefidx/famlilyindexing.go modified: pkg/obitools/obirefidx/geomindexing.go modified: pkg/obitools/obirefidx/obirefidx.go modified: pkg/obitools/obirefidx/options.go modified: pkg/obitools/obitag/obigeomtag.go modified: pkg/obitools/obitag/obitag.go modified: pkg/obitools/obitag/options.go modified: pkg/obiutils/strings.go
2025-06-29 16:20:46 +00:00 · 2024-12-19 13:36:59 +01:00
parent f41a6fbb60
commit 795df34d1a
28 changed files with 590 additions and 280 deletions
--- a/pkg/obiutils/strings.go
+++ b/pkg/obiutils/strings.go
@ -1,12 +1,158 @@
 package obiutils

-import "unsafe"
+import (
+	"fmt"
+	"unsafe"
+)

+// AsciiSet is a lookup table over the 256 possible byte values:
+// entry c is true when byte c is a member of the set.
+type AsciiSet [256]bool
+
+// Ready-made sets for the common ASCII character classes.
+var (
+	// AsciiSpaceSet contains tab, newline, vertical tab, form feed,
+	// carriage return and space.
+	AsciiSpaceSet = AsciiSetFromString("\t\n\v\f\r ")
+
+	// AsciiDigitSet contains the decimal digits 0-9.
+	AsciiDigitSet = AsciiSetFromString("0123456789")
+
+	// AsciiUpperSet contains the uppercase letters A-Z.
+	AsciiUpperSet = AsciiSetFromString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+
+	// AsciiLowerSet contains the lowercase letters a-z.
+	AsciiLowerSet = AsciiSetFromString("abcdefghijklmnopqrstuvwxyz")
+
+	// AsciiAlphaSet contains every letter, upper or lower case.
+	AsciiAlphaSet = AsciiUpperSet.Union(AsciiLowerSet)
+
+	// AsciiAlphaNumSet contains every letter and every decimal digit.
+	AsciiAlphaNumSet = AsciiAlphaSet.Union(AsciiDigitSet)
+)
+
+// UnsafeStringFromBytes returns a string that views the bytes of data
+// without copying them.
+//
+// The result is built with unsafe.String over the slice's backing array, so
+// the string and the slice share memory: the caller must not modify data for
+// as long as the returned string is in use. No validation is performed on the
+// bytes themselves.
+//
+// Parameters:
+//   - data: A byte slice that contains the data to be converted to a string.
+//
+// Returns:
+//
+//	A string representation of the byte slice. If the input byte slice is empty,
+//	an empty string is returned.
 func UnsafeStringFromBytes(data []byte) string {
 	if len(data) > 0 {
+		// Reinterpret the slice's backing array as a string; no bytes are copied.
 		s := unsafe.String(unsafe.SliceData(data), len(data))
 		return s
 	}

-	return ""
+	return "" // Empty (or nil) input: nothing to alias, so use the literal empty string.
+}
+
+// AsciiSetFromString builds the AsciiSet whose members are the bytes of s.
+//
+// The string is scanned byte by byte rather than rune by rune. Ranging over
+// runes would yield values above 255 for any non-ASCII character (and U+FFFD
+// for invalid UTF-8), which would index past the 256-entry table and panic.
+// With a byte scan, a multi-byte character simply contributes each of its
+// encoding bytes, which matches how Contains is queried (one byte at a time).
+//
+// Parameters:
+//   - s: The characters to put in the set. Duplicates are harmless.
+//
+// Returns:
+//
+//	The set containing exactly the bytes that occur in s. An empty string
+//	yields the empty set.
+func AsciiSetFromString(s string) AsciiSet {
+	var set AsciiSet
+	for i := 0; i < len(s); i++ {
+		set[s[i]] = true
+	}
+	return set
+}
+
+// Contains reports whether byte c is a member of the set.
+func (r *AsciiSet) Contains(c byte) bool {
+	return (*r)[c]
+}
+
+// Union returns the set of bytes that belong to r, to s, or to both.
+//
+// s is received by value, so it is a private copy of the caller's set: the
+// result is assembled in that copy and neither r nor the caller's argument is
+// modified.
+func (r *AsciiSet) Union(s AsciiSet) AsciiSet {
+	for i := range s {
+		if r[i] {
+			s[i] = true
+		}
+	}
+
+	return s
+}
+
+// Intersect returns the set of bytes that belong to both r and s.
+//
+// s is received by value, so it is a private copy of the caller's set: the
+// result is assembled in that copy and neither r nor the caller's argument is
+// modified.
+func (r *AsciiSet) Intersect(s AsciiSet) AsciiSet {
+	for i := range s {
+		if !r[i] {
+			s[i] = false
+		}
+	}
+
+	return s
+}
+
+// FirstWord returns the first whitespace-delimited word of s.
+//
+// Whitespace means the bytes in AsciiSpaceSet. Leading whitespace is skipped
+// and the word runs up to, but not including, the next whitespace byte or the
+// end of the string. The scan is byte-wise, so multi-byte UTF-8 characters are
+// kept intact as part of the word.
+//
+// Parameters:
+//   - s: The input string from which the first word is to be extracted.
+//
+// Returns:
+//
+//	The first word of s, as a substring of s. If s is empty or contains only
+//	whitespace, an empty string is returned.
+func FirstWord(s string) string {
+	// Skip the leading run of whitespace.
+	begin := 0
+	for begin < len(s) && AsciiSpaceSet.Contains(s[begin]) {
+		begin++
+	}
+
+	// Extend the word until whitespace or the end of the input.
+	end := begin
+	for end < len(s) && !AsciiSpaceSet.Contains(s[end]) {
+		end++
+	}
+
+	return s[begin:end]
+}
+
+// FirstWord returns the first whitespace-delimited word of s, provided every
+// byte of that word belongs to the receiver set.
+//
+// Leading whitespace (the bytes in AsciiSpaceSet) is skipped, then bytes are
+// consumed up to the next whitespace byte or the end of s. Bytes located after
+// the end of the word are never examined.
+//
+// Parameters:
+//   - s: The input string from which the first word is to be extracted.
+//
+// Returns:
+//
+//	The first word and a nil error when all of its bytes are in the set. If a
+//	byte outside the set is met inside the word, an empty string and an error
+//	naming that byte are returned. If s is empty or contains only whitespace,
+//	an empty string is returned with no error.
+func (r *AsciiSet) FirstWord(s string) (string, error) {
+	// Skip the leading run of whitespace.
+	begin := 0
+	for begin < len(s) && AsciiSpaceSet.Contains(s[begin]) {
+		begin++
+	}
+
+	for end := begin; end < len(s); end++ {
+		switch c := s[end]; {
+		case AsciiSpaceSet.Contains(c):
+			// Whitespace closes the word.
+			return s[begin:end], nil
+		case !r.Contains(c):
+			return "", fmt.Errorf("invalid character '%c' in string: %s", c, s)
+		}
+	}
+
+	// The word runs to the end of the input.
+	return s[begin:], nil
+}
+
+// TrimLeft returns s without its leading run of bytes that belong to the
+// receiver set.
+//
+// Membership is tested against the receiver itself, so the result depends on
+// the set the method is called on: AsciiSpaceSet.TrimLeft strips leading
+// whitespace, AsciiDigitSet.TrimLeft strips leading digits, and so on.
+//
+// Parameters:
+//   - s: The string to trim.
+//
+// Returns:
+//
+//	The suffix of s starting at its first byte that is not in the set. If
+//	every byte of s is in the set, or s is empty, an empty string is returned.
+func (r *AsciiSet) TrimLeft(s string) string {
+	i := 0
+	for i < len(s) && r.Contains(s[i]) {
+		i++
+	}
+	return s[i:]
+}
+
+// SplitInTwo cuts s around the first occurrence of the byte sep.
+//
+// Parameters:
+//   - s: The string to split.
+//   - sep: The separator byte.
+//
+// Returns:
+//
+//	The part of s before the first sep and the part after it; the separator
+//	itself is dropped. If sep does not occur in s, s is returned as the first
+//	value and the second value is the empty string.
+func SplitInTwo(s string, sep byte) (string, string) {
+	for i := 0; i < len(s); i++ {
+		if s[i] == sep {
+			return s[:i], s[i+1:]
+		}
+	}
+	return s, ""
 }