// obitools4/pkg/obiutils/strings.go

package obiutils

import (
	"fmt"
	"unsafe"
)

// AsciiSet is a membership table indexed by byte value: entry c is true when
// the byte c belongs to the set. The zero value is the empty set.
type AsciiSet [256]bool

// Predefined byte classes. The composite classes are derived from the basic
// ones with Union.
var (
	// AsciiSpaceSet holds tab, line feed, vertical tab, form feed, carriage
	// return and space.
	AsciiSpaceSet = AsciiSetFromString("\t\n\v\f\r ")

	// AsciiUnderScore holds only the underscore.
	AsciiUnderScore = AsciiSetFromString("_")

	// AsciiDigitSet holds the decimal digits 0 to 9.
	AsciiDigitSet = AsciiSetFromString("0123456789")

	// AsciiUpperSet holds the upper-case letters A to Z.
	AsciiUpperSet = AsciiSetFromString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

	// AsciiLowerSet holds the lower-case letters a to z.
	AsciiLowerSet = AsciiSetFromString("abcdefghijklmnopqrstuvwxyz")

	// AsciiAlphaSet holds every upper-case and lower-case letter.
	AsciiAlphaSet = AsciiUpperSet.Union(AsciiLowerSet)

	// AsciiAlphaNumSet holds every letter and every decimal digit.
	AsciiAlphaNumSet = AsciiAlphaSet.Union(AsciiDigitSet)
)

// UnsafeStringFromBytes returns a string view of data without copying it.
//
// The result is built with unsafe.String on the slice's own storage, so the
// contents of data are not inspected or duplicated. Per the unsafe.String
// contract, data must not be modified once the string has been created.
//
// Parameters:
//   - data: the bytes to expose as a string.
//
// Returns:
//
//	The string made of the bytes of data, or "" when data is nil or empty.
func UnsafeStringFromBytes(data []byte) string {
	n := len(data)
	if n == 0 {
		// Nothing to alias: hand back the empty string.
		return ""
	}

	return unsafe.String(unsafe.SliceData(data), n)
}

// AsciiSetFromString builds the set of all bytes that occur in s.
//
// The string is scanned byte by byte, not rune by rune: AsciiSet is indexed
// and queried with byte values, and a rune above 255 would fall outside the
// 256-entry table. A multi-byte UTF-8 character therefore contributes each of
// its encoded bytes to the set.
//
// Parameters:
//   - s: the characters to put in the set; may be empty.
//
// Returns:
//
//	An AsciiSet whose entry c is true for every byte c present in s.
func AsciiSetFromString(s string) AsciiSet {
	var set AsciiSet
	for i := 0; i < len(s); i++ {
		set[s[i]] = true
	}
	return set
}

// Contains reports whether the byte c is a member of the set r.
func (r *AsciiSet) Contains(c byte) bool {
	return (*r)[c]
}

// Union returns a new set holding every byte that belongs to r, to s, or to
// both. Neither r nor the caller's copy of s is modified.
func (r *AsciiSet) Union(s AsciiSet) AsciiSet {
	var merged AsciiSet
	for i := range merged {
		merged[i] = r[i] || s[i]
	}

	return merged
}

// Intersect returns a new set holding only the bytes that belong to both r
// and s. Neither r nor the caller's copy of s is modified.
func (r *AsciiSet) Intersect(s AsciiSet) AsciiSet {
	var common AsciiSet
	for i := range common {
		common[i] = r[i] && s[i]
	}

	return common
}

// FirstWord returns the first whitespace-delimited word of s.
//
// Leading bytes that belong to AsciiSpaceSet are skipped; the word then runs
// up to, and excludes, the next byte of AsciiSpaceSet or the end of s. The
// scan is byte based, so only the ASCII whitespace bytes of AsciiSpaceSet act
// as separators.
//
// Parameters:
//   - s: the string to scan.
//
// Returns:
//
//	The first word as a substring of s, or "" when s is empty or holds
//	nothing but whitespace.
func FirstWord(s string) string {
	n := len(s)

	// Step over the leading whitespace.
	begin := 0
	for begin < n && AsciiSpaceSet.Contains(s[begin]) {
		begin++
	}

	// Extend the word until whitespace or the end of the string.
	end := begin
	for end < n && !AsciiSpaceSet.Contains(s[end]) {
		end++
	}

	return s[begin:end]
}

// FirstWord returns the first whitespace-delimited word of s, accepting only
// words made of bytes that belong to the set r.
//
// Leading bytes of AsciiSpaceSet are skipped. The word then runs up to the
// next byte of AsciiSpaceSet or the end of s, and each of its bytes is checked
// against r. Bytes located after the first word are never examined.
//
// Parameters:
//   - s: the string to scan.
//
// Returns:
//
//	The first word as a substring of s and a nil error. When the word contains
//	a byte that is not in r, the word is "" and the error names the offending
//	character and the whole input. When s is empty or holds nothing but
//	whitespace, the word is "" and the error is nil.
func (r *AsciiSet) FirstWord(s string) (string, error) {
	n := len(s)

	// Step over the leading whitespace.
	begin := 0
	for begin < n && AsciiSpaceSet.Contains(s[begin]) {
		begin++
	}

	for end := begin; end < n; end++ {
		ch := s[end]
		switch {
		case AsciiSpaceSet.Contains(ch):
			// Whitespace closes the word.
			return s[begin:end], nil
		case !r.Contains(ch):
			return "", fmt.Errorf("invalid character '%c' in string: %s", ch, s)
		}
	}

	// The word runs to the end of the string.
	return s[begin:], nil
}

// TrimLeft returns s with its leading bytes removed for as long as they
// belong to the set r.
//
// The bytes to strip are taken from the receiver, so AsciiSpaceSet.TrimLeft
// strips leading whitespace while another set strips its own members.
//
// Parameters:
//   - s: the string to trim.
//
// Returns:
//
//	The suffix of s that starts at the first byte not in r, or "" when every
//	byte of s belongs to r.
func (r *AsciiSet) TrimLeft(s string) string {
	i := 0
	for i < len(s) && r.Contains(s[i]) {
		i++
	}
	return s[i:]
}

// SplitInTwo cuts s around the first occurrence of the byte sep.
//
// Parameters:
//   - s: the string to split.
//   - sep: the separator byte.
//
// Returns:
//
//	The part of s before the first sep and the part after it; the separator
//	itself is dropped. When sep does not occur in s, the result is s and "".
func SplitInTwo(s string, sep byte) (string, string) {
	for pos := 0; pos < len(s); pos++ {
		if s[pos] == sep {
			return s[:pos], s[pos+1:]
		}
	}

	// No separator found: everything goes to the first part.
	return s, ""
}