mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-06-23 02:28:07 +03:00
The optimization includes the following improvements: - Implementation of a function that processes 8 bytes per loop iteration to locate ASCII characters using bitwise manipulations. - Implementation of the ToLowercaseFunc function that prevents string copying if the string is already in lowercase. - Use of a lookup table for converting ASCII characters to lowercase, with logic copied from the VictoriaLogs repository.
182 lines
4.4 KiB
Go
182 lines
4.4 KiB
Go
package stringsutil
|
|
|
|
import (
|
|
"slices"
|
|
"sync"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
"unsafe"
|
|
)
|
|
|
|
// LimitStringLen limits the length of s with maxLen.
|
|
//
|
|
// If len(s) > maxLen, then s is replaced with "s_prefix..s_suffix",
|
|
// so the total length of the returned string doesn't exceed maxLen.
|
|
func LimitStringLen(s string, maxLen int) string {
|
|
if maxLen < 4 {
|
|
maxLen = 4
|
|
}
|
|
if len(s) <= maxLen {
|
|
return s
|
|
}
|
|
n := (maxLen / 2) - 1
|
|
return s[:n] + ".." + s[len(s)-n:]
|
|
}
|
|
|
|
// AppendLowercase appends lowercase s to dst and returns the result.
|
|
// It is recommended to use ToLowercaseFunc if possible to avoid copying of s.
|
|
func AppendLowercase(dst []byte, s string) []byte {
|
|
// Try to find the first uppercase character.
|
|
n := uppercaseIndex(s)
|
|
if n < 0 {
|
|
// Fast path: no uppercase characters found.
|
|
dst = append(dst, s...)
|
|
return dst
|
|
}
|
|
|
|
// Slow path: convert s to lowercase.
|
|
dst = slices.Grow(dst, len(s))
|
|
dst = append(dst, s[:n]...)
|
|
s = s[n:]
|
|
return appendLowercaseInternal(dst, s)
|
|
}
|
|
|
|
func appendLowercaseInternal(dst []byte, s string) []byte {
|
|
dstLen := len(dst)
|
|
|
|
// Try fast path at first by assuming that s contains only ASCII chars.
|
|
hasUnicodeChars := false
|
|
for i := range len(s) {
|
|
c := s[i]
|
|
if c >= utf8.RuneSelf {
|
|
hasUnicodeChars = true
|
|
break
|
|
}
|
|
if c >= 'A' && c <= 'Z' {
|
|
c += 'a' - 'A'
|
|
}
|
|
dst = append(dst, c)
|
|
}
|
|
if hasUnicodeChars {
|
|
// Slow path - s contains non-ASCII chars. Use Unicode encoding.
|
|
dst = dst[:dstLen]
|
|
for _, r := range s {
|
|
r = unicode.ToLower(r)
|
|
dst = utf8.AppendRune(dst, r)
|
|
}
|
|
}
|
|
return dst
|
|
}
|
|
|
|
// ToLowercaseFunc calls f with a lowercase version of s.
|
|
// The resulting value is only valid during the f call.
|
|
func ToLowercaseFunc(s string, f func(s string)) {
|
|
// Try to find the first uppercase character.
|
|
n := uppercaseIndex(s)
|
|
if n < 0 {
|
|
// Fast path: no uppercase characters found.
|
|
f(s)
|
|
return
|
|
}
|
|
|
|
sb := getStringBuilder()
|
|
defer putStringBuilder(sb)
|
|
|
|
sb.buf = slices.Grow(sb.buf, len(s))
|
|
sb.appendString(s[:n])
|
|
sb.buf = appendLowercaseInternal(sb.buf, s[n:])
|
|
f(sb.string())
|
|
}
|
|
|
|
// IsLowercase returns true if the given string does not contain uppercase characters.
|
|
func IsLowercase(s string) bool {
|
|
return uppercaseIndex(s) < 0
|
|
}
|
|
|
|
// uppercaseIndex returns the index of the first uppercase character in s,
|
|
// or -1 if s does not contain uppercase characters.
|
|
func uppercaseIndex(s string) int {
|
|
idx := 0
|
|
|
|
// Fast path for ASCII-only strings - process 8 bytes at a time.
|
|
for idx <= len(s)-8 {
|
|
v := uint64FromString(s[idx:])
|
|
// ASCII characters have the 8th bit clear.
|
|
// The operation bellow is the same as s[idx] < utf8.RuneSelf, but for multiple bytes.
|
|
if isASCII := v&0x8080808080808080 == 0; !isASCII {
|
|
break
|
|
}
|
|
|
|
// Check if any byte lacks the 6th bit, which indicates uppercase symbol or '@', '[', '\', ']', '^', '_'.
|
|
mightHaveUpper := ^v&0x2020202020202020 != 0
|
|
if mightHaveUpper {
|
|
for j := 0; j < 8; j++ {
|
|
c := s[idx+j]
|
|
if c >= 'A' && c <= 'Z' {
|
|
return idx + j
|
|
}
|
|
}
|
|
}
|
|
idx += 8
|
|
}
|
|
|
|
// Handle the rest of the s.
|
|
for idx < len(s) {
|
|
if c := s[idx]; c < utf8.RuneSelf {
|
|
if c >= 'A' && c <= 'Z' {
|
|
return idx
|
|
}
|
|
idx++
|
|
continue
|
|
}
|
|
r, size := utf8.DecodeRuneInString(s[idx:])
|
|
if r != unicode.ToLower(r) {
|
|
return idx
|
|
}
|
|
idx += size
|
|
}
|
|
return -1
|
|
}
|
|
|
|
// uint64FromString interprets the first 8 bytes of string b as a little-endian uint64.
|
|
// The same as binary.LittleEndian.Uint64, but operates on strings.
|
|
//
|
|
// This function is a bit slower than (*uint64)(unsafe.Pointer(ptr)) alternative,
|
|
// but does not have the issue with data alignment. See: https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3927
|
|
func uint64FromString(b string) uint64 {
|
|
_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
|
|
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
|
|
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
|
|
}
|
|
|
|
type stringBuilder struct {
|
|
buf []byte
|
|
}
|
|
|
|
func (sb *stringBuilder) appendString(s string) {
|
|
sb.buf = append(sb.buf, s...)
|
|
}
|
|
|
|
func (sb *stringBuilder) reset() {
|
|
sb.buf = sb.buf[:0]
|
|
}
|
|
|
|
func (sb *stringBuilder) string() string {
|
|
return unsafe.String(unsafe.SliceData(sb.buf), len(sb.buf))
|
|
}
|
|
|
|
var stringBuilderPool = sync.Pool{
|
|
New: func() any {
|
|
return &stringBuilder{}
|
|
},
|
|
}
|
|
|
|
func getStringBuilder() *stringBuilder {
|
|
return stringBuilderPool.Get().(*stringBuilder)
|
|
}
|
|
|
|
func putStringBuilder(sb *stringBuilder) {
|
|
sb.reset()
|
|
stringBuilderPool.Put(sb)
|
|
}
|