Compare commits

..

1 Commits

Author SHA1 Message Date
Vadim Alekseev
de3690671b lib/stringsutil: optimize AppendLowercase
The optimization includes the following improvements:
- Implementation of a function that processes 8 bytes per loop iteration to locate ASCII characters using bitwise manipulations.
- Implementation of the ToLowercaseFunc function that prevents string copying if the string is already in lowercase.
- Use of a lookup table for converting ASCII characters to lowercase, with logic copied from the VictoriaLogs repository.
2026-04-16 02:22:45 +04:00
5 changed files with 484 additions and 111 deletions

View File

@@ -33,7 +33,6 @@ type FS struct {
//
// The returned fs must be stopped with MustStop call when it is no longer needed.
func (fs *FS) Init() error {
fs.Dir = filepath.Clean(fs.Dir)
if fs.MaxBytesPerSecond > 0 {
fs.bl = newBandwidthLimiter(fs.MaxBytesPerSecond)
}

View File

@@ -1,29 +0,0 @@
package fslocal
import (
"os"
"path/filepath"
"testing"
)
// TestFSInitCleanDir checks that an FS whose Dir ends with a path separator
// can be initialized and lists the single file stored in that directory.
func TestFSInitCleanDir(t *testing.T) {
	root := t.TempDir()
	filePath := filepath.Join(root, "testfile")
	if err := os.WriteFile(filePath, []byte("x"), 0600); err != nil {
		t.Fatal(err)
	}

	// Dir deliberately carries a trailing separator: ListParts must not panic on it.
	dirWithSeparator := root + string(filepath.Separator)
	fs := &FS{Dir: dirWithSeparator}
	if err := fs.Init(); err != nil {
		t.Fatalf("Init error: %s", err)
	}
	defer fs.MustStop()

	parts, err := fs.ListParts()
	if err != nil {
		t.Fatalf("ListParts error: %s", err)
	}
	if n := len(parts); n != 1 {
		t.Fatalf("expected 1 part, got %d", n)
	}
}

View File

@@ -1,8 +1,11 @@
package stringsutil
import (
"slices"
"sync"
"unicode"
"unicode/utf8"
"unsafe"
)
// LimitStringLen limits the length of s with maxLen.
@@ -21,9 +24,24 @@ func LimitStringLen(s string, maxLen int) string {
}
// AppendLowercase appends the lowercase form of s to dst and returns the extended slice.
//
// It is a faster alternative to strings.ToLower.
// Prefer ToLowercaseFunc when possible, since it does not copy s when s is already lowercase.
func AppendLowercase(dst []byte, s string) []byte {
	upperIdx := uppercaseIndex(s)
	if upperIdx >= 0 {
		// Slow path: s[:upperIdx] holds no uppercase characters, so it is copied verbatim
		// and only the remainder goes through the case conversion.
		dst = slices.Grow(dst, len(s))
		dst = append(dst, s[:upperIdx]...)
		return appendLowercaseInternal(dst, s[upperIdx:])
	}

	// Fast path: s contains no uppercase characters, so a plain copy is enough.
	return append(dst, s...)
}
func appendLowercaseInternal(dst []byte, s string) []byte {
dstLen := len(dst)
// Try fast path at first by assuming that s contains only ASCII chars.
@@ -49,3 +67,115 @@ func AppendLowercase(dst []byte, s string) []byte {
}
return dst
}
// ToLowercaseFunc invokes f once with the lowercase form of s.
//
// When s has no uppercase characters, f receives s itself and nothing is copied.
// Otherwise f receives a string built in a temporary buffer that is handed back
// for reuse once f returns, so the argument must not be retained after the f call.
func ToLowercaseFunc(s string, f func(s string)) {
	upperIdx := uppercaseIndex(s)
	if upperIdx < 0 {
		// s is already lowercase; pass it through as is.
		f(s)
		return
	}

	sb := getStringBuilder()
	defer putStringBuilder(sb)

	// The prefix before the first uppercase character is copied verbatim;
	// only the remainder needs case conversion.
	head, tail := s[:upperIdx], s[upperIdx:]
	sb.buf = slices.Grow(sb.buf, len(s))
	sb.appendString(head)
	sb.buf = appendLowercaseInternal(sb.buf, tail)
	f(sb.string())
}
// IsLowercase reports whether s contains no uppercase characters.
// An empty string is considered lowercase.
func IsLowercase(s string) bool {
	firstUpper := uppercaseIndex(s)
	return firstUpper < 0
}
// uppercaseIndex reports the byte offset of the first uppercase character in s.
// It returns -1 when s contains no uppercase characters.
func uppercaseIndex(s string) int {
	const (
		// highBitMask selects bit 7 of every byte; it is set only in non-ASCII bytes.
		highBitMask = 0x8080808080808080
		// caseBitMask selects bit 5 (0x20) of every byte; it is clear in 'A'..'Z'.
		caseBitMask = 0x2020202020202020
	)

	pos := 0

	// Word-at-a-time scan: inspect 8 bytes per iteration while they are all ASCII.
	for ; pos+8 <= len(s); pos += 8 {
		word := uint64FromString(s[pos:])
		if word&highBitMask != 0 {
			// At least one byte is >= utf8.RuneSelf; let the rune-aware loop below take over.
			break
		}
		if ^word&caseBitMask == 0 {
			// Every byte has bit 5 set, so none of them can be an uppercase ASCII letter.
			continue
		}
		// Some byte has bit 5 clear. That is true for 'A'..'Z', but also for
		// '@', '[', '\', ']', '^', '_' and ASCII control characters, so verify byte by byte.
		for k := pos; k < pos+8; k++ {
			if c := s[k]; 'A' <= c && c <= 'Z' {
				return k
			}
		}
	}

	// Scan whatever is left: the tail shorter than 8 bytes or everything from the first non-ASCII word.
	for pos < len(s) {
		c := s[pos]
		if c >= utf8.RuneSelf {
			// Multi-byte (or invalid) sequence: decode it and compare with its lowercase mapping.
			r, size := utf8.DecodeRuneInString(s[pos:])
			if unicode.ToLower(r) != r {
				return pos
			}
			pos += size
			continue
		}
		if 'A' <= c && c <= 'Z' {
			return pos
		}
		pos++
	}
	return -1
}
// uint64FromString interprets the first 8 bytes of string b as a little-endian uint64:
// b[0] becomes the least significant byte and b[7] the most significant one.
// The same as binary.LittleEndian.Uint64, but operates on strings.
//
// b must contain at least 8 bytes; otherwise the b[7] access below panics.
//
// This function is a bit slower than (*uint64)(unsafe.Pointer(ptr)) alternative,
// but does not have the issue with data alignment. See: https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3927
func uint64FromString(b string) uint64 {
_ = b[7] // bounds check hint to compiler; see golang.org/issue/14808
// Assemble the value byte by byte, shifting each byte into its little-endian position.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// stringBuilder is a minimal byte buffer that can expose its contents as a string without copying.
type stringBuilder struct {
	// buf holds the bytes accumulated so far.
	buf []byte
}

// appendString appends s to the end of the buffer.
func (sb *stringBuilder) appendString(s string) {
	sb.buf = append(sb.buf, s...)
}

// reset truncates the buffer to zero length while keeping its capacity for reuse.
func (sb *stringBuilder) reset() {
	sb.buf = sb.buf[:0]
}

// string returns the buffer contents as a string that shares memory with buf.
// The returned string must not be used after buf is modified.
func (sb *stringBuilder) string() string {
	data := sb.buf
	return unsafe.String(unsafe.SliceData(data), len(data))
}
// stringBuilderPool keeps stringBuilder instances for reuse between calls.
var stringBuilderPool = sync.Pool{
	New: func() any { return new(stringBuilder) },
}

// getStringBuilder obtains a stringBuilder from stringBuilderPool.
// Hand it back with putStringBuilder when it is no longer needed.
func getStringBuilder() *stringBuilder {
	v := stringBuilderPool.Get()
	return v.(*stringBuilder)
}

// putStringBuilder resets sb and returns it to stringBuilderPool.
// sb must not be used after this call.
func putStringBuilder(sb *stringBuilder) {
	sb.reset()
	stringBuilderPool.Put(sb)
}

View File

@@ -23,18 +23,255 @@ func TestLimitStringLen(t *testing.T) {
f("abcde", 5, "abcde")
}
func TestAppendLowercase(t *testing.T) {
f := func(s, resultExpected string) {
func TestAppendLowercaseToLowercaseFunc(t *testing.T) {
f := func(s, expected string) {
t.Helper()
result := AppendLowercase(nil, s)
if string(result) != resultExpected {
t.Fatalf("unexpected result; got %q; want %q", result, resultExpected)
got := AppendLowercase(nil, s)
if string(got) != expected {
t.Fatalf("unexpected result; got %q; want %q", got, expected)
}
// ToLowercaseFunc must hand the same lowercase value to its callback.
ToLowercaseFunc(s, func(lower string) {
	if lower != expected {
		// Report the value received by the callback rather than the AppendLowercase result,
		// otherwise the failure message would show the wrong "got" value.
		t.Fatalf("unexpected result; got %q; want %q", lower, expected)
	}
})
}
// Empty string
f("", "")
// ASCII lowercase
f("hello", "hello")
f("world", "world")
f("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz")
// ASCII uppercase
f("HELLO", "hello")
f("WORLD", "world")
f("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")
// ASCII mixed case
f("Hello", "hello")
f("heLLo", "hello")
f("WOrld", "world")
f("HeLLo WoRLd", "hello world")
// Unicode Cyrillic
f("привіт", "привіт")
f("світ", "світ")
f("ПРИВІТ", "привіт")
f("СВІТ", "світ")
f("Привіт", "привіт")
f("приВіт", "привіт")
// Unicode Greek
f("αβγδε", "αβγδε")
f("ΑΒΓΔΕ", "αβγδε")
f("Αβγδε", "αβγδε")
// Latin Extended
f("café", "café")
f("naïve", "naïve")
f("niño", "niño")
f("ærøå", "ærøå")
f("ñüöäß", "ñüöäß")
f("CAFÉ", "café")
f("NAÏVE", "naïve")
f("NIÑO", "niño")
f("ÆRØÅ", "ærøå")
f("ÑÜÖÄ", "ñüöä")
f("Café", "café")
f("naÏve", "naïve")
f("Niño", "niño")
// Thai
f("สวัสดี", "สวัสดี")
f("โลก", "โลก")
// Japanese Hiragana
f("こんにちは", "こんにちは")
f("せかい", "せかい")
// Japanese Katakana
f("コンニチハ", "コンニチハ")
f("セカイ", "セカイ")
// Chinese
f("你好", "你好")
f("世界", "世界")
// Devanagari
f("नमस्ते", "नमस्ते")
f("दुनिया", "दुनिया")
// Georgian
f("გამარჯობა", "გამარჯობა")
f("ᲒᲐᲛᲐᲠᲯᲝᲑᲐ", "გამარჯობა")
// Armenian
f("բարեւ", "բարեւ")
f("ԲԱՐԵՒ", "բարեւ")
// Turkish
f("İSTANBUL", "istanbul")
// Mixed languages
f("hello世界", "hello世界")
f("привет123", "привет123")
f("test你好", "test你好")
f("Hello世界", "hello世界")
f("Привет123", "привет123")
f("Test你好", "test你好")
// Emoji and symbols
f("hello😀world", "hello😀world")
f("test✨case", "test✨case")
f("foo🎉bar", "foo🎉bar")
f("HELLO😀WORLD", "hello😀world")
// Digits
f("hello123", "hello123")
f("test456world", "test456world")
f("abc123def456", "abc123def456")
f("123", "123")
f("456789", "456789")
f("0", "0")
f("HELLO123", "hello123")
f("TEST456WORLD", "test456world")
f("ABC123DEF456", "abc123def456")
// Special characters
f("hello-world", "hello-world")
f("test_case", "test_case")
f("foo.bar", "foo.bar")
f("a@b#c$d", "a@b#c$d")
f("!@#$%", "!@#$%")
f(".,;:-_", ".,;:-_")
f("()[]{}", "()[]{}")
f("HELLO-WORLD", "hello-world")
f("TEST_CASE", "test_case")
f("FOO.BAR", "foo.bar")
f("A@B#C$D", "a@b#c$d")
}
func TestIsLower(t *testing.T) {
f := func(s string, want bool) {
t.Helper()
if IsLowercase(s) != want {
t.Fatalf("unexpected result; got %v; want %v for %q", IsLowercase(s), want, s)
}
}
f("", "")
f("foo", "foo")
f("FOO", "foo")
f("foo БаР baz 123", "foo бар baz 123")
// Empty string
f("", true)
// ASCII lowercase
f("hello", true)
f("world", true)
f("abcdefghijklmnopqrstuvwxyz", true)
// ASCII uppercase
f("HELLO", false)
f("WORLD", false)
f("ABCDEFGHIJKLMNOPQRSTUVWXYZ", false)
// ASCII mixed case
f("Hello", false)
f("heLLo", false)
f("WOrld", false)
// Unicode Cyrillic
f("привіт", true)
f("світ", true)
f("ПРИВІТ", false)
f("СВІТ", false)
f("Привіт", false)
f("приВіт", false)
// Unicode Greek
f("αβγδε", true)
f("ΑΒΓΔΕ", false)
f("Αβγδε", false)
// Latin Extended with diacritics
f("café", true)
f("naïve", true)
f("niño", true)
f("ærøå", true)
f("ñüöäß", true)
f("CAFÉ", false)
f("NAÏVE", false)
f("NIÑO", false)
f("ÆRØÅ", false)
f("ÑÜÖÄ", false)
f("Café", false)
f("naÏve", false)
f("Niño", false)
// Thai
f("สวัสดี", true)
f("โลก", true)
// Japanese Hiragana
f("こんにちは", true)
f("せかい", true)
// Japanese Katakana
f("コンニチハ", true)
f("セカイ", true)
// Chinese characters
f("你好", true)
f("世界", true)
// Devanagari
f("नमस्ते", true)
f("दुनिया", true)
// Georgian
f("გამარჯობა", true)
f("ᲒᲐᲛᲐᲠᲯᲝᲑᲐ", false)
// Armenian
f("բարեւ", true)
f("ԲԱՐԵՒ", false)
// Mixed languages
f("hello世界", true)
f("привет123", true)
f("test你好", true)
f("Hello世界", false)
f("Привет123", false)
f("Test你好", false)
// Emoji and symbols
f("hello😀world", true)
f("test✨case", true)
f("foo🎉bar", true)
// Digits
f("hello123", true)
f("test456world", true)
f("abc123def456", true)
f("123", true)
f("456789", true)
f("0", true)
f("HELLO123", false)
f("TEST456WORLD", false)
f("ABC123DEF456", false)
// Special characters
f("hello-world", true)
f("test_case", true)
f("foo.bar", true)
f("a@b#c$d", true)
f("!@#$%", true)
f(".,;:-_", true)
f("()[]{}", true)
f("HELLO-WORLD", false)
f("TEST_CASE", false)
f("FOO.BAR", false)
f("A@B#C$D", false)
}

View File

@@ -1,96 +1,132 @@
package stringsutil
import (
"strings"
"sync/atomic"
"testing"
)
func BenchmarkAppendLowercase(b *testing.B) {
b.Run("ascii-all-lowercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"foo bar baz abc def", "23k umlkds", "lq, poweri2349)"})
b.Run("ascii-full-lowercase", func(b *testing.B) {
data := `started kubernetes log collector for node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`
benchmarkToLower(b, data)
})
b.Run("ascii-some-uppercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"Foo Bar baz ABC def", "23k umlKDs", "lq, Poweri2349)"})
b.Run("ascii-partial-lowercase", func(b *testing.B) {
data := `started Kubernetes log collector for Node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`
benchmarkToLower(b, data)
})
b.Run("ascii-all-uppercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"FOO BAR BAZ ABC DEF", "23K UMLKDS", "LQ, POWERI2349)"})
b.Run("ascii-full-uppercase", func(b *testing.B) {
data := `STARTED KUBERNETES LOG COLLECTOR FOR NODE "GKE-SANDBOX-E2-STANDARD-8-20250715071-5B0A2CE9-VYKO"`
benchmarkToLower(b, data)
})
b.Run("unicode-all-lowercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"хщцукодл длобючф дл", "23и юбывлц", "лф, длощшу2349)"})
b.Run("ascii-partial-uppercase", func(b *testing.B) {
data := `started KUBERNETES log collector FOR NODE "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`
benchmarkToLower(b, data)
})
b.Run("unicode-some-uppercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"Хщцукодл Длобючф ДЛ", "23и юбыВЛц", "лф, Длощшу2349)"})
b.Run("ascii-full-title", func(b *testing.B) {
data := `Started Kubernetes Log Collector For Node "Gke-Sandbox-E2-Standard-8-20250715071-5b0a2ce9-Vyko"`
benchmarkToLower(b, data)
})
b.Run("unicode-all-uppercase", func(b *testing.B) {
benchmarkAppendLowercase(b, []string{"ХЩЦУКОДЛ ДЛОБЮЧФ ДЛ", "23И ЮБЫВЛЦ", "ЛФ, ДЛОЩШУ2349)"})
b.Run("ascii-partial-title", func(b *testing.B) {
data := `started Kubernetes log Collector for Node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`
benchmarkToLower(b, data)
})
b.Run("ascii-mixcase", func(b *testing.B) {
data := `Started Kubernetes log COLLECTOR for nodE "GKE-Sandbox-E2-Standard-8-20250715071-5b0a2ce9-VYKO"`
benchmarkToLower(b, data)
})
b.Run("unicode-full-lowercase", func(b *testing.B) {
data := `запущен кубернетес лог коллектор на ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`
benchmarkToLower(b, data)
})
b.Run("unicode-partial-lowercase", func(b *testing.B) {
data := `запущен КубернеТЕС лОг кОллектор нА НодЕ гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`
benchmarkToLower(b, data)
})
b.Run("unicode-full-uppercase", func(b *testing.B) {
data := `ЗАПУЩЕН КУБЕРНЕТЕС ЛОГ КОЛЛЕКТОР НА НОДЕ ГКЕ-СЕНДБОКС-Е2-СТАНДАРТ-8-20250715071-5В0А2СЕ9-ВИКО`
benchmarkToLower(b, data)
})
b.Run("unicode-partial-uppercase", func(b *testing.B) {
data := `запущен КУБЕРНЕТЕС лог коллектор НА НОДЕ гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`
benchmarkToLower(b, data)
})
b.Run("unicode-full-title", func(b *testing.B) {
data := `Запущен Кубернетес Лог Коллектор На Ноде Гке-Сендбокс-Е2-Стандарт-8-20250715071-5В0а2се9-Вико`
benchmarkToLower(b, data)
})
b.Run("unicode-partial-title", func(b *testing.B) {
data := `запущен Кубернетес лог Коллектор на Ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`
benchmarkToLower(b, data)
})
b.Run("unicode-mixcase", func(b *testing.B) {
data := `Запущен Кубернетес лог КОЛЛЕКТОР на нодЕ гке-Сендбокс-Е2-Стандарт-8-20250715071-5В0а2се9-ВИКО`
benchmarkToLower(b, data)
})
}
func benchmarkAppendLowercase(b *testing.B, a []string) {
n := 0
for _, s := range a {
n += len(s)
}
func benchmarkToLower(b *testing.B, s string) {
b.Helper()
b.ReportAllocs()
b.SetBytes(int64(n))
b.SetBytes(int64(len(s)))
b.RunParallel(func(pb *testing.PB) {
var buf []byte
var n uint64
for pb.Next() {
buf = buf[:0]
for _, s := range a {
buf = AppendLowercase(buf, s)
}
n += uint64(len(buf))
buf = AppendLowercase(buf[:0], s)
}
GlobalSink.Add(n)
})
}
func BenchmarkStringsToLower(b *testing.B) {
b.Run("ascii-all-lowercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"foo bar baz abc def", "23k umlkds", "lq, poweri2349)"})
})
b.Run("ascii-some-uppercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"Foo Bar baz ABC def", "23k umlKDs", "lq, Poweri2349)"})
})
b.Run("ascii-all-uppercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"FOO BAR BAZ ABC DEF", "23K UMLKDS", "LQ, POWERI2349)"})
})
b.Run("unicode-all-lowercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"хщцукодл длобючф дл", "23и юбывлц", "лф, длощшу2349)"})
})
b.Run("unicode-some-uppercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"Хщцукодл Длобючф ДЛ", "23и юбыВЛц", "лф, Длощшу2349)"})
})
b.Run("unicode-all-uppercase", func(b *testing.B) {
benchmarkStringsToLower(b, []string{"ХЩЦУКОДЛ ДЛОБЮЧФ ДЛ", "23И ЮБЫВЛЦ", "ЛФ, ДЛОЩШУ2349)"})
})
}
func benchmarkStringsToLower(b *testing.B, a []string) {
n := 0
for _, s := range a {
n += len(s)
}
b.ReportAllocs()
b.SetBytes(int64(n))
b.RunParallel(func(pb *testing.PB) {
var buf []byte
var n uint64
for pb.Next() {
buf = buf[:0]
for _, s := range a {
sLower := strings.ToLower(s)
buf = append(buf, sLower...)
}
n += uint64(len(buf))
}
GlobalSink.Add(n)
GlobalSink.Add(uint64(len(buf)))
})
}
// GlobalSink accumulates values added by the benchmarks in this file.
// NOTE(review): presumably it exists so that benchmark results are observably used
// and the measured calls cannot be optimized away — confirm.
var GlobalSink atomic.Uint64
// BenchmarkIsLowercase measures IsLowercase on ASCII and Cyrillic inputs
// that either contain no uppercase characters or contain them at different positions.
func BenchmarkIsLowercase(b *testing.B) {
	cases := []struct {
		name      string
		data      string
		lowercase bool // expected IsLowercase result for data
	}{
		{
			name:      "ascii-mismatch",
			data:      `started kubernetes log collector for node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`,
			lowercase: true,
		},
		{
			name:      "ascii-match-start",
			data:      `started Kubernetes log collector for Node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`,
			lowercase: false,
		},
		{
			name:      "ascii-match-middle",
			data:      `started kubernetes log collector for Node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyko"`,
			lowercase: false,
		},
		{
			name:      "ascii-match-end",
			data:      `started kubernetes log collector for node "gke-sandbox-e2-standard-8-20250715071-5b0a2ce9-vyKo"`,
			lowercase: false,
		},
		{
			name:      "unicode-mismatch",
			data:      `запущен кубернетес лог коллектор на ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`,
			lowercase: true,
		},
		{
			name:      "unicode-match-start",
			data:      `запущен Кубернетес лог коллектор на ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`,
			lowercase: false,
		},
		{
			name:      "unicode-match-middle",
			data:      `запущен кубернетес лог коллектор на Ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-вико`,
			lowercase: false,
		},
		{
			name:      "unicode-match-end",
			data:      `запущен кубернетес лог коллектор на ноде гке-сендбокс-е2-стандарт-8-20250715071-5в0а2се9-виКо`,
			lowercase: false,
		},
	}

	// Sub-benchmarks run sequentially in declaration order, one per case.
	for i := range cases {
		tc := cases[i]
		b.Run(tc.name, func(b *testing.B) {
			benchmarkIsLowercase(b, tc.data, tc.lowercase)
		})
	}
}
// benchmarkIsLowercase measures IsLowercase on s from parallel goroutines
// and reports an error if IsLowercase(s) differs from expected.
func benchmarkIsLowercase(b *testing.B, s string, expected bool) {
	b.Helper()
	b.ReportAllocs()
	b.SetBytes(int64(len(s)))
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			if IsLowercase(s) != expected {
				// This body runs on worker goroutines started by RunParallel.
				// Fatalf (FailNow) must only be called from the goroutine running the benchmark
				// function, so report the failure with Errorf and stop this worker instead.
				b.Errorf("expected IsLowercase(%q) to return %v", s, expected)
				return
			}
		}
	})
}