app/vmalert/notifier: fix test after b1e998f7

Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
app/vmalert/utils: unregister metrics only if there is no refs left
2026-05-17 16:59:40 +03:00 · 2025-02-06 15:02:47 +04:00 · 2025-02-06 14:49:26 +04:00 · 2025-02-06 09:16:56 +01:00 · 2025-02-05 22:48:03 +01:00 · 2025-02-05 17:10:11 +01:00
42 changed files with 1658 additions and 2185 deletions
--- a/app/vlinsert/insertutils/timestamp.go
+++ b/app/vlinsert/insertutils/timestamp.go
@@ -2,19 +2,20 @@ package insertutils

 import (
 	"fmt"
+	"math"
 	"strconv"
 	"time"

 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
 )

-// ExtractTimestampFromFields extracts timestamp in nanoseconds from the field with the name timeField at fields.
+// ExtractTimestampRFC3339NanoFromFields extracts RFC3339 timestamp in nanoseconds from the field with the name timeField at fields.
 //
 // The value for the timeField is set to empty string after returning from the function,
 // so it could be ignored during data ingestion.
 //
 // The current timestamp is returned if fields do not contain a field with timeField name or if the timeField value is empty.
-func ExtractTimestampFromFields(timeField string, fields []logstorage.Field) (int64, error) {
+func ExtractTimestampRFC3339NanoFromFields(timeField string, fields []logstorage.Field) (int64, error) {
 	for i := range fields {
 		f := &fields[i]
 		if f.Name != timeField {
@@ -47,24 +48,22 @@ func parseTimestamp(s string) (int64, error) {
 	return nsecs, nil
 }

-// ParseUnixTimestamp parses s as unix timestamp in seconds, milliseconds, microseconds or nanoseconds and returns the parsed timestamp in nanoseconds.
+// ParseUnixTimestamp parses s as unix timestamp in either seconds or milliseconds and returns the parsed timestamp in nanoseconds.
 func ParseUnixTimestamp(s string) (int64, error) {
 	n, err := strconv.ParseInt(s, 10, 64)
 	if err != nil {
 		return 0, fmt.Errorf("cannot parse unix timestamp from %q: %w", s, err)
 	}
 	if n < (1<<31) && n >= (-1<<31) {
-		// The timestamp is in seconds.
-		return n * 1e9, nil
+		// The timestamp is in seconds. Convert it to milliseconds
+		n *= 1e3
 	}
-	if n < 1e3*(1<<31) && n >= 1e3*(-1<<31) {
-		// The timestamp is in milliseconds.
-		return n * 1e6, nil
+	if n > int64(math.MaxInt64)/1e6 {
+		return 0, fmt.Errorf("too big timestamp in milliseconds: %d; mustn't exceed %d", n, int64(math.MaxInt64)/1e6)
 	}
-	if n < 1e6*(1<<31) && n >= 1e6*(-1<<31) {
-		// The timestamp is in microseconds.
-		return n * 1e3, nil
+	if n < int64(math.MinInt64)/1e6 {
+		return 0, fmt.Errorf("too small timestamp in milliseconds: %d; must be bigger than %d", n, int64(math.MinInt64)/1e6)
 	}
-	// The timestamp is in nanoseconds
+	n *= 1e6
 	return n, nil
 }
--- a/app/vlinsert/insertutils/timestamp_test.go
+++ b/app/vlinsert/insertutils/timestamp_test.go
@@ -6,11 +6,11 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage"
 )

-func TestExtractTimestampFromFields_Success(t *testing.T) {
+func TestExtractTimestampRFC3339NanoFromFields_Success(t *testing.T) {
 	f := func(timeField string, fields []logstorage.Field, nsecsExpected int64) {
 		t.Helper()

-		nsecs, err := ExtractTimestampFromFields(timeField, fields)
+		nsecs, err := ExtractTimestampRFC3339NanoFromFields(timeField, fields)
 		if err != nil {
 			t.Fatalf("unexpected error: %s", err)
 		}
@@ -51,18 +51,6 @@ func TestExtractTimestampFromFields_Success(t *testing.T) {
 		{Name: "foo", Value: "bar"},
 	}, 1718773640123456789)

-	// Unix timestamp in nanoseconds
-	f("time", []logstorage.Field{
-		{Name: "foo", Value: "bar"},
-		{Name: "time", Value: "1718773640123456789"},
-	}, 1718773640123456789)
-
-	// Unix timestamp in microseconds
-	f("time", []logstorage.Field{
-		{Name: "foo", Value: "bar"},
-		{Name: "time", Value: "1718773640123456"},
-	}, 1718773640123456000)
-
 	// Unix timestamp in milliseconds
 	f("time", []logstorage.Field{
 		{Name: "foo", Value: "bar"},
@@ -76,14 +64,14 @@ func TestExtractTimestampFromFields_Success(t *testing.T) {
 	}, 1718773640000000000)
 }

-func TestExtractTimestampFromFields_Error(t *testing.T) {
+func TestExtractTimestampRFC3339NanoFromFields_Error(t *testing.T) {
 	f := func(s string) {
 		t.Helper()

 		fields := []logstorage.Field{
 			{Name: "time", Value: s},
 		}
-		nsecs, err := ExtractTimestampFromFields("time", fields)
+		nsecs, err := ExtractTimestampRFC3339NanoFromFields("time", fields)
 		if err == nil {
 			t.Fatalf("expecting non-nil error")
 		}
@@ -92,7 +80,6 @@ func TestExtractTimestampFromFields_Error(t *testing.T) {
 		}
 	}

-	// invalid time
 	f("foobar")

 	// incomplete time
--- a/app/vlinsert/jsonline/jsonline.go
+++ b/app/vlinsert/jsonline/jsonline.go
@@ -99,7 +99,7 @@ func readLine(lr *insertutils.LineReader, timeField string, msgFields []string,
 	if err := p.ParseLogMessage(line); err != nil {
 		return false, fmt.Errorf("cannot parse json-encoded log entry: %w", err)
 	}
-	ts, err := insertutils.ExtractTimestampFromFields(timeField, p.Fields)
+	ts, err := insertutils.ExtractTimestampRFC3339NanoFromFields(timeField, p.Fields)
 	if err != nil {
 		return false, fmt.Errorf("cannot get timestamp: %w", err)
 	}
--- a/app/vlinsert/syslog/syslog.go
+++ b/app/vlinsert/syslog/syslog.go
@@ -560,7 +560,7 @@ func processLine(line []byte, currentYear int, timezone *time.Location, useLocal
 	if useLocalTimestamp {
 		ts = time.Now().UnixNano()
 	} else {
-		nsecs, err := insertutils.ExtractTimestampFromFields("timestamp", p.Fields)
+		nsecs, err := insertutils.ExtractTimestampRFC3339NanoFromFields("timestamp", p.Fields)
 		if err != nil {
 			return fmt.Errorf("cannot get timestamp from syslog line %q: %w", line, err)
 		}
--- a/app/vmalert/notifier/notifier_blackhole_test.go
+++ b/app/vmalert/notifier/notifier_blackhole_test.go
@@ -2,6 +2,7 @@ package notifier

 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"

@@ -27,10 +28,12 @@ func TestBlackHoleNotifier_Send(t *testing.T) {
 }

 func TestBlackHoleNotifier_Close(t *testing.T) {
+	addr := "blackhole-close"
 	bh := newBlackHoleNotifier()
+	bh.addr = addr
 	if err := bh.Send(context.Background(), []Alert{{
 		GroupID:     0,
-		Name:        "alert0",
+		Name:        "alert1",
 		Start:       time.Now().UTC(),
 		End:         time.Now().UTC(),
 		Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
@@ -41,10 +44,10 @@ func TestBlackHoleNotifier_Close(t *testing.T) {
 	bh.Close()

 	defaultMetrics := metricset.GetDefaultSet()
-	alertMetricName := "vmalert_alerts_sent_total{addr=\"blackhole\"}"
+	alertMetricName := fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", addr)
 	for _, name := range defaultMetrics.ListMetricNames() {
 		if name == alertMetricName {
-			t.Fatalf("Metric name should have unregistered.But still present")
+			t.Fatalf("Metric name should have unregistered. But still present")
 		}
 	}
 }
--- a/app/vmalert/utils/metrics.go
+++ b/app/vmalert/utils/metrics.go
@@ -1,14 +1,56 @@
 package utils

-import "github.com/VictoriaMetrics/metrics"
+import (
+	"sync"
+	"sync/atomic"
+
+	"github.com/VictoriaMetrics/metrics"
+
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
+)

 type namedMetric struct {
 	Name string
 }

+var usedMetrics map[string]*atomic.Int64
+var usedMetricMu sync.Mutex
+
+func trackUsedMetric(name string) {
+	usedMetricMu.Lock()
+	defer usedMetricMu.Unlock()
+
+	if usedMetrics == nil {
+		usedMetrics = make(map[string]*atomic.Int64)
+	}
+	if _, ok := usedMetrics[name]; !ok {
+		usedMetrics[name] = &atomic.Int64{}
+	}
+	usedMetrics[name].Add(1)
+}
+
 // Unregister removes the metric by name from default registry
 func (nm namedMetric) Unregister() {
-	metrics.UnregisterMetric(nm.Name)
+	if usedMetrics == nil {
+		logger.Fatalf("BUG: unregistered metric %q before registering", nm.Name)
+	}
+
+	usedMetricMu.Lock()
+	counter, ok := usedMetrics[nm.Name]
+	if !ok {
+		logger.Fatalf("BUG: unregistered metric %q before registering", nm.Name)
+	}
+	current := counter.Add(-1)
+	usedMetricMu.Unlock()
+
+	if current < 0 {
+		logger.Fatalf("BUG: negative metric counter for %q", nm.Name)
+	}
+
+	if current == 0 {
+		metrics.UnregisterMetric(nm.Name)
+	}
+
 }

 // Gauge is a metrics.Gauge with Name
@@ -19,6 +61,7 @@ type Gauge struct {

 // GetOrCreateGauge creates a new Gauge with the given name
 func GetOrCreateGauge(name string, f func() float64) *Gauge {
+	trackUsedMetric(name)
 	return &Gauge{
 		namedMetric: namedMetric{Name: name},
 		Gauge:       metrics.GetOrCreateGauge(name, f),
@@ -33,6 +76,7 @@ type Counter struct {

 // GetOrCreateCounter creates a new Counter with the given name
 func GetOrCreateCounter(name string) *Counter {
+	trackUsedMetric(name)
 	return &Counter{
 		namedMetric: namedMetric{Name: name},
 		Counter:     metrics.GetOrCreateCounter(name),
@@ -47,6 +91,7 @@ type Summary struct {

 // GetOrCreateSummary creates a new Summary with the given name
 func GetOrCreateSummary(name string) *Summary {
+	trackUsedMetric(name)
 	return &Summary{
 		namedMetric: namedMetric{Name: name},
 		Summary:     metrics.GetOrCreateSummary(name),
--- a/app/vmalert/utils/metrics_test.go
+++ b/app/vmalert/utils/metrics_test.go
@@ -0,0 +1,52 @@
+package utils
+
+import (
+	"testing"
+
+	"github.com/VictoriaMetrics/metrics"
+)
+
+func isMetricRegistered(name string) bool {
+	metricNames := metrics.GetDefaultSet().ListMetricNames()
+	for _, mn := range metricNames {
+		if mn == name {
+			return true
+		}
+	}
+
+	return false
+}
+
+func TestMetricIsUnregistered(t *testing.T) {
+	metricName := "example_runs_total"
+	c := GetOrCreateCounter(metricName)
+	if !isMetricRegistered(metricName) {
+		t.Errorf("Expected metric %s to be present", metricName)
+	}
+
+	c.Unregister()
+	if isMetricRegistered(metricName) {
+		t.Errorf("Expected metric %s to be unregistered", metricName)
+	}
+}
+
+func TestMetricIsRemovedIfNoUses(t *testing.T) {
+	metricName := "example_runs_total"
+	c := GetOrCreateCounter(metricName)
+	c2 := GetOrCreateCounter(metricName)
+
+	if !isMetricRegistered(metricName) {
+		t.Errorf("Expected metric %s to be present", metricName)
+	}
+
+	c.Unregister()
+	// metric should still be registered since c2 is using it
+	if !isMetricRegistered(metricName) {
+		t.Errorf("Expected metric %s to be present", metricName)
+	}
+
+	c2.Unregister()
+	if isMetricRegistered(metricName) {
+		t.Errorf("Expected metric %s to be unregistered", metricName)
+	}
+}
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@@ -8,6 +8,7 @@ import (
 	"sort"
 	"sync"
 	"sync/atomic"
+	"time"
 	"unsafe"

 	"github.com/VictoriaMetrics/metrics"
@@ -1001,7 +1002,9 @@ func ExportBlocks(qt *querytracer.Tracer, sq *storage.SearchQuery, deadline sear

 	sr := getStorageSearch()
 	defer putStorageSearch(sr)
+	startTime := time.Now()
 	sr.Init(qt, vmstorage.Storage, tfss, tr, sq.MaxMetrics, deadline.Deadline())
+	indexSearchDuration.UpdateDuration(startTime)

 	// Start workers that call f in parallel on available CPU cores.
 	workCh := make(chan *exportWork, gomaxprocs*8)
@@ -1139,7 +1142,9 @@ func ProcessSearchQuery(qt *querytracer.Tracer, sq *storage.SearchQuery, deadlin
 	defer vmstorage.WG.Done()

 	sr := getStorageSearch()
+	startTime := time.Now()
 	maxSeriesCount := sr.Init(qt, vmstorage.Storage, tfss, tr, sq.MaxMetrics, deadline.Deadline())
+	indexSearchDuration.UpdateDuration(startTime)
 	type blockRefs struct {
 		brs []blockRef
 	}
@@ -1291,6 +1296,8 @@ func ProcessSearchQuery(qt *querytracer.Tracer, sq *storage.SearchQuery, deadlin
 	return &rss, nil
 }

+var indexSearchDuration = metrics.NewHistogram(`vm_index_search_duration_seconds`)
+
 type blockRef struct {
 	partRef storage.PartRef
 	addr    tmpBlockAddr
--- a/app/vmselect/vmui/asset-manifest.json
+++ b/app/vmselect/vmui/asset-manifest.json
@@ -1,13 +1,13 @@
 {
  "files": {
-    "main.css": "./static/css/main.7fa18e1b.css",
-    "main.js": "./static/js/main.ba08300f.js",
+    "main.css": "./static/css/main.af583aad.css",
+    "main.js": "./static/js/main.1413b18d.js",
    "static/js/685.f772060c.chunk.js": "./static/js/685.f772060c.chunk.js",
    "static/media/MetricsQL.md": "./static/media/MetricsQL.a00044c91d9781cf8557.md",
    "index.html": "./index.html"
  },
  "entrypoints": [
-    "static/css/main.7fa18e1b.css",
-    "static/js/main.ba08300f.js"
+    "static/css/main.af583aad.css",
+    "static/js/main.1413b18d.js"
  ]
 }
--- a/app/vmselect/vmui/index.html
+++ b/app/vmselect/vmui/index.html
@@ -1 +1 @@
-<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><link rel="apple-touch-icon" href="./favicon.svg"/><link rel="mask-icon" href="./favicon.svg" color="#000000"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="Explore and troubleshoot your VictoriaMetrics data"/><link rel="manifest" href="./manifest.json"/><title>vmui</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:site" content="@https://victoriametrics.com/"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:image" content="./preview.jpg"><meta property="og:type" content="website"><meta property="og:title" content="UI for VictoriaMetrics"><meta property="og:url" content="https://victoriametrics.com/"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><script defer="defer" src="./static/js/main.ba08300f.js"></script><link href="./static/css/main.7fa18e1b.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
+<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="./favicon.svg"/><link rel="apple-touch-icon" href="./favicon.svg"/><link rel="mask-icon" href="./favicon.svg" color="#000000"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5"/><meta name="theme-color" content="#000000"/><meta name="description" content="Explore and troubleshoot your VictoriaMetrics data"/><link rel="manifest" href="./manifest.json"/><title>vmui</title><script src="./dashboards/index.js" type="module"></script><meta name="twitter:card" content="summary"><meta name="twitter:title" content="UI for VictoriaMetrics"><meta name="twitter:site" content="@https://victoriametrics.com/"><meta name="twitter:description" content="Explore and troubleshoot your VictoriaMetrics data"><meta name="twitter:image" content="./preview.jpg"><meta property="og:type" content="website"><meta property="og:title" content="UI for VictoriaMetrics"><meta property="og:url" content="https://victoriametrics.com/"><meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data"><script defer="defer" src="./static/js/main.1413b18d.js"></script><link href="./static/css/main.af583aad.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
--- a/app/vmselect/vmui/static/css/main.7fa18e1b.css
+++ b/app/vmselect/vmui/static/css/main.7fa18e1b.css
--- a/app/vmselect/vmui/static/css/main.af583aad.css
+++ b/app/vmselect/vmui/static/css/main.af583aad.css
--- a/app/vmselect/vmui/static/js/main.1413b18d.js
+++ b/app/vmselect/vmui/static/js/main.1413b18d.js
--- a/app/vmselect/vmui/static/js/main.1413b18d.js.LICENSE.txt
+++ b/app/vmselect/vmui/static/js/main.1413b18d.js.LICENSE.txt
--- a/app/vmselect/vmui/static/js/main.ba08300f.js
+++ b/app/vmselect/vmui/static/js/main.ba08300f.js
--- a/app/vmui/Dockerfile-web
+++ b/app/vmui/Dockerfile-web
@@ -1,4 +1,4 @@
-FROM golang:1.23.6 AS build-web-stage
+FROM golang:1.23.5 AS build-web-stage
 COPY build /build

 WORKDIR /build
--- a/dashboards/vm/vmagent.json
+++ b/dashboards/vm/vmagent.json
--- a/dashboards/vmagent.json
+++ b/dashboards/vmagent.json
--- a/deployment/docker/Makefile
+++ b/deployment/docker/Makefile
@@ -6,7 +6,7 @@ ROOT_IMAGE ?= alpine:3.21.2
 ROOT_IMAGE_SCRATCH ?= scratch
 CERTS_IMAGE := alpine:3.21.2

-GO_BUILDER_IMAGE := golang:1.23.6-alpine
+GO_BUILDER_IMAGE := golang:1.23.5-alpine
 BUILDER_IMAGE := local/builder:2.0.0-$(shell echo $(GO_BUILDER_IMAGE) | tr :/ __)-1
 BASE_IMAGE := local/base:1.1.4-$(shell echo $(ROOT_IMAGE) | tr :/ __)-$(shell echo $(CERTS_IMAGE) | tr :/ __)
 DOCKER ?= docker
--- a/deployment/docker/docker-compose-cluster.yml
+++ b/deployment/docker/docker-compose-cluster.yml
@@ -152,7 +152,7 @@ services:
  # and distributes them according to --config.file.
  alertmanager:
    container_name: alertmanager
-    image: prom/alertmanager:v0.28.0
+    image: prom/alertmanager:v0.27.0
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
    command:
--- a/deployment/docker/docker-compose-victorialogs.yml
+++ b/deployment/docker/docker-compose-victorialogs.yml
@@ -126,7 +126,7 @@ services:
  # and distributes them according to --config.file.
  alertmanager:
    container_name: alertmanager
-    image: prom/alertmanager:v0.28.0
+    image: prom/alertmanager:v0.27.0
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
    command:
--- a/deployment/docker/docker-compose.yml
+++ b/deployment/docker/docker-compose.yml
@@ -93,7 +93,7 @@ services:
  # and distributes them according to --config.file.
  alertmanager:
    container_name: alertmanager
-    image: prom/alertmanager:v0.28.0
+    image: prom/alertmanager:v0.27.0
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
    command:
--- a/deployment/docker/vmanomaly/vmanomaly-integration/docker-compose.yml
+++ b/deployment/docker/vmanomaly/vmanomaly-integration/docker-compose.yml
@@ -89,7 +89,7 @@ services:
      - "--licenseFile=/license"
  alertmanager:
    container_name: alertmanager
-    image: prom/alertmanager:v0.28.0
+    image: prom/alertmanager:v0.27.0
    volumes:
      - ./alertmanager.yml:/config/alertmanager.yml
    command:
--- a/deployment/logs-benchmark/docker-compose-elk.yml
+++ b/deployment/logs-benchmark/docker-compose-elk.yml
@@ -18,7 +18,7 @@ services:
      - vlogs

  generator:
-    image: golang:1.23.6-alpine
+    image: golang:1.23.5-alpine
    restart: always
    working_dir: /go/src/app
    volumes:
--- a/deployment/logs-benchmark/docker-compose-loki.yml
+++ b/deployment/logs-benchmark/docker-compose-loki.yml
@@ -2,7 +2,7 @@ version: '3'

 services:
  generator:
-    image: golang:1.23.6-alpine
+    image: golang:1.23.5-alpine
    restart: always
    working_dir: /go/src/app
    volumes:
--- a/deployment/logs-benchmark/docker-compose.yml
+++ b/deployment/logs-benchmark/docker-compose.yml
@@ -58,7 +58,7 @@ services:
      - ./vmsingle/promscrape.yml:/promscrape.yml

  grafana:
-    image: grafana/grafana:11.5.0
+    image: grafana/grafana:9.2.7
    depends_on: [vmsingle]
    ports:
      - 3000:3000
--- a/docs/VictoriaLogs/CHANGELOG.md
+++ b/docs/VictoriaLogs/CHANGELOG.md
@@ -16,12 +16,8 @@ according to [these docs](https://docs.victoriametrics.com/victorialogs/quicksta

 ## tip

-* FEATURE: [LogsQL](https://docs.victoriametrics.com/victorialogs/logsql/): improve performance for [`stats by (...) ...`](https://docs.victoriametrics.com/victorialogs/logsql/#stats-pipe) by up to 30% when it is applied to big number of `by (...)` groups.
-* FEATURE: [LogsQL](https://docs.victoriametrics.com/victorialogs/logsql/): improve performance for [`top` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#top-pipe) by up to 30% when it is applied to big number of unique values.
-* FEATURE: [`stats` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#stats-pipe): improve performance for [`count_uniq`](https://docs.victoriametrics.com/victorialogs/logsql/#count_uniq-stats) and [`count_uniq_hash`](https://docs.victoriametrics.com/victorialogs/logsql/#count_uniq_hash-stats) functions by up to 30% when they are applied to big number of unique values.
 * FEATURE: [`block_stats` pipe](https://docs.victoriametrics.com/victorialogs/logsql/#block_stats-pipe): return the path to the part where every data block is stored. The path to the part is returned in the `part_path` field. This allows investigating the distribution of data blocks among parts.
 * FEATURE: reduce VictoriaLogs startup time by multiple times when it opens a large datastore with big [retention](https://docs.victoriametrics.com/victorialogs/#retention).
-* FEATURE: [data ingestion](https://docs.victoriametrics.com/victorialogs/data-ingestion/): accept timestamps with microsecond and nanosecond precision at [`_time` field](https://docs.victoriametrics.com/victorialogs/keyconcepts/#time-field).
 * FEATURE: [web UI](https://docs.victoriametrics.com/victorialogs/querying/#web-ui): add the `_msg` field to the list of fields for the group view, allowing users to select multiple fields, including `_msg`, for log display.

 * BUGFIX: [LogsQL](https://docs.victoriametrics.com/victorialogs/logsql/): properly limit [`concurrency` query option](https://docs.victoriametrics.com/victorialogs/logsql/#query-options) for [`stats`](https://docs.victoriametrics.com/victorialogs/logsql/#stats-pipe), [`uniq`](https://docs.victoriametrics.com/victorialogs/logsql/#uniq-pipe) and [`top`](https://docs.victoriametrics.com/victorialogs/logsql/#top-pipe). This prevents from `runtime error: index out of range` panic. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8201).
--- a/docs/VictoriaLogs/data-ingestion/README.md
+++ b/docs/VictoriaLogs/data-ingestion/README.md
@@ -56,8 +56,7 @@ Otherwise the timestamp field must be in one of the following formats:
  If timezone information is missing (for example, `2023-06-20 15:32:10`),
  then the time is parsed in the local timezone of the host where VictoriaLogs runs.

- Unix timestamp in seconds, milliseconds, microseconds or nanoseconds. For example, `1686026893` (seconds), `1686026893735` (milliseconds),
-  `1686026893735321` (microseconds) or `1686026893735321098` (nanoseconds).
+- Unix timestamp in seconds or in milliseconds. For example, `1686026893` (seconds) or `1686026893735` (milliseconds).

 See [these docs](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) for details on fields,
 which must be present in the ingested log messages.
@@ -112,8 +111,7 @@ Otherwise the timestamp field must be in one of the following formats:
  If timezone information is missing (for example, `2023-06-20 15:32:10`),
  then the time is parsed in the local timezone of the host where VictoriaLogs runs.

- Unix timestamp in seconds, milliseconds, microseconds or nanoseconds. For example, `1686026893` (seconds), `1686026893735` (milliseconds),
-  `1686026893735321` (microseconds) or `1686026893735321098` (nanoseconds).
+- Unix timestamp in seconds or in milliseconds. For example, `1686026893` (seconds) or `1686026893735` (milliseconds).

 See [these docs](https://docs.victoriametrics.com/victorialogs/keyconcepts/#data-model) for details on fields,
 which must be present in the ingested log messages.
--- a/docs/VictoriaLogs/keyConcepts.md
+++ b/docs/VictoriaLogs/keyConcepts.md
@@ -146,8 +146,7 @@ The timestamp field must be in one of the following formats:
  If timezone information is missing (for example, `2023-06-20 15:32:10`),
  then the time is parsed in the local timezone of the host where VictoriaLogs runs.

- Unix timestamp in seconds, milliseconds, microseconds or nanoseconds. For example, `1686026893` (seconds), `1686026893735` (milliseconds),
-  `1686026893735321` (microseconds) or `1686026893735321098` (nanoseconds).
+- Unix timestamp in seconds or in milliseconds. For example, `1686026893` (seconds) or `1686026893735` (milliseconds).

 For example, the following [log entry](#data-model) contains valid timestamp with millisecond precision in the `_time` field:

--- a/docs/changelog/CHANGELOG.md
+++ b/docs/changelog/CHANGELOG.md
@@ -18,16 +18,6 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).

 ## tip

-* SECURITY: upgrade Go builder from Go1.23.5 to Go1.23.6. See the list of issues addressed in [Go1.23.6](https://github.com/golang/go/issues?q=milestone%3AGo1.23.6+label%3ACherryPickApproved).
-
-* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/): fix polluted alert messages when multiple Alertmanager instances are configured as notifiers with alert_relabel_configs.. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8040), and thanks to @evkuzin for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/8258).
-
-## [v1.111.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.111.0)
-
-Released at 2025-02-07
-
-**Update note 1: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and  [vmstorage](https://docs.victoriametrics.com/victoriametrics/) stop exposing `vm_index_search_duration_seconds` histogram metric. This metric records time spent on search operations in the index. It was introduced in [v1.56.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.56.0). However, this metric was used neither in dashboards nor in alerting rules. It also has high cardinality because index search operations latency can differ by 3 orders of magnitude. Hence, dropping it as unused.**
-
 * FEATURE: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and [vmstorage](https://docs.victoriametrics.com/cluster-victoriametrics/): improve startup times when opening a storage with the [retention](https://docs.victoriametrics.com/#retention) exceeding a few months.
 * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to switch the heatmap to a line chart. Now, vmui would suggest to switch to line graph display if heatmap can't be properly rendered. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8057).
 * FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): add `-httpInternalListenAddr` cmd-line flag to serve internal HTTP routes `/metrics`, `/flags`, etc. It allows properly route requests to backends with the same service routes as vmauth. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6468) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7345) for details.
@@ -39,6 +29,7 @@ Released at 2025-02-07
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui) for [VictoriaMetrics enterprise](https://docs.victoriametrics.com/enterprise.html) components: properly display enterprise features when the enterprise version is used.
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and [vmselect](https://docs.victoriametrics.com/cluster-victoriametrics/): fix discrepancies when using `or` binary operator. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7759) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7640) issues for details.
 * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly update number of unique series for [cardinality limiter](https://docs.victoriametrics.com/#cardinality-limiter) on ingestion. Previously, limit could undercount the real number of the ingested unique series. 
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/): do not unregister group metrics if the group is still in use. Previously, this could lead to group metrics being absent even though rules group is still running. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8229) for details.

 ## [v1.102.12](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.12)

--- a/docs/keyConcepts/keyConcepts.md
+++ b/docs/keyConcepts/keyConcepts.md
@@ -35,7 +35,7 @@ which means there is no need to define metric names or their labels in advance.
 metrics anytime.

 Actually, the metric name is also a label with a special name `__name__`. 
-The `__name__` key could be omitted {{% available_from "v1.111.0" %}} for simplicity. So the following series are identical:
+The `__name__` key could be omitted {{% available_from "#" %}} for simplicity. So the following series are identical:

 ```
 requests_total{path="/", code="200"} 
--- a/docs/vmauth.md
+++ b/docs/vmauth.md
@@ -1086,7 +1086,7 @@ It is recommended protecting the following endpoints with authKeys:
 * `/metrics` with `-metricsAuthKey` command-line flag, so unauthorized users couldn't access [vmauth metrics](#monitoring).
 * `/debug/pprof` with `-pprofAuthKey` command-line flag, so unauthorized users couldn't access [profiling information](#profiling).

-As an alternative, it's possible to serve internal API routes at the different listen address with command-line flag `-httpInternalListenAddr=127.0.0.1:8426`. {{% available_from "v1.111.0" %}}
+As an alternative, it's possible to serve internal API routes at the different listen address with command-line flag `-httpInternalListenAddr=127.0.0.1:8426`. {{% available_from "#" %}}

 `vmauth` also supports the ability to restrict access by IP - see [these docs](#ip-filters). See also [concurrency limiting docs](#concurrency-limiting).

--- a/go.mod
+++ b/go.mod
@@ -1,6 +1,6 @@
 module github.com/VictoriaMetrics/VictoriaMetrics

-go 1.23.6
+go 1.23.5

 // This is needed in order to avoid vmbackup and vmrestore binary size increase by 20MB
 // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8008
--- a/lib/logstorage/chunked_allocator.go
+++ b/lib/logstorage/chunked_allocator.go
@@ -27,15 +27,7 @@ type chunkedAllocator struct {
 	uniqValuesProcessors    []statsUniqValuesProcessor
 	valuesProcessors        []statsValuesProcessor

-	pipeStatsGroups    []pipeStatsGroup
-	pipeStatsGroupMaps []pipeStatsGroupMap
-
-	statsProcessors []statsProcessor
-
-	statsCountUniqSets     []statsCountUniqSet
-	statsCountUniqHashSets []statsCountUniqHashSet
-
-	hitsMaps []hitsMap
+	pipeStatsGroups []pipeStatsGroup

 	u64Buf []uint64

@@ -124,26 +116,6 @@ func (a *chunkedAllocator) newPipeStatsGroup() (p *pipeStatsGroup) {
 	return addNewItem(&a.pipeStatsGroups, a)
 }

-func (a *chunkedAllocator) newPipeStatsGroupMaps(itemsLen uint) []pipeStatsGroupMap {
-	return addNewItems(&a.pipeStatsGroupMaps, itemsLen, a)
-}
-
-func (a *chunkedAllocator) newStatsProcessors(itemsLen uint) []statsProcessor {
-	return addNewItems(&a.statsProcessors, itemsLen, a)
-}
-
-func (a *chunkedAllocator) newStatsCountUniqSets(itemsLen uint) []statsCountUniqSet {
-	return addNewItems(&a.statsCountUniqSets, itemsLen, a)
-}
-
-func (a *chunkedAllocator) newStatsCountUniqHashSets(itemsLen uint) []statsCountUniqHashSet {
-	return addNewItems(&a.statsCountUniqHashSets, itemsLen, a)
-}
-
-func (a *chunkedAllocator) newHitsMaps(itemsLen uint) []hitsMap {
-	return addNewItems(&a.hitsMaps, itemsLen, a)
-}
-
 func (a *chunkedAllocator) newUint64() (p *uint64) {
 	return addNewItem(&a.u64Buf, a)
 }
@@ -153,32 +125,33 @@ func (a *chunkedAllocator) cloneBytesToString(b []byte) string {
 }

 func (a *chunkedAllocator) cloneString(s string) string {
-	xs := addNewItems(&a.stringsBuf, uint(len(s)), a)
-	copy(xs, s)
-	return bytesutil.ToUnsafeString(xs)
+	const maxChunkLen = 64 * 1024
+	if a.stringsBuf != nil && len(a.stringsBuf)+len(s) > maxChunkLen {
+		a.stringsBuf = nil
+	}
+	if a.stringsBuf == nil {
+		a.stringsBuf = make([]byte, 0, maxChunkLen)
+		a.bytesAllocated += maxChunkLen
+	}
+
+	sbLen := len(a.stringsBuf)
+	a.stringsBuf = append(a.stringsBuf, s...)
+	return bytesutil.ToUnsafeString(a.stringsBuf[sbLen:])
 }

 func addNewItem[T any](dstPtr *[]T, a *chunkedAllocator) *T {
-	xs := addNewItems(dstPtr, 1, a)
-	return &xs[0]
-}
-
-func addNewItems[T any](dstPtr *[]T, itemsLen uint, a *chunkedAllocator) []T {
 	dst := *dstPtr
-	var maxItems = (64 * 1024) / uint(unsafe.Sizeof(dst[0]))
-	if itemsLen > maxItems {
-		return make([]T, itemsLen)
-	}
-	if dst != nil && uint(len(dst))+itemsLen > maxItems {
+	var maxItems = (64 * 1024) / int(unsafe.Sizeof(dst[0]))
+	if dst != nil && len(dst)+1 > maxItems {
 		dst = nil
 	}
 	if dst == nil {
 		dst = make([]T, 0, maxItems)
-		a.bytesAllocated += int(maxItems * uint(unsafe.Sizeof(dst[0])))
+		a.bytesAllocated += maxItems * int(unsafe.Sizeof(dst[0]))
 	}
-	dstLen := uint(len(dst))
-	dst = dst[:dstLen+itemsLen]
-	xs := dst[dstLen : dstLen+itemsLen : dstLen+itemsLen]
+	var x T
+	dst = append(dst, x)
+	item := &dst[len(dst)-1]
 	*dstPtr = dst
-	return xs
+	return item
 }
--- a/lib/logstorage/hits_map.go
+++ b/lib/logstorage/hits_map.go
@@ -9,176 +9,36 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
 )

-type hitsMapAdaptive struct {
+type hitsMap struct {
 	stateSizeBudget *int

-	// concurrency is the number of parallel workers to use when merging shards.
-	//
-	// this field must be updated by the caller before using statsCountUniqProcessor.
-	concurrency uint
-
-	// hm tracks hits until the number of unique values reaches hitsMapAdaptiveMaxLen.
-	// After that hits are tracked by shards.
-	hm hitsMap
-
-	// shards tracks hits for big number of unique values.
-	//
-	// Every shard contains hits for a share of unique values.
-	shards []hitsMap
+	u64        map[uint64]*uint64
+	negative64 map[uint64]*uint64
+	strings    map[string]*uint64

 	// a reduces memory allocations when counting the number of hits over big number of unique values.
 	a chunkedAllocator
 }

-// the maximum number of values to track in hitsMapAdaptive.hm before switching to hitsMapAdaptive.shards
-//
-// Too big value may slow down hitsMapMergeParallel() across big number of CPU cores.
-// Too small value may significantly increase RAM usage when hits for big number of unique values are counted.
-const hitsMapAdaptiveMaxLen = 4 << 10
-
-func (hma *hitsMapAdaptive) reset() {
-	*hma = hitsMapAdaptive{}
-}
-
-func (hma *hitsMapAdaptive) init(concurrency uint, stateSizeBudget *int) {
-	hma.reset()
-	hma.stateSizeBudget = stateSizeBudget
-	hma.concurrency = concurrency
-}
-
-func (hma *hitsMapAdaptive) clear() {
-	*hma.stateSizeBudget += hma.stateSize()
-	hma.init(hma.concurrency, hma.stateSizeBudget)
-}
-
-func (hma *hitsMapAdaptive) stateSize() int {
-	n := hma.hm.stateSize()
-	for i := range hma.shards {
-		n += hma.shards[i].stateSize()
-	}
-	return n
-}
-
-func (hma *hitsMapAdaptive) entriesCount() uint64 {
-	if hma.shards == nil {
-		return hma.hm.entriesCount()
-	}
-
-	shards := hma.shards
-	n := uint64(0)
-	for i := range shards {
-		n += shards[i].entriesCount()
-	}
-	return n
-}
-
-func (hma *hitsMapAdaptive) updateStateGeneric(key string, hits uint64) {
-	if n, ok := tryParseUint64(key); ok {
-		hma.updateStateUint64(n, hits)
-		return
-	}
-	if len(key) > 0 && key[0] == '-' {
-		if n, ok := tryParseInt64(key); ok {
-			hma.updateStateNegativeInt64(n, hits)
-			return
-		}
-	}
-	hma.updateStateString(bytesutil.ToUnsafeBytes(key), hits)
-}
-
-func (hma *hitsMapAdaptive) updateStateInt64(n int64, hits uint64) {
-	if n >= 0 {
-		hma.updateStateUint64(uint64(n), hits)
-	} else {
-		hma.updateStateNegativeInt64(n, hits)
-	}
-}
-
-func (hma *hitsMapAdaptive) updateStateUint64(n, hits uint64) {
-	if hma.shards == nil {
-		stateSize := hma.hm.updateStateUint64(&hma.a, n, hits)
-		if stateSize > 0 {
-			*hma.stateSizeBudget -= stateSize
-			hma.probablyMoveToShards()
-		}
-		return
-	}
-	hm := hma.getShardByUint64(n)
-	*hma.stateSizeBudget -= hm.updateStateUint64(&hma.a, n, hits)
-}
-
-func (hma *hitsMapAdaptive) updateStateNegativeInt64(n int64, hits uint64) {
-	if hma.shards == nil {
-		stateSize := hma.hm.updateStateNegativeInt64(&hma.a, n, hits)
-		if stateSize > 0 {
-			*hma.stateSizeBudget -= stateSize
-			hma.probablyMoveToShards()
-		}
-		return
-	}
-	hm := hma.getShardByUint64(uint64(n))
-	*hma.stateSizeBudget -= hm.updateStateNegativeInt64(&hma.a, n, hits)
-}
-
-func (hma *hitsMapAdaptive) updateStateString(key []byte, hits uint64) {
-	if hma.shards == nil {
-		stateSize := hma.hm.updateStateString(&hma.a, key, hits)
-		if stateSize > 0 {
-			*hma.stateSizeBudget -= stateSize
-			hma.probablyMoveToShards()
-		}
-		return
-	}
-	hm := hma.getShardByString(key)
-	*hma.stateSizeBudget -= hm.updateStateString(&hma.a, key, hits)
-}
-
-func (hma *hitsMapAdaptive) probablyMoveToShards() {
-	if hma.hm.entriesCount() < hitsMapAdaptiveMaxLen {
-		return
-	}
-	hma.moveToShards()
-}
-
-func (hma *hitsMapAdaptive) moveToShards() {
-	hma.shards = hma.a.newHitsMaps(hma.concurrency)
-
-	for n, pHits := range hma.hm.u64 {
-		hm := hma.getShardByUint64(n)
-		hm.setStateUint64(n, pHits)
-	}
-	for n, pHits := range hma.hm.negative64 {
-		hm := hma.getShardByUint64(n)
-		hm.setStateNegativeInt64(int64(n), pHits)
-	}
-	for s, pHits := range hma.hm.strings {
-		hm := hma.getShardByString(bytesutil.ToUnsafeBytes(s))
-		hm.setStateString(s, pHits)
-	}
-
-	hma.hm.reset()
-}
-
-func (hma *hitsMapAdaptive) getShardByUint64(n uint64) *hitsMap {
-	h := fastHashUint64(n)
-	shardIdx := h % uint64(len(hma.shards))
-	return &hma.shards[shardIdx]
-}
-
-func (hma *hitsMapAdaptive) getShardByString(v []byte) *hitsMap {
-	h := xxhash.Sum64(v)
-	shardIdx := h % uint64(len(hma.shards))
-	return &hma.shards[shardIdx]
-}
-
-type hitsMap struct {
-	u64        map[uint64]*uint64
-	negative64 map[uint64]*uint64
-	strings    map[string]*uint64
-}
-
 func (hm *hitsMap) reset() {
-	*hm = hitsMap{}
+	hm.stateSizeBudget = nil
+
+	hm.u64 = nil
+	hm.negative64 = nil
+	hm.strings = nil
+}
+
+func (hm *hitsMap) clear() {
+	*hm.stateSizeBudget += hm.stateSize()
+	hm.init(hm.stateSizeBudget)
+}
+
+func (hm *hitsMap) init(stateSizeBudget *int) {
+	hm.stateSizeBudget = stateSizeBudget
+
+	hm.u64 = make(map[uint64]*uint64)
+	hm.negative64 = make(map[uint64]*uint64)
+	hm.strings = make(map[string]*uint64)
 }

 func (hm *hitsMap) entriesCount() uint64 {
@@ -187,89 +47,76 @@ func (hm *hitsMap) entriesCount() uint64 {
 }

 func (hm *hitsMap) stateSize() int {
-	size := 0
-
-	for n, pHits := range hm.u64 {
-		size += int(unsafe.Sizeof(n) + unsafe.Sizeof(pHits) + unsafe.Sizeof(*pHits))
+	n := 24*(len(hm.u64)+len(hm.negative64)) + 40*len(hm.strings)
+	for k := range hm.strings {
+		n += len(k)
 	}
-	for n, pHits := range hm.negative64 {
-		size += int(unsafe.Sizeof(n) + unsafe.Sizeof(pHits) + unsafe.Sizeof(*pHits))
-	}
-	for k, pHits := range hm.strings {
-		size += len(k) + int(unsafe.Sizeof(k)+unsafe.Sizeof(pHits)+unsafe.Sizeof(*pHits))
-	}
-
-	return size
+	return n
 }

-func (hm *hitsMap) updateStateUint64(a *chunkedAllocator, n, hits uint64) int {
+func (hm *hitsMap) updateStateGeneric(key string, hits uint64) {
+	if n, ok := tryParseUint64(key); ok {
+		hm.updateStateUint64(n, hits)
+		return
+	}
+	if len(key) > 0 && key[0] == '-' {
+		if n, ok := tryParseInt64(key); ok {
+			hm.updateStateNegativeInt64(n, hits)
+			return
+		}
+	}
+	hm.updateStateString(bytesutil.ToUnsafeBytes(key), hits)
+}
+
+func (hm *hitsMap) updateStateInt64(n int64, hits uint64) {
+	if n >= 0 {
+		hm.updateStateUint64(uint64(n), hits)
+	} else {
+		hm.updateStateNegativeInt64(n, hits)
+	}
+}
+
+func (hm *hitsMap) updateStateUint64(n, hits uint64) {
 	pHits := hm.u64[n]
 	if pHits != nil {
 		*pHits += hits
-		return 0
+		return
 	}

-	pHits = a.newUint64()
+	pHits = hm.a.newUint64()
 	*pHits = hits
-	return int(unsafe.Sizeof(*pHits)) + hm.setStateUint64(n, pHits)
-}
-
-func (hm *hitsMap) setStateUint64(n uint64, pHits *uint64) int {
-	if hm.u64 == nil {
-		hm.u64 = map[uint64]*uint64{
-			n: pHits,
-		}
-		return int(unsafe.Sizeof(hm.u64) + unsafe.Sizeof(n) + unsafe.Sizeof(pHits))
-	}
 	hm.u64[n] = pHits
-	return int(unsafe.Sizeof(n) + unsafe.Sizeof(pHits))
+
+	*hm.stateSizeBudget -= 24
 }

-func (hm *hitsMap) updateStateNegativeInt64(a *chunkedAllocator, n int64, hits uint64) int {
+func (hm *hitsMap) updateStateNegativeInt64(n int64, hits uint64) {
 	pHits := hm.negative64[uint64(n)]
 	if pHits != nil {
 		*pHits += hits
-		return 0
+		return
 	}

-	pHits = a.newUint64()
+	pHits = hm.a.newUint64()
 	*pHits = hits
-	return int(unsafe.Sizeof(*pHits)) + hm.setStateNegativeInt64(n, pHits)
-}
-
-func (hm *hitsMap) setStateNegativeInt64(n int64, pHits *uint64) int {
-	if hm.negative64 == nil {
-		hm.negative64 = map[uint64]*uint64{
-			uint64(n): pHits,
-		}
-		return int(unsafe.Sizeof(hm.negative64) + unsafe.Sizeof(uint64(n)) + unsafe.Sizeof(pHits))
-	}
 	hm.negative64[uint64(n)] = pHits
-	return int(unsafe.Sizeof(n) + unsafe.Sizeof(pHits))
+
+	*hm.stateSizeBudget -= 24
 }

-func (hm *hitsMap) updateStateString(a *chunkedAllocator, key []byte, hits uint64) int {
+func (hm *hitsMap) updateStateString(key []byte, hits uint64) {
 	pHits := hm.strings[string(key)]
 	if pHits != nil {
 		*pHits += hits
-		return 0
+		return
 	}

-	keyCopy := a.cloneBytesToString(key)
-	pHits = a.newUint64()
+	keyCopy := hm.a.cloneBytesToString(key)
+	pHits = hm.a.newUint64()
 	*pHits = hits
-	return len(keyCopy) + int(unsafe.Sizeof(*pHits)) + hm.setStateString(keyCopy, pHits)
-}
+	hm.strings[keyCopy] = pHits

-func (hm *hitsMap) setStateString(v string, pHits *uint64) int {
-	if hm.strings == nil {
-		hm.strings = map[string]*uint64{
-			v: pHits,
-		}
-		return int(unsafe.Sizeof(hm.strings) + unsafe.Sizeof(v) + unsafe.Sizeof(pHits))
-	}
-	hm.strings[v] = pHits
-	return int(unsafe.Sizeof(v) + unsafe.Sizeof(pHits))
+	*hm.stateSizeBudget -= len(keyCopy) + 40
 }

 func (hm *hitsMap) mergeState(src *hitsMap, stopCh <-chan struct{}) {
@@ -279,7 +126,7 @@ func (hm *hitsMap) mergeState(src *hitsMap, stopCh <-chan struct{}) {
 		}
 		pHitsDst := hm.u64[n]
 		if pHitsDst == nil {
-			hm.setStateUint64(n, pHitsSrc)
+			hm.u64[n] = pHitsSrc
 		} else {
 			*pHitsDst += *pHitsSrc
 		}
@@ -290,7 +137,7 @@ func (hm *hitsMap) mergeState(src *hitsMap, stopCh <-chan struct{}) {
 		}
 		pHitsDst := hm.negative64[n]
 		if pHitsDst == nil {
-			hm.setStateNegativeInt64(int64(n), pHitsSrc)
+			hm.negative64[n] = pHitsSrc
 		} else {
 			*pHitsDst += *pHitsSrc
 		}
@@ -301,52 +148,89 @@ func (hm *hitsMap) mergeState(src *hitsMap, stopCh <-chan struct{}) {
 		}
 		pHitsDst := hm.strings[k]
 		if pHitsDst == nil {
-			hm.setStateString(k, pHitsSrc)
+			hm.strings[k] = pHitsSrc
 		} else {
 			*pHitsDst += *pHitsSrc
 		}
 	}
 }

-// hitsMapMergeParallel merges hmas in parallel
+// hitsMapMergeParallel merges hms in parallel on the given cpusCount
 //
-// The merged disjoint parts of hmas are passed to f.
+// The mered disjoint parts of hms are passed to f.
 // The function may be interrupted by closing stopCh.
 // The caller must check for closed stopCh after returning from the function.
-func hitsMapMergeParallel(hmas []*hitsMapAdaptive, stopCh <-chan struct{}, f func(hm *hitsMap)) {
-	if len(hmas) == 0 {
+func hitsMapMergeParallel(hms []*hitsMap, cpusCount int, stopCh <-chan struct{}, f func(hm *hitsMap)) {
+	srcLen := len(hms)
+	if srcLen < 2 {
+		// Nothing to merge
+		if len(hms) == 1 {
+			f(hms[0])
+		}
 		return
 	}

 	var wg sync.WaitGroup
-	for i := range hmas {
-		hma := hmas[i]
-		if hma.shards != nil {
-			continue
-		}
+	perShardMaps := make([][]hitsMap, srcLen)
+	for i := range hms {
 		wg.Add(1)
-		go func() {
+		go func(idx int) {
 			defer wg.Done()
-			hma.moveToShards()
-		}()
+
+			stateSizeBudget := 0
+			perCPU := make([]hitsMap, cpusCount)
+			for i := range perCPU {
+				perCPU[i].init(&stateSizeBudget)
+			}
+
+			hm := hms[idx]
+
+			for n, pHits := range hm.u64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].u64[n] = pHits
+			}
+			for n, pHits := range hm.negative64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].negative64[n] = pHits
+			}
+			for k, pHits := range hm.strings {
+				if needStop(stopCh) {
+					return
+				}
+				h := xxhash.Sum64(bytesutil.ToUnsafeBytes(k))
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].strings[k] = pHits
+			}
+
+			perShardMaps[idx] = perCPU
+			hm.reset()
+		}(i)
 	}
 	wg.Wait()
 	if needStop(stopCh) {
 		return
 	}

-	cpusCount := len(hmas[0].shards)
-
+	// Merge per-shard entries into perShardMaps[0]
 	for i := 0; i < cpusCount; i++ {
 		wg.Add(1)
 		go func(cpuIdx int) {
 			defer wg.Done()

-			hm := &hmas[0].shards[cpuIdx]
-			for j := range hmas[1:] {
-				src := &hmas[1+j].shards[cpuIdx]
-				hm.mergeState(src, stopCh)
-				src.reset()
+			hm := &perShardMaps[0][cpuIdx]
+			for _, perCPU := range perShardMaps[1:] {
+				hm.mergeState(&perCPU[cpuIdx], stopCh)
+				perCPU[cpuIdx].reset()
 			}
 			f(hm)
 		}(i)
--- a/lib/logstorage/pipe_facets.go
+++ b/lib/logstorage/pipe_facets.go
@@ -3,7 +3,6 @@ package logstorage
 import (
 	"fmt"
 	"sort"
-	"sync"
 	"sync/atomic"
 	"unsafe"

@@ -79,25 +78,25 @@ func (pf *pipeFacets) visitSubqueries(_ func(q *Query)) {
 func (pf *pipeFacets) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppNext pipeProcessor) pipeProcessor {
 	maxStateSize := int64(float64(memory.Allowed()) * 0.2)

+	shards := make([]pipeFacetsProcessorShard, workersCount)
+	for i := range shards {
+		shards[i] = pipeFacetsProcessorShard{
+			pipeFacetsProcessorShardNopad: pipeFacetsProcessorShardNopad{
+				pf: pf,
+			},
+		}
+	}
+
 	pfp := &pipeFacetsProcessor{
 		pf:     pf,
 		stopCh: stopCh,
 		cancel: cancel,
 		ppNext: ppNext,

+		shards: shards,
+
 		maxStateSize: maxStateSize,
 	}
-
-	shards := make([]pipeFacetsProcessorShard, workersCount)
-	for i := range shards {
-		shards[i] = pipeFacetsProcessorShard{
-			pipeFacetsProcessorShardNopad: pipeFacetsProcessorShardNopad{
-				pfp: pfp,
-			},
-		}
-	}
-	pfp.shards = shards
-
 	pfp.stateSizeBudget.Store(maxStateSize)

 	return pfp
@@ -123,8 +122,8 @@ type pipeFacetsProcessorShard struct {
 }

 type pipeFacetsProcessorShardNopad struct {
-	// pfp points to the parent pipeFacetsProcessor.
-	pfp *pipeFacetsProcessor
+	// pf points to the parent pipeFacets.
+	pf *pipeFacets

 	// a is used for reducing memory allocations when counting facets over big number of unique fields
 	a chunkedAllocator
@@ -141,7 +140,7 @@ type pipeFacetsProcessorShardNopad struct {
 }

 type pipeFacetsFieldHits struct {
-	m          hitsMapAdaptive
+	m          hitsMap
 	mustIgnore bool
 }

@@ -164,7 +163,7 @@ func (shard *pipeFacetsProcessorShard) updateFacetsForColumn(br *blockResult, c
 	if fhs.mustIgnore {
 		return
 	}
-	if fhs.m.entriesCount() >= shard.pfp.pf.maxValuesPerField {
+	if fhs.m.entriesCount() >= shard.pf.maxValuesPerField {
 		// Ignore fields with too many unique values
 		fhs.enableIgnoreField()
 		return
@@ -220,7 +219,7 @@ func (shard *pipeFacetsProcessorShard) updateFacetsForColumn(br *blockResult, c
 }

 func (shard *pipeFacetsProcessorShard) updateStateInt64(fhs *pipeFacetsFieldHits, n int64) {
-	if maxValueLen := shard.pfp.pf.maxValueLen; maxValueLen <= 21 && uint64(int64StringLen(n)) > maxValueLen {
+	if maxValueLen := shard.pf.maxValueLen; maxValueLen <= 21 && uint64(int64StringLen(n)) > maxValueLen {
 		// Ignore fields with too long values, since they are hard to use in faceted search.
 		fhs.enableIgnoreField()
 		return
@@ -229,7 +228,7 @@ func (shard *pipeFacetsProcessorShard) updateStateInt64(fhs *pipeFacetsFieldHits
 }

 func (shard *pipeFacetsProcessorShard) updateStateUint64(fhs *pipeFacetsFieldHits, n uint64) {
-	if maxValueLen := shard.pfp.pf.maxValueLen; maxValueLen <= 20 && uint64(uint64StringLen(n)) > maxValueLen {
+	if maxValueLen := shard.pf.maxValueLen; maxValueLen <= 20 && uint64(uint64StringLen(n)) > maxValueLen {
 		// Ignore fields with too long values, since they are hard to use in faceted search.
 		fhs.enableIgnoreField()
 		return
@@ -289,7 +288,7 @@ func (shard *pipeFacetsProcessorShard) updateStateGeneric(fhs *pipeFacetsFieldHi
 		// So it is better ignoring empty values.
 		return
 	}
-	if uint64(len(v)) > shard.pfp.pf.maxValueLen {
+	if uint64(len(v)) > shard.pf.maxValueLen {
 		// Ignore fields with too long values, since they are hard to use in faceted search.
 		fhs.enableIgnoreField()
 		return
@@ -304,7 +303,7 @@ func (shard *pipeFacetsProcessorShard) getFieldHits(fieldName string) *pipeFacet
 	fhs, ok := shard.m[fieldName]
 	if !ok {
 		fhs = &pipeFacetsFieldHits{}
-		fhs.m.init(uint(len(shard.pfp.shards)), &shard.stateSizeBudget)
+		fhs.m.init(&shard.stateSizeBudget)
 		fieldNameCopy := shard.a.cloneString(fieldName)
 		shard.m[fieldNameCopy] = fhs
 		shard.stateSizeBudget -= len(fieldNameCopy) + int(unsafe.Sizeof(fhs)+unsafe.Sizeof(*fhs))
@@ -342,7 +341,7 @@ func (pfp *pipeFacetsProcessor) flush() error {
 	}

 	// merge state across shards
-	hmasByFieldName := make(map[string][]*hitsMapAdaptive)
+	hms := make(map[string]*hitsMap)
 	rowsTotal := uint64(0)
 	for _, shard := range pfp.shards {
 		if needStop(pfp.stopCh) {
@@ -352,14 +351,19 @@ func (pfp *pipeFacetsProcessor) flush() error {
 			if fhs.mustIgnore {
 				continue
 			}
-			hmasByFieldName[fieldName] = append(hmasByFieldName[fieldName], &fhs.m)
+			hm, ok := hms[fieldName]
+			if !ok {
+				hms[fieldName] = &fhs.m
+				continue
+			}
+			hm.mergeState(&fhs.m, pfp.stopCh)
 		}
 		rowsTotal += shard.rowsTotal
 	}

 	// sort fieldNames
-	fieldNames := make([]string, 0, len(hmasByFieldName))
-	for fieldName := range hmasByFieldName {
+	fieldNames := make([]string, 0, len(hms))
+	for fieldName := range hms {
 		fieldNames = append(fieldNames, fieldName)
 	}
 	sort.Strings(fieldNames)
@@ -373,30 +377,31 @@ func (pfp *pipeFacetsProcessor) flush() error {
 		if needStop(pfp.stopCh) {
 			return nil
 		}
-
-		hmas := hmasByFieldName[fieldName]
-		var hms []*hitsMap
-		var hmsLock sync.Mutex
-		hitsMapMergeParallel(hmas, pfp.stopCh, func(hm *hitsMap) {
-			hmsLock.Lock()
-			hms = append(hms, hm)
-			hmsLock.Unlock()
-		})
-
-		entriesCount := uint64(0)
-		for _, hm := range hms {
-			entriesCount += hm.entriesCount()
-		}
-		if entriesCount > pfp.pf.maxValuesPerField {
+		hm := hms[fieldName]
+		if hm.entriesCount() > pfp.pf.maxValuesPerField {
 			continue
 		}

-		vs := make([]pipeTopEntry, 0, entriesCount)
-		for _, hm := range hms {
-			vs = appendTopEntryFacets(vs, hm)
+		vs := make([]pipeTopEntry, 0, hm.entriesCount())
+		for n, pHits := range hm.u64 {
+			vs = append(vs, pipeTopEntry{
+				k:    string(marshalUint64String(nil, n)),
+				hits: *pHits,
+			})
 		}
-
-		if len(vs) == 1 && vs[0].hits == rowsTotal && !wctx.pfp.pf.keepConstFields {
+		for n, pHits := range hm.negative64 {
+			vs = append(vs, pipeTopEntry{
+				k:    string(marshalInt64String(nil, int64(n))),
+				hits: *pHits,
+			})
+		}
+		for k, pHits := range hm.strings {
+			vs = append(vs, pipeTopEntry{
+				k:    k,
+				hits: *pHits,
+			})
+		}
+		if len(vs) == 1 && vs[0].hits == rowsTotal && !pfp.pf.keepConstFields {
 			// Skip field with constant value.
 			continue
 		}
@@ -407,9 +412,6 @@ func (pfp *pipeFacetsProcessor) flush() error {
 			vs = vs[:limit]
 		}
 		for _, v := range vs {
-			if needStop(pfp.stopCh) {
-				return nil
-			}
 			wctx.writeRow(fieldName, v.k, v.hits)
 		}
 	}
@@ -418,28 +420,6 @@ func (pfp *pipeFacetsProcessor) flush() error {
 	return nil
 }

-func appendTopEntryFacets(dst []pipeTopEntry, hm *hitsMap) []pipeTopEntry {
-	for n, pHits := range hm.u64 {
-		dst = append(dst, pipeTopEntry{
-			k:    string(marshalUint64String(nil, n)),
-			hits: *pHits,
-		})
-	}
-	for n, pHits := range hm.negative64 {
-		dst = append(dst, pipeTopEntry{
-			k:    string(marshalInt64String(nil, int64(n))),
-			hits: *pHits,
-		})
-	}
-	for k, pHits := range hm.strings {
-		dst = append(dst, pipeTopEntry{
-			k:    k,
-			hits: *pHits,
-		})
-	}
-	return dst
-}
-
 type pipeFacetsWriteContext struct {
 	pfp *pipeFacetsProcessor
 	rcs []resultColumn
--- a/lib/logstorage/pipe_stats.go
+++ b/lib/logstorage/pipe_stats.go
@@ -13,6 +13,7 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/slicesutil"
 )

 // pipeStats processes '| stats ...' queries.
@@ -45,8 +46,6 @@ type statsFunc interface {
 	updateNeededFields(neededFields fieldsSet)

 	// newStatsProcessor must create new statsProcessor for calculating stats for the given statsFunc
-	//
-	// a must be used for allocating memory inside the returned statsProcessor.
 	newStatsProcessor(a *chunkedAllocator) statsProcessor
 }

@@ -211,26 +210,26 @@ const stateSizeBudgetChunk = 1 << 20
 func (ps *pipeStats) newPipeProcessor(workersCount int, stopCh <-chan struct{}, cancel func(), ppNext pipeProcessor) pipeProcessor {
 	maxStateSize := int64(float64(memory.Allowed()) * 0.4)

+	shards := make([]pipeStatsProcessorShard, workersCount)
+	for i := range shards {
+		shards[i] = pipeStatsProcessorShard{
+			pipeStatsProcessorShardNopad: pipeStatsProcessorShardNopad{
+				ps: ps,
+			},
+		}
+		shards[i].init()
+	}
+
 	psp := &pipeStatsProcessor{
 		ps:     ps,
 		stopCh: stopCh,
 		cancel: cancel,
 		ppNext: ppNext,

+		shards: shards,
+
 		maxStateSize: maxStateSize,
 	}
-
-	shards := make([]pipeStatsProcessorShard, workersCount)
-	for i := range shards {
-		shards[i] = pipeStatsProcessorShard{
-			pipeStatsProcessorShardNopad: pipeStatsProcessorShardNopad{
-				psp: psp,
-			},
-		}
-		shards[i].init()
-	}
-	psp.shards = shards
-
 	psp.stateSizeBudget.Store(maxStateSize)

 	return psp
@@ -256,19 +255,9 @@ type pipeStatsProcessorShard struct {
 }

 type pipeStatsProcessorShardNopad struct {
-	psp *pipeStatsProcessor
+	ps *pipeStats

-	// groupMap is used for tracking small number of groups until it reaches pipeStatsGroupMapMaxLen.
-	// After that the groups are tracked by groupMapShards.
-	groupMap pipeStatsGroupMap
-
-	// groupMapShards are used for tracking big number of groups.
-	//
-	// Every shard contains a share of unique groups, which are merged in parallel at flush().
-	groupMapShards []pipeStatsGroupMap
-
-	// a is used for reducing memory allocations when calculating stats among big number of different groups.
-	a chunkedAllocator
+	m pipeStatsGroupMap

 	// bms and brTmp are used for applying per-func filters.
 	bms   []bitmap
@@ -280,26 +269,34 @@ type pipeStatsProcessorShardNopad struct {
 	stateSizeBudget int
 }

-// the maximum number of groups to track in pipeStatsProcessorShard.groupMap before switching to pipeStatsProcessorShard.groupMapShards
-//
-// Too big value may slow down flush() across big number of CPU cores.
-// Too small value may significantly increase RAM usage when stats for big number of groups is calculated.
-const pipeStatsGroupMapMaxLen = 4 << 10
-
 type pipeStatsGroupMap struct {
 	shard *pipeStatsProcessorShard

 	u64        map[uint64]*pipeStatsGroup
 	negative64 map[uint64]*pipeStatsGroup
 	strings    map[string]*pipeStatsGroup
+
+	// a and sfpsBuf are used for reducing memory allocations when calculating stats among big number of different groups.
+	a       chunkedAllocator
+	sfpsBuf []statsProcessor
 }

 func (psm *pipeStatsGroupMap) reset() {
-	*psm = pipeStatsGroupMap{}
+	psm.shard = nil
+
+	psm.u64 = nil
+	psm.negative64 = nil
+	psm.strings = nil
+
+	psm.sfpsBuf = nil
 }

 func (psm *pipeStatsGroupMap) init(shard *pipeStatsProcessorShard) {
 	psm.shard = shard
+
+	psm.u64 = make(map[uint64]*pipeStatsGroup)
+	psm.negative64 = make(map[uint64]*pipeStatsGroup)
+	psm.strings = make(map[string]*pipeStatsGroup)
 }

 func (psm *pipeStatsGroupMap) entriesCount() uint64 {
@@ -307,74 +304,98 @@ func (psm *pipeStatsGroupMap) entriesCount() uint64 {
 	return uint64(n)
 }

-func (psm *pipeStatsGroupMap) getPipeStatsGroupUint64(n uint64) (*pipeStatsGroup, bool) {
-	if psg := psm.u64[n]; psg != nil {
-		return psg, false
+func (psm *pipeStatsGroupMap) getPipeStatsGroupGeneric(key string) *pipeStatsGroup {
+	if n, ok := tryParseUint64(key); ok {
+		return psm.getPipeStatsGroupUint64(n)
 	}
-
-	psg := psm.shard.newPipeStatsGroup()
-	psm.setPipeStatsGroupUint64(n, psg)
-	return psg, true
-}
-
-func (psm *pipeStatsGroupMap) setPipeStatsGroupUint64(n uint64, psg *pipeStatsGroup) {
-	if psm.u64 == nil {
-		psm.u64 = map[uint64]*pipeStatsGroup{
-			n: psg,
+	if len(key) > 0 && key[0] == '-' {
+		if n, ok := tryParseInt64(key); ok {
+			return psm.getPipeStatsGroupNegativeInt64(n)
 		}
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(psm.u64) + unsafe.Sizeof(n) + unsafe.Sizeof(psg))
-	} else {
-		psm.u64[n] = psg
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(n) + unsafe.Sizeof(psg))
 	}
+	return psm.getPipeStatsGroupString(bytesutil.ToUnsafeBytes(key))
 }

-func (psm *pipeStatsGroupMap) getPipeStatsGroupNegativeInt64(n int64) (*pipeStatsGroup, bool) {
-	if psg := psm.negative64[uint64(n)]; psg != nil {
-		return psg, false
+func (psm *pipeStatsGroupMap) getPipeStatsGroupInt64(n int64) *pipeStatsGroup {
+	if n >= 0 {
+		return psm.getPipeStatsGroupUint64(uint64(n))
 	}
-
-	psg := psm.shard.newPipeStatsGroup()
-	psm.setPipeStatsGroupNegativeInt64(n, psg)
-	return psg, true
+	return psm.getPipeStatsGroupNegativeInt64(n)
 }

-func (psm *pipeStatsGroupMap) setPipeStatsGroupNegativeInt64(n int64, psg *pipeStatsGroup) {
-	if psm.negative64 == nil {
-		psm.negative64 = map[uint64]*pipeStatsGroup{
-			uint64(n): psg,
-		}
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(psm.negative64) + unsafe.Sizeof(n) + unsafe.Sizeof(psg))
-	} else {
-		psm.negative64[uint64(n)] = psg
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(n) + unsafe.Sizeof(psg))
-	}
-}
-
-func (psm *pipeStatsGroupMap) getPipeStatsGroupString(key []byte) (*pipeStatsGroup, bool) {
-	if psg := psm.strings[string(key)]; psg != nil {
-		return psg, false
+func (psm *pipeStatsGroupMap) getPipeStatsGroupUint64(n uint64) *pipeStatsGroup {
+	psg := psm.u64[n]
+	if psg != nil {
+		return psg
 	}

-	psg := psm.shard.newPipeStatsGroup()
-	keyCopy := psm.shard.a.cloneBytesToString(key)
-	psm.shard.stateSizeBudget -= len(keyCopy)
-	psm.setPipeStatsGroupString(keyCopy, psg)
-	return psg, true
+	psg = psm.newPipeStatsGroup()
+	psm.u64[n] = psg
+	psm.shard.stateSizeBudget -= int(unsafe.Sizeof(n) + unsafe.Sizeof(psg))
+
+	return psg
 }

-func (psm *pipeStatsGroupMap) setPipeStatsGroupString(v string, psg *pipeStatsGroup) {
-	if psm.strings == nil {
-		psm.strings = map[string]*pipeStatsGroup{
-			v: psg,
-		}
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(psm.strings) + unsafe.Sizeof(v))
-	} else {
-		psm.strings[v] = psg
-		psm.shard.stateSizeBudget -= int(unsafe.Sizeof(v))
+func (psm *pipeStatsGroupMap) getPipeStatsGroupNegativeInt64(n int64) *pipeStatsGroup {
+	psg := psm.negative64[uint64(n)]
+	if psg != nil {
+		return psg
 	}
+
+	psg = psm.newPipeStatsGroup()
+	psm.negative64[uint64(n)] = psg
+	psm.shard.stateSizeBudget -= int(unsafe.Sizeof(n) + unsafe.Sizeof(psg))
+
+	return psg
 }

+func (psm *pipeStatsGroupMap) getPipeStatsGroupString(key []byte) *pipeStatsGroup {
+	psg := psm.strings[string(key)]
+	if psg != nil {
+		return psg
+	}
+
+	psg = psm.newPipeStatsGroup()
+	keyCopy := psm.a.cloneBytesToString(key)
+	psm.strings[keyCopy] = psg
+	psm.shard.stateSizeBudget -= len(keyCopy) + int(unsafe.Sizeof(keyCopy))
+
+	return psg
+}
+
+func (psm *pipeStatsGroupMap) newPipeStatsGroup() *pipeStatsGroup {
+	sfps := psm.newStatsProcessors()
+
+	for i, f := range psm.shard.ps.funcs {
+		bytesAllocated := psm.a.bytesAllocated
+		sfps[i] = f.f.newStatsProcessor(&psm.a)
+		psm.shard.stateSizeBudget -= psm.a.bytesAllocated - bytesAllocated
+	}
+
+	psg := psm.a.newPipeStatsGroup()
+	psg.funcs = psm.shard.ps.funcs
+	psg.sfps = sfps
+	psm.shard.stateSizeBudget -= int(unsafe.Sizeof(*psg) + unsafe.Sizeof(sfps[0])*uintptr(len(sfps)))
+
+	return psg
+}
+
+func (psm *pipeStatsGroupMap) newStatsProcessors() []statsProcessor {
+	funcsLen := len(psm.shard.ps.funcs)
+	if len(psm.sfpsBuf)+funcsLen > cap(psm.sfpsBuf) {
+		psm.sfpsBuf = nil
+	}
+	if psm.sfpsBuf == nil {
+		psm.sfpsBuf = make([]statsProcessor, 0, pipeStatsProcessorChunkLen)
+	}
+
+	sfpsBufLen := len(psm.sfpsBuf)
+	psm.sfpsBuf = slicesutil.SetLength(psm.sfpsBuf, sfpsBufLen+funcsLen)
+	return psm.sfpsBuf[sfpsBufLen:]
+}
+
+const pipeStatsProcessorChunkLen = 64 * 1024 / int(unsafe.Sizeof((statsProcessor)(nil)))
+
 func (psm *pipeStatsGroupMap) mergeState(src *pipeStatsGroupMap, stopCh <-chan struct{}) {
 	for n, psgSrc := range src.u64 {
 		if needStop(stopCh) {
@@ -382,7 +403,7 @@ func (psm *pipeStatsGroupMap) mergeState(src *pipeStatsGroupMap, stopCh <-chan s
 		}
 		psgDst := psm.u64[n]
 		if psgDst == nil {
-			psm.setPipeStatsGroupUint64(n, psgSrc)
+			psm.u64[n] = psgSrc
 		} else {
 			psgDst.mergeState(psgSrc)
 		}
@@ -393,7 +414,7 @@ func (psm *pipeStatsGroupMap) mergeState(src *pipeStatsGroupMap, stopCh <-chan s
 		}
 		psgDst := psm.negative64[n]
 		if psgDst == nil {
-			psm.setPipeStatsGroupNegativeInt64(int64(n), psgSrc)
+			psm.negative64[n] = psgSrc
 		} else {
 			psgDst.mergeState(psgSrc)
 		}
@@ -404,54 +425,22 @@ func (psm *pipeStatsGroupMap) mergeState(src *pipeStatsGroupMap, stopCh <-chan s
 		}
 		psgDst := psm.strings[k]
 		if psgDst == nil {
-			psm.setPipeStatsGroupString(k, psgSrc)
+			psm.strings[k] = psgSrc
 		} else {
 			psgDst.mergeState(psgSrc)
 		}
 	}
 }

-func initStatsConcurrency(sfp statsProcessor, concurrency uint) {
-	switch t := sfp.(type) {
-	case *statsCountUniqProcessor:
-		t.concurrency = concurrency
-	case *statsCountUniqHashProcessor:
-		t.concurrency = concurrency
-	case *statsUniqValuesProcessor:
-		t.concurrency = concurrency
-	}
-}
-
 func (shard *pipeStatsProcessorShard) init() {
-	shard.groupMap.init(shard)
+	shard.m.init(shard)

-	funcsLen := len(shard.psp.ps.funcs)
+	funcsLen := len(shard.ps.funcs)
 	shard.bms = make([]bitmap, funcsLen)
 }

-func (shard *pipeStatsProcessorShard) newPipeStatsGroup() *pipeStatsGroup {
-	bytesAllocated := shard.a.bytesAllocated
-
-	funcsLen := len(shard.psp.ps.funcs)
-	sfps := shard.a.newStatsProcessors(uint(funcsLen))
-
-	for i, f := range shard.psp.ps.funcs {
-		sfp := f.f.newStatsProcessor(&shard.a)
-		initStatsConcurrency(sfp, uint(len(shard.psp.shards)))
-		sfps[i] = sfp
-	}
-
-	psg := shard.a.newPipeStatsGroup()
-	psg.funcs = shard.psp.ps.funcs
-	psg.sfps = sfps
-
-	shard.stateSizeBudget -= shard.a.bytesAllocated - bytesAllocated
-
-	return psg
-}
-
 func (shard *pipeStatsProcessorShard) writeBlock(br *blockResult) {
-	byFields := shard.psp.ps.byFields
+	byFields := shard.ps.byFields

 	// Update shard.bms by applying per-function filters
 	shard.applyPerFunctionFilters(br)
@@ -459,7 +448,7 @@ func (shard *pipeStatsProcessorShard) writeBlock(br *blockResult) {
 	// Process stats for the defined functions
 	if len(byFields) == 0 {
 		// Fast path - pass all the rows to a single group with empty key.
-		psg := shard.getPipeStatsGroupString(nil)
+		psg := shard.m.getPipeStatsGroupString(nil)
 		shard.stateSizeBudget -= psg.updateStatsForAllRows(shard.bms, br, &shard.brTmp)
 		return
 	}
@@ -492,7 +481,7 @@ func (shard *pipeStatsProcessorShard) writeBlock(br *blockResult) {
 		for _, values := range columnValues {
 			keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(values[0]))
 		}
-		psg := shard.getPipeStatsGroupString(keyBuf)
+		psg := shard.m.getPipeStatsGroupString(keyBuf)
 		shard.stateSizeBudget -= psg.updateStatsForAllRows(shard.bms, br, &shard.brTmp)
 		shard.keyBuf = keyBuf
 		return
@@ -516,7 +505,7 @@ func (shard *pipeStatsProcessorShard) writeBlock(br *blockResult) {
 			for _, values := range columnValues {
 				keyBuf = encoding.MarshalBytes(keyBuf, bytesutil.ToUnsafeBytes(values[i]))
 			}
-			psg = shard.getPipeStatsGroupString(keyBuf)
+			psg = shard.m.getPipeStatsGroupString(keyBuf)
 		}
 		shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 	}
@@ -528,7 +517,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 	if c.isConst {
 		// Fast path for column with a constant value.
 		v := br.getBucketedValue(c.valuesEncoded[0], bf)
-		psg := shard.getPipeStatsGroupGeneric(v)
+		psg := shard.m.getPipeStatsGroupGeneric(v)
 		shard.stateSizeBudget -= psg.updateStatsForAllRows(shard.bms, br, &shard.brTmp)
 		return
 	}
@@ -541,7 +530,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 			for i, v := range values {
 				if i <= 0 || values[i-1] != v {
 					n := unmarshalUint8(v)
-					psg = shard.getPipeStatsGroupUint64(uint64(n))
+					psg = shard.m.getPipeStatsGroupUint64(uint64(n))
 				}
 				shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 			}
@@ -552,7 +541,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 			for i, v := range values {
 				if i <= 0 || values[i-1] != v {
 					n := unmarshalUint16(v)
-					psg = shard.getPipeStatsGroupUint64(uint64(n))
+					psg = shard.m.getPipeStatsGroupUint64(uint64(n))
 				}
 				shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 			}
@@ -563,7 +552,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 			for i, v := range values {
 				if i <= 0 || values[i-1] != v {
 					n := unmarshalUint32(v)
-					psg = shard.getPipeStatsGroupUint64(uint64(n))
+					psg = shard.m.getPipeStatsGroupUint64(uint64(n))
 				}
 				shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 			}
@@ -574,7 +563,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 			for i, v := range values {
 				if i <= 0 || values[i-1] != v {
 					n := unmarshalUint64(v)
-					psg = shard.getPipeStatsGroupUint64(n)
+					psg = shard.m.getPipeStatsGroupUint64(n)
 				}
 				shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 			}
@@ -585,7 +574,7 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 			for i, v := range values {
 				if i <= 0 || values[i-1] != v {
 					n := unmarshalInt64(v)
-					psg = shard.getPipeStatsGroupInt64(n)
+					psg = shard.m.getPipeStatsGroupInt64(n)
 				}
 				shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 			}
@@ -598,14 +587,14 @@ func (shard *pipeStatsProcessorShard) updateStatsSingleColumn(br *blockResult, b
 	values := c.getValuesBucketed(br, bf)
 	for i := 0; i < br.rowsLen; i++ {
 		if i <= 0 || values[i-1] != values[i] {
-			psg = shard.getPipeStatsGroupGeneric(values[i])
+			psg = shard.m.getPipeStatsGroupGeneric(values[i])
 		}
 		shard.stateSizeBudget -= psg.updateStatsForRow(shard.bms, br, i)
 	}
 }

 func (shard *pipeStatsProcessorShard) applyPerFunctionFilters(br *blockResult) {
-	funcs := shard.psp.ps.funcs
+	funcs := shard.ps.funcs
 	for i := range funcs {
 		iff := funcs[i].iff
 		if iff == nil {
@@ -620,111 +609,6 @@ func (shard *pipeStatsProcessorShard) applyPerFunctionFilters(br *blockResult) {
 	}
 }

-func (shard *pipeStatsProcessorShard) getPipeStatsGroupGeneric(v string) *pipeStatsGroup {
-	if n, ok := tryParseUint64(v); ok {
-		return shard.getPipeStatsGroupUint64(n)
-	}
-	if len(v) > 0 && v[0] == '-' {
-		if n, ok := tryParseInt64(v); ok {
-			return shard.getPipeStatsGroupNegativeInt64(n)
-		}
-	}
-	return shard.getPipeStatsGroupString(bytesutil.ToUnsafeBytes(v))
-}
-
-func (shard *pipeStatsProcessorShard) getPipeStatsGroupInt64(n int64) *pipeStatsGroup {
-	if n >= 0 {
-		return shard.getPipeStatsGroupUint64(uint64(n))
-	}
-	return shard.getPipeStatsGroupNegativeInt64(n)
-}
-
-func (shard *pipeStatsProcessorShard) getPipeStatsGroupUint64(n uint64) *pipeStatsGroup {
-	if shard.groupMapShards == nil {
-		psg, isNew := shard.groupMap.getPipeStatsGroupUint64(n)
-		if isNew {
-			shard.probablyMoveGroupMapToShards()
-		}
-		return psg
-	}
-	psm := shard.getGroupMapShardByUint64(n)
-	psg, _ := psm.getPipeStatsGroupUint64(n)
-	return psg
-}
-
-func (shard *pipeStatsProcessorShard) getPipeStatsGroupNegativeInt64(n int64) *pipeStatsGroup {
-	if shard.groupMapShards == nil {
-		psg, isNew := shard.groupMap.getPipeStatsGroupNegativeInt64(n)
-		if isNew {
-			shard.probablyMoveGroupMapToShards()
-		}
-		return psg
-	}
-	psm := shard.getGroupMapShardByUint64(uint64(n))
-	psg, _ := psm.getPipeStatsGroupNegativeInt64(n)
-	return psg
-}
-
-func (shard *pipeStatsProcessorShard) getPipeStatsGroupString(v []byte) *pipeStatsGroup {
-	if shard.groupMapShards == nil {
-		psg, isNew := shard.groupMap.getPipeStatsGroupString(v)
-		if isNew {
-			shard.probablyMoveGroupMapToShards()
-		}
-		return psg
-	}
-	psm := shard.getGroupMapShardByString(v)
-	psg, _ := psm.getPipeStatsGroupString(v)
-	return psg
-}
-
-func (shard *pipeStatsProcessorShard) probablyMoveGroupMapToShards() {
-	if shard.groupMap.entriesCount() < pipeStatsGroupMapMaxLen {
-		return
-	}
-	shard.moveGroupMapToShards()
-}
-
-func (shard *pipeStatsProcessorShard) moveGroupMapToShards() {
-	// set cpusCount to the number of shards, since this is the concurrency limit set by the caller.
-	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8201
-	cpusCount := uint(len(shard.psp.shards))
-	bytesAllocatedPrev := shard.a.bytesAllocated
-	shard.groupMapShards = shard.a.newPipeStatsGroupMaps(cpusCount)
-	shard.stateSizeBudget -= shard.a.bytesAllocated - bytesAllocatedPrev
-
-	for i := range shard.groupMapShards {
-		shard.groupMapShards[i].init(shard)
-	}
-
-	for n, psg := range shard.groupMap.u64 {
-		psm := shard.getGroupMapShardByUint64(n)
-		psm.setPipeStatsGroupUint64(n, psg)
-	}
-	for n, psg := range shard.groupMap.negative64 {
-		psm := shard.getGroupMapShardByUint64(n)
-		psm.setPipeStatsGroupNegativeInt64(int64(n), psg)
-	}
-	for s, psg := range shard.groupMap.strings {
-		psm := shard.getGroupMapShardByString(bytesutil.ToUnsafeBytes(s))
-		psm.setPipeStatsGroupString(s, psg)
-	}
-
-	shard.groupMap.reset()
-}
-
-func (shard *pipeStatsProcessorShard) getGroupMapShardByString(v []byte) *pipeStatsGroupMap {
-	h := xxhash.Sum64(v)
-	shardIdx := h % uint64(len(shard.groupMapShards))
-	return &shard.groupMapShards[shardIdx]
-}
-
-func (shard *pipeStatsProcessorShard) getGroupMapShardByUint64(n uint64) *pipeStatsGroupMap {
-	h := fastHashUint64(n)
-	shardIdx := h % uint64(len(shard.groupMapShards))
-	return &shard.groupMapShards[shardIdx]
-}
-
 type pipeStatsGroup struct {
 	funcs []pipeStatsFunc
 	sfps  []statsProcessor
@@ -793,7 +677,10 @@ func (psp *pipeStatsProcessor) flush() error {
 	}

 	// Merge states across shards in parallel
-	psms := psp.mergeShardsParallel()
+	psms, err := psp.mergeShardsParallel()
+	if err != nil {
+		return err
+	}
 	if needStop(psp.stopCh) {
 		return nil
 	}
@@ -802,8 +689,8 @@ func (psp *pipeStatsProcessor) flush() error {
 		// Special case - zero matching rows.
 		shard := &psp.shards[0]
 		shard.init()
-		shard.groupMap.getPipeStatsGroupString(nil)
-		psms = append(psms, &shard.groupMap)
+		_ = shard.m.getPipeStatsGroupString(nil)
+		psms = append(psms, &shard.m)
 	}

 	// Write the calculated stats in parallel to the next pipe.
@@ -958,48 +845,93 @@ func (psw *pipeStatsWriter) writeShardData(psm *pipeStatsGroupMap) {
 	}
 }

-func (psp *pipeStatsProcessor) mergeShardsParallel() []*pipeStatsGroupMap {
+func (psp *pipeStatsProcessor) mergeShardsParallel() ([]*pipeStatsGroupMap, error) {
 	shards := psp.shards
-	var wg sync.WaitGroup
-	for i := range shards {
-		shard := &shards[i]
-		if shard.groupMapShards != nil {
-			continue
-		}
-		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			shard.moveGroupMapToShards()
-		}()
-	}
-	wg.Wait()
-	if needStop(psp.stopCh) {
-		return nil
-	}
+	shardsLen := len(shards)

 	// set cpusCount to the number of shards, since this is the concurrency limit set by the caller.
 	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8201
-	cpusCount := len(shards[0].groupMapShards)
+	cpusCount := len(shards)
+
+	if shardsLen == 1 {
+		var psms []*pipeStatsGroupMap
+		shard := &shards[0]
+		if shard.m.entriesCount() > 0 {
+			psms = append(psms, &shard.m)
+		}
+		return psms, nil
+	}
+
+	var wg sync.WaitGroup
+	perShardMaps := make([][]pipeStatsGroupMap, shardsLen)
+	for i := range shards {
+		wg.Add(1)
+		go func(idx int) {
+			defer wg.Done()
+
+			perCPU := make([]pipeStatsGroupMap, cpusCount)
+			for i := range perCPU {
+				perCPU[i].init(&shards[idx])
+			}
+
+			psm := &shards[idx].m
+
+			for n, psg := range psm.u64 {
+				if needStop(psp.stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].u64[n] = psg
+			}
+			for n, psg := range psm.negative64 {
+				if needStop(psp.stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].negative64[n] = psg
+			}
+			for k, psg := range psm.strings {
+				if needStop(psp.stopCh) {
+					return
+				}
+				h := xxhash.Sum64(bytesutil.ToUnsafeBytes(k))
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].strings[k] = psg
+			}
+
+			perShardMaps[idx] = perCPU
+			psm.reset()
+		}(i)
+	}
+	wg.Wait()
+	if needStop(psp.stopCh) {
+		return nil, nil
+	}
+
+	// Merge per-shard entries into perShardMaps[0]
 	for i := 0; i < cpusCount; i++ {
 		wg.Add(1)
 		go func(cpuIdx int) {
 			defer wg.Done()

-			psm := &shards[0].groupMapShards[cpuIdx]
-			for j := range shards[1:] {
-				src := &shards[1+j].groupMapShards[cpuIdx]
-				psm.mergeState(src, psp.stopCh)
-				src.reset()
+			psm := &perShardMaps[0][cpuIdx]
+			for _, perCPU := range perShardMaps[1:] {
+				psm.mergeState(&perCPU[cpuIdx], psp.stopCh)
+				perCPU[cpuIdx].reset()
 			}
 		}(i)
 	}
 	wg.Wait()
 	if needStop(psp.stopCh) {
-		return nil
+		return nil, nil
 	}

 	// Filter out maps without entries
-	psms := shards[0].groupMapShards
+	psms := perShardMaps[0]
 	result := make([]*pipeStatsGroupMap, 0, len(psms))
 	for i := range psms {
 		if psms[i].entriesCount() > 0 {
@@ -1007,7 +939,7 @@ func (psp *pipeStatsProcessor) mergeShardsParallel() []*pipeStatsGroupMap {
 		}
 	}

-	return result
+	return result, nil
 }

 func parsePipeStats(lex *lexer, needStatsKeyword bool) (pipe, error) {
--- a/lib/logstorage/pipe_top.go
+++ b/lib/logstorage/pipe_top.go
@@ -94,7 +94,7 @@ func (pt *pipeTop) newPipeProcessor(workersCount int, stopCh <-chan struct{}, ca
 				pt: pt,
 			},
 		}
-		shards[i].m.init(uint(workersCount), &shards[i].stateSizeBudget)
+		shards[i].m.init(&shards[i].stateSizeBudget)
 	}

 	ptp := &pipeTopProcessor{
@@ -136,7 +136,7 @@ type pipeTopProcessorShardNopad struct {
 	pt *pipeTop

 	// m holds per-value hits.
-	m hitsMapAdaptive
+	m hitsMap

 	// keyBuf is a temporary buffer for building keys for m.
 	keyBuf []byte
@@ -389,17 +389,21 @@ func (ptp *pipeTopProcessor) mergeShardsParallel() []*pipeTopEntry {
 		return nil
 	}

-	hmas := make([]*hitsMapAdaptive, 0, len(ptp.shards))
+	hms := make([]*hitsMap, 0, len(ptp.shards))
 	for i := range ptp.shards {
-		hma := &ptp.shards[i].m
-		if hma.entriesCount() > 0 {
-			hmas = append(hmas, hma)
+		hm := &ptp.shards[i].m
+		if hm.entriesCount() > 0 {
+			hms = append(hms, hm)
 		}
 	}

+	// set cpusCount to the number of shards, since this is the concurrency limit set by the caller.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8201
+	cpusCount := len(ptp.shards)
+
 	var entries []*pipeTopEntry
 	var entriesLock sync.Mutex
-	hitsMapMergeParallel(hmas, ptp.stopCh, func(hm *hitsMap) {
+	hitsMapMergeParallel(hms, cpusCount, ptp.stopCh, func(hm *hitsMap) {
 		es := getTopEntries(hm, limit, ptp.stopCh)
 		entriesLock.Lock()
 		entries = append(entries, es...)
--- a/lib/logstorage/pipe_uniq.go
+++ b/lib/logstorage/pipe_uniq.go
@@ -77,7 +77,7 @@ func (pu *pipeUniq) newPipeProcessor(workersCount int, stopCh <-chan struct{}, c
 				pu: pu,
 			},
 		}
-		shards[i].m.init(uint(workersCount), &shards[i].stateSizeBudget)
+		shards[i].m.init(&shards[i].stateSizeBudget)
 	}

 	pup := &pipeUniqProcessor{
@@ -119,7 +119,7 @@ type pipeUniqProcessorShardNopad struct {
 	pu *pipeUniq

 	// m holds per-row hits.
-	m hitsMapAdaptive
+	m hitsMap

 	// keyBuf is a temporary buffer for building keys for m.
 	keyBuf []byte
@@ -462,17 +462,21 @@ func (pup *pipeUniqProcessor) writeShardData(workerID uint, hm *hitsMap, resetHi
 }

 func (pup *pipeUniqProcessor) mergeShardsParallel() []*hitsMap {
-	hmas := make([]*hitsMapAdaptive, 0, len(pup.shards))
+	hms := make([]*hitsMap, 0, len(pup.shards))
 	for i := range pup.shards {
-		hma := &pup.shards[i].m
-		if hma.entriesCount() > 0 {
-			hmas = append(hmas, hma)
+		hm := &pup.shards[i].m
+		if hm.entriesCount() > 0 {
+			hms = append(hms, hm)
 		}
 	}

-	var hmsResult []*hitsMap
+	// set cpusCount to the number of shards, since this is the concurrency limit set by the caller.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8201
+	cpusCount := len(pup.shards)
+
+	hmsResult := make([]*hitsMap, 0, cpusCount)
 	var hmsLock sync.Mutex
-	hitsMapMergeParallel(hmas, pup.stopCh, func(hm *hitsMap) {
+	hitsMapMergeParallel(hms, cpusCount, pup.stopCh, func(hm *hitsMap) {
 		if hm.entriesCount() > 0 {
 			hmsLock.Lock()
 			hmsResult = append(hmsResult, hm)
--- a/lib/logstorage/stats_count_uniq.go
+++ b/lib/logstorage/stats_count_uniq.go
@@ -9,6 +9,7 @@ import (
 	"github.com/cespare/xxhash/v2"

 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
 )

@@ -32,40 +33,21 @@ func (su *statsCountUniq) updateNeededFields(neededFields fieldsSet) {
 func (su *statsCountUniq) newStatsProcessor(a *chunkedAllocator) statsProcessor {
 	sup := a.newStatsCountUniqProcessor()
 	sup.a = a
+	sup.m.init()
 	return sup
 }

 type statsCountUniqProcessor struct {
 	a *chunkedAllocator

-	// concurrency is the number of parallel workers to use when merging shards.
-	//
-	// this field must be updated by the caller before using statsCountUniqProcessor.
-	concurrency uint
-
-	// uniqValues is used for tracking small number of unique values until it reaches statsCountUniqValuesMaxLen.
-	// After that the unique values are tracked by shards.
-	uniqValues statsCountUniqSet
-
-	// shards are used for tracking big number of unique values.
-	//
-	// Every shard contains a share of unique values, which are merged in parallel at finalizeStats().
-	shards []statsCountUniqSet
-
-	// shardss is used for collecting shards from other statsCountUniqProcessor instances at mergeState().
-	shardss [][]statsCountUniqSet
+	m  statsCountUniqSet
+	ms []*statsCountUniqSet

 	columnValues [][]string
 	keyBuf       []byte
 	tmpNum       int
 }

-// the maximum number of values to track in statsCountUniqProcessor.uniqValues before switching to statsCountUniqProcessor.shards
-//
-// Too big value may slow down mergeState() across big number of CPU cores.
-// Too small value may significantly increase RAM usage when coun_uniq() is applied individually to big number of groups.
-const statsCountUniqValuesMaxLen = 4 << 10
-
 type statsCountUniqSet struct {
 	timestamps map[uint64]struct{}
 	u64        map[uint64]struct{}
@@ -74,7 +56,17 @@ type statsCountUniqSet struct {
 }

 func (sus *statsCountUniqSet) reset() {
-	*sus = statsCountUniqSet{}
+	sus.timestamps = nil
+	sus.u64 = nil
+	sus.negative64 = nil
+	sus.strings = nil
+}
+
+func (sus *statsCountUniqSet) init() {
+	sus.timestamps = make(map[uint64]struct{})
+	sus.u64 = make(map[uint64]struct{})
+	sus.negative64 = make(map[uint64]struct{})
+	sus.strings = make(map[string]struct{})
 }

 func (sus *statsCountUniqSet) entriesCount() uint64 {
@@ -83,33 +75,66 @@ func (sus *statsCountUniqSet) entriesCount() uint64 {
 }

 func (sus *statsCountUniqSet) updateStateTimestamp(ts int64) int {
-	return updateUint64Set(&sus.timestamps, uint64(ts))
+	_, ok := sus.timestamps[uint64(ts)]
+	if ok {
+		return 0
+	}
+	sus.timestamps[uint64(ts)] = struct{}{}
+	return 8
 }

 func (sus *statsCountUniqSet) updateStateUint64(n uint64) int {
-	return updateUint64Set(&sus.u64, n)
+	_, ok := sus.u64[n]
+	if ok {
+		return 0
+	}
+	sus.u64[n] = struct{}{}
+	return 8
+}
+
+func (sus *statsCountUniqSet) updateStateInt64(n int64) int {
+	if n >= 0 {
+		return sus.updateStateUint64(uint64(n))
+	}
+	return sus.updateStateNegativeInt64(n)
 }

 func (sus *statsCountUniqSet) updateStateNegativeInt64(n int64) int {
-	return updateUint64Set(&sus.negative64, uint64(n))
-}
-
-func (sus *statsCountUniqSet) updateStateString(a *chunkedAllocator, v []byte) int {
-	if _, ok := sus.strings[string(v)]; ok {
+	_, ok := sus.negative64[uint64(n)]
+	if ok {
 		return 0
 	}
-	vCopy := a.cloneBytesToString(v)
-	return setStringSet(&sus.strings, vCopy) + len(vCopy)
+	sus.negative64[uint64(n)] = struct{}{}
+	return 8
+}
+
+func (sus *statsCountUniqSet) updateStateGeneric(a *chunkedAllocator, v string) int {
+	if n, ok := tryParseUint64(v); ok {
+		return sus.updateStateUint64(n)
+	}
+	if len(v) > 0 && v[0] == '-' {
+		if n, ok := tryParseInt64(v); ok {
+			return sus.updateStateNegativeInt64(n)
+		}
+	}
+	return sus.updateStateString(a, v)
+}
+
+func (sus *statsCountUniqSet) updateStateString(a *chunkedAllocator, v string) int {
+	_, ok := sus.strings[v]
+	if ok {
+		return 0
+	}
+	vCopy := a.cloneString(v)
+	sus.strings[vCopy] = struct{}{}
+	return int(unsafe.Sizeof(v)) + len(v)
 }

 func (sus *statsCountUniqSet) mergeState(src *statsCountUniqSet, stopCh <-chan struct{}) {
-	mergeUint64Set(&sus.timestamps, src.timestamps, stopCh)
-	mergeUint64Set(&sus.u64, src.u64, stopCh)
-	mergeUint64Set(&sus.negative64, src.negative64, stopCh)
+	mergeUint64Set(sus.timestamps, src.timestamps, stopCh)
+	mergeUint64Set(sus.u64, src.u64, stopCh)
+	mergeUint64Set(sus.negative64, src.negative64, stopCh)

-	if sus.strings == nil {
-		sus.strings = make(map[string]struct{})
-	}
 	for k := range src.strings {
 		if needStop(stopCh) {
 			return
@@ -120,46 +145,7 @@ func (sus *statsCountUniqSet) mergeState(src *statsCountUniqSet, stopCh <-chan s
 	}
 }

-func updateUint64Set(dstPtr *map[uint64]struct{}, n uint64) int {
-	dst := *dstPtr
-	if _, ok := dst[n]; ok {
-		return 0
-	}
-	return setUint64Set(dstPtr, n)
-}
-
-func setUint64Set(dstPtr *map[uint64]struct{}, n uint64) int {
-	dst := *dstPtr
-	if dst == nil {
-		dst = map[uint64]struct{}{
-			n: {},
-		}
-		*dstPtr = dst
-		return int(unsafe.Sizeof(dst) + unsafe.Sizeof(n))
-	}
-	dst[n] = struct{}{}
-	return int(unsafe.Sizeof(n))
-}
-
-func setStringSet(dstPtr *map[string]struct{}, v string) int {
-	dst := *dstPtr
-	if dst == nil {
-		dst = map[string]struct{}{
-			v: {},
-		}
-		*dstPtr = dst
-		return int(unsafe.Sizeof(dst) + unsafe.Sizeof(v))
-	}
-	dst[v] = struct{}{}
-	return int(unsafe.Sizeof(v))
-}
-
-func mergeUint64Set(dstPtr *map[uint64]struct{}, src map[uint64]struct{}, stopCh <-chan struct{}) {
-	dst := *dstPtr
-	if dst == nil {
-		dst = make(map[uint64]struct{})
-		*dstPtr = dst
-	}
+func mergeUint64Set(dst map[uint64]struct{}, src map[uint64]struct{}, stopCh <-chan struct{}) {
 	for n := range src {
 		if needStop(stopCh) {
 			return
@@ -332,12 +318,12 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 	if c.isTime {
 		// Count unique timestamps
 		timestamps := br.getTimestamps()
-		for i := range timestamps {
+		for i, timestamp := range timestamps {
 			if i > 0 && timestamps[i-1] == timestamps[i] {
 				// This timestamp has been already counted.
 				continue
 			}
-			stateSizeIncrease += sup.updateStateTimestamp(timestamps[i])
+			stateSizeIncrease += sup.m.updateStateTimestamp(timestamp)
 		}
 		return stateSizeIncrease
 	}
@@ -370,7 +356,7 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 				continue
 			}
 			n := unmarshalUint8(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint16:
@@ -380,7 +366,7 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 				continue
 			}
 			n := unmarshalUint16(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint32:
@@ -390,7 +376,7 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 				continue
 			}
 			n := unmarshalUint32(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint64:
@@ -400,7 +386,7 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 				continue
 			}
 			n := unmarshalUint64(v)
-			stateSizeIncrease += sup.updateStateUint64(n)
+			stateSizeIncrease += sup.m.updateStateUint64(n)
 		}
 		return stateSizeIncrease
 	case valueTypeInt64:
@@ -410,7 +396,7 @@ func (sup *statsCountUniqProcessor) updateStatsForAllRowsSingleColumn(br *blockR
 				continue
 			}
 			n := unmarshalInt64(v)
-			stateSizeIncrease += sup.updateStateInt64(n)
+			stateSizeIncrease += sup.m.updateStateInt64(n)
 		}
 		return stateSizeIncrease
 	default:
@@ -435,7 +421,8 @@ func (sup *statsCountUniqProcessor) updateStatsForRowSingleColumn(br *blockResul
 	if c.isTime {
 		// Count unique timestamps
 		timestamps := br.getTimestamps()
-		return sup.updateStateTimestamp(timestamps[rowIdx])
+		timestamp := timestamps[rowIdx]
+		return sup.m.updateStateTimestamp(timestamp)
 	}
 	if c.isConst {
 		// count unique const values
@@ -462,27 +449,27 @@ func (sup *statsCountUniqProcessor) updateStatsForRowSingleColumn(br *blockResul
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint8(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint16:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint16(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint32:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint32(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint64:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint64(v)
-		return sup.updateStateUint64(n)
+		return sup.m.updateStateUint64(n)
 	case valueTypeInt64:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalInt64(v)
-		return sup.updateStateInt64(n)
+		return sup.m.updateStateInt64(n)
 	default:
 		// Count unique values for the given rowIdx
 		v := c.getValueAtRow(br, rowIdx)
@@ -501,32 +488,20 @@ func (sup *statsCountUniqProcessor) mergeState(sf statsFunc, sfp statsProcessor)
 	}

 	src := sfp.(*statsCountUniqProcessor)
-
-	if sup.shards == nil {
-		if src.shards == nil {
-			sup.uniqValues.mergeState(&src.uniqValues, nil)
-			src.uniqValues.reset()
-			sup.probablyMoveUniqValuesToShards()
-			return
-		}
-		sup.moveUniqValuesToShards()
+	if src.m.entriesCount() > 100_000 {
+		// Postpone merging too big number of items in parallel
+		sup.ms = append(sup.ms, &src.m)
+		return
 	}

-	if src.shards == nil {
-		src.moveUniqValuesToShards()
-	}
-	sup.shardss = append(sup.shardss, src.shards)
-	src.shards = nil
+	sup.m.mergeState(&src.m, nil)
 }

 func (sup *statsCountUniqProcessor) finalizeStats(sf statsFunc, dst []byte, stopCh <-chan struct{}) []byte {
-	n := sup.entriesCount()
-	if len(sup.shardss) > 0 {
-		if sup.shards != nil {
-			sup.shardss = append(sup.shardss, sup.shards)
-			sup.shards = nil
-		}
-		n = countUniqParallel(sup.shardss, stopCh)
+	n := sup.m.entriesCount()
+	if len(sup.ms) > 0 {
+		sup.ms = append(sup.ms, &sup.m)
+		n = countUniqParallel(sup.ms, stopCh)
 	}

 	su := sf.(*statsCountUniq)
@@ -536,22 +511,78 @@ func (sup *statsCountUniqProcessor) finalizeStats(sf statsFunc, dst []byte, stop
 	return strconv.AppendUint(dst, n, 10)
 }

-func countUniqParallel(shardss [][]statsCountUniqSet, stopCh <-chan struct{}) uint64 {
-	cpusCount := len(shardss[0])
-	perCPUCounts := make([]uint64, cpusCount)
+func countUniqParallel(ms []*statsCountUniqSet, stopCh <-chan struct{}) uint64 {
+	shardsLen := len(ms)
+	cpusCount := cgroup.AvailableCPUs()
+
 	var wg sync.WaitGroup
+	msShards := make([][]statsCountUniqSet, shardsLen)
+	for i := range msShards {
+		wg.Add(1)
+		go func(idx int) {
+			defer wg.Done()
+
+			perCPU := make([]statsCountUniqSet, cpusCount)
+			for i := range perCPU {
+				perCPU[i].init()
+			}
+
+			sus := ms[idx]
+
+			for ts := range sus.timestamps {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&ts)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].timestamps[ts] = struct{}{}
+			}
+			for n := range sus.u64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].u64[n] = struct{}{}
+			}
+			for n := range sus.negative64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].negative64[n] = struct{}{}
+			}
+			for k := range sus.strings {
+				if needStop(stopCh) {
+					return
+				}
+				h := xxhash.Sum64(bytesutil.ToUnsafeBytes(k))
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].strings[k] = struct{}{}
+			}
+
+			msShards[idx] = perCPU
+			ms[idx].reset()
+		}(i)
+	}
+	wg.Wait()
+
+	perCPUCounts := make([]uint64, cpusCount)
 	for i := range perCPUCounts {
 		wg.Add(1)
 		go func(cpuIdx int) {
 			defer wg.Done()

-			sus := &shardss[0][cpuIdx]
-			for _, perCPU := range shardss[1:] {
+			sus := &msShards[0][cpuIdx]
+			for _, perCPU := range msShards[1:] {
 				sus.mergeState(&perCPU[cpuIdx], stopCh)
 				perCPU[cpuIdx].reset()
 			}
 			perCPUCounts[cpuIdx] = sus.entriesCount()
-			sus.reset()
 		}(i)
 	}
 	wg.Wait()
@@ -563,130 +594,12 @@ func countUniqParallel(shardss [][]statsCountUniqSet, stopCh <-chan struct{}) ui
 	return countTotal
 }

-func (sup *statsCountUniqProcessor) entriesCount() uint64 {
-	if sup.shards == nil {
-		return sup.uniqValues.entriesCount()
-	}
-	n := uint64(0)
-	shards := sup.shards
-	for i := range shards {
-		n += shards[i].entriesCount()
-	}
-	return n
-}
-
 func (sup *statsCountUniqProcessor) updateStateGeneric(v string) int {
-	if n, ok := tryParseUint64(v); ok {
-		return sup.updateStateUint64(n)
-	}
-	if len(v) > 0 && v[0] == '-' {
-		if n, ok := tryParseInt64(v); ok {
-			return sup.updateStateNegativeInt64(n)
-		}
-	}
-	return sup.updateStateString(bytesutil.ToUnsafeBytes(v))
-}
-
-func (sup *statsCountUniqProcessor) updateStateInt64(n int64) int {
-	if n >= 0 {
-		return sup.updateStateUint64(uint64(n))
-	}
-	return sup.updateStateNegativeInt64(n)
+	return sup.m.updateStateGeneric(sup.a, v)
 }

 func (sup *statsCountUniqProcessor) updateStateString(v []byte) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateString(sup.a, v)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByString(v)
-	return sus.updateStateString(sup.a, v)
-}
-
-func (sup *statsCountUniqProcessor) updateStateTimestamp(ts int64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateTimestamp(ts)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(uint64(ts))
-	return sus.updateStateTimestamp(ts)
-}
-
-func (sup *statsCountUniqProcessor) updateStateUint64(n uint64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateUint64(n)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(n)
-	return sus.updateStateUint64(n)
-}
-
-func (sup *statsCountUniqProcessor) updateStateNegativeInt64(n int64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateNegativeInt64(n)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(uint64(n))
-	return sus.updateStateNegativeInt64(n)
-}
-
-func (sup *statsCountUniqProcessor) probablyMoveUniqValuesToShards() int {
-	if sup.uniqValues.entriesCount() < statsCountUniqValuesMaxLen {
-		return 0
-	}
-	return sup.moveUniqValuesToShards()
-}
-
-func (sup *statsCountUniqProcessor) moveUniqValuesToShards() int {
-	cpusCount := sup.concurrency
-	bytesAllocatedPrev := sup.a.bytesAllocated
-	sup.shards = sup.a.newStatsCountUniqSets(cpusCount)
-	stateSizeIncrease := sup.a.bytesAllocated - bytesAllocatedPrev
-
-	for ts := range sup.uniqValues.timestamps {
-		sus := sup.getShardByUint64(ts)
-		setUint64Set(&sus.timestamps, ts)
-	}
-	for n := range sup.uniqValues.u64 {
-		sus := sup.getShardByUint64(n)
-		setUint64Set(&sus.u64, n)
-	}
-	for n := range sup.uniqValues.negative64 {
-		sus := sup.getShardByUint64(n)
-		setUint64Set(&sus.negative64, n)
-	}
-	for s := range sup.uniqValues.strings {
-		sus := sup.getShardByString(bytesutil.ToUnsafeBytes(s))
-		setStringSet(&sus.strings, s)
-	}
-
-	sup.uniqValues.reset()
-
-	return stateSizeIncrease
-}
-
-func (sup *statsCountUniqProcessor) getShardByString(v []byte) *statsCountUniqSet {
-	h := xxhash.Sum64(v)
-	cpuIdx := h % uint64(len(sup.shards))
-	return &sup.shards[cpuIdx]
-}
-
-func (sup *statsCountUniqProcessor) getShardByUint64(n uint64) *statsCountUniqSet {
-	h := fastHashUint64(n)
-	cpuIdx := h % uint64(len(sup.shards))
-	return &sup.shards[cpuIdx]
+	return sup.m.updateStateString(sup.a, bytesutil.ToUnsafeString(v))
 }

 func (sup *statsCountUniqProcessor) limitReached(su *statsCountUniq) bool {
@@ -694,7 +607,7 @@ func (sup *statsCountUniqProcessor) limitReached(su *statsCountUniq) bool {
 	if limit <= 0 {
 		return false
 	}
-	return sup.entriesCount() > limit
+	return sup.m.entriesCount() > limit
 }

 func parseStatsCountUniq(lex *lexer) (*statsCountUniq, error) {
--- a/lib/logstorage/stats_count_uniq_hash.go
+++ b/lib/logstorage/stats_count_uniq_hash.go
@@ -4,10 +4,12 @@ import (
 	"fmt"
 	"strconv"
 	"sync"
+	"unsafe"

 	"github.com/cespare/xxhash/v2"

 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/encoding"
 )

@@ -31,40 +33,21 @@ func (su *statsCountUniqHash) updateNeededFields(neededFields fieldsSet) {
 func (su *statsCountUniqHash) newStatsProcessor(a *chunkedAllocator) statsProcessor {
 	sup := a.newStatsCountUniqHashProcessor()
 	sup.a = a
+	sup.m.init()
 	return sup
 }

 type statsCountUniqHashProcessor struct {
 	a *chunkedAllocator

-	// concurrency is the number of parallel workers to use when merging shards.
-	//
-	// this field must be updated by the caller before using statsCountUniqHashProcessor.
-	concurrency uint
-
-	// uniqValues is used for tracking small number of unique values until it reaches statsCountUniqHashValuesMaxLen.
-	// After that the unique values are tracked by shards.
-	uniqValues statsCountUniqHashSet
-
-	// shards are used for tracking big number of unique values.
-	//
-	// Every shard contains a share of unique values, which are merged in parallel at finalizeStats().
-	shards []statsCountUniqHashSet
-
-	// shardss is used for collecting shards from other statsCountUniqProcessor instances at mergeState().
-	shardss [][]statsCountUniqHashSet
+	m  statsCountUniqHashSet
+	ms []*statsCountUniqHashSet

 	columnValues [][]string
 	keyBuf       []byte
 	tmpNum       int
 }

-// the maximum number of values to track in statsCountUniqHashProcessor.uniqValues before switching to statsCountUniqHashProcessor.shards
-//
-// Too big value may slow down mergeState() across big number of CPU cores.
-// Too small value may significantly increase RAM usage when coun_uniq_hash() is applied individually to big number of groups.
-const statsCountUniqHashValuesMaxLen = 4 << 10
-
 type statsCountUniqHashSet struct {
 	timestamps map[uint64]struct{}
 	u64        map[uint64]struct{}
@@ -73,7 +56,17 @@ type statsCountUniqHashSet struct {
 }

 func (sus *statsCountUniqHashSet) reset() {
-	*sus = statsCountUniqHashSet{}
+	sus.timestamps = nil
+	sus.u64 = nil
+	sus.negative64 = nil
+	sus.strings = nil
+}
+
+func (sus *statsCountUniqHashSet) init() {
+	sus.timestamps = make(map[uint64]struct{})
+	sus.u64 = make(map[uint64]struct{})
+	sus.negative64 = make(map[uint64]struct{})
+	sus.strings = make(map[uint64]struct{})
 }

 func (sus *statsCountUniqHashSet) entriesCount() uint64 {
@@ -82,26 +75,66 @@ func (sus *statsCountUniqHashSet) entriesCount() uint64 {
 }

 func (sus *statsCountUniqHashSet) updateStateTimestamp(ts int64) int {
-	return updateUint64Set(&sus.timestamps, uint64(ts))
+	_, ok := sus.timestamps[uint64(ts)]
+	if ok {
+		return 0
+	}
+	sus.timestamps[uint64(ts)] = struct{}{}
+	return 8
 }

 func (sus *statsCountUniqHashSet) updateStateUint64(n uint64) int {
-	return updateUint64Set(&sus.timestamps, n)
+	_, ok := sus.u64[n]
+	if ok {
+		return 0
+	}
+	sus.u64[n] = struct{}{}
+	return 8
+}
+
+func (sus *statsCountUniqHashSet) updateStateInt64(n int64) int {
+	if n >= 0 {
+		return sus.updateStateUint64(uint64(n))
+	}
+	return sus.updateStateNegativeInt64(n)
 }

 func (sus *statsCountUniqHashSet) updateStateNegativeInt64(n int64) int {
-	return updateUint64Set(&sus.negative64, uint64(n))
+	_, ok := sus.negative64[uint64(n)]
+	if ok {
+		return 0
+	}
+	sus.negative64[uint64(n)] = struct{}{}
+	return 8
 }

-func (sus *statsCountUniqHashSet) updateStateStringHash(h uint64) int {
-	return updateUint64Set(&sus.strings, h)
+func (sus *statsCountUniqHashSet) updateStateGeneric(v string) int {
+	if n, ok := tryParseUint64(v); ok {
+		return sus.updateStateUint64(n)
+	}
+	if len(v) > 0 && v[0] == '-' {
+		if n, ok := tryParseInt64(v); ok {
+			return sus.updateStateNegativeInt64(n)
+		}
+	}
+	return sus.updateStateString(bytesutil.ToUnsafeBytes(v))
+}
+
+func (sus *statsCountUniqHashSet) updateStateString(v []byte) int {
+	h := xxhash.Sum64(v)
+	_, ok := sus.strings[h]
+	if ok {
+		return 0
+	}
+	sus.strings[h] = struct{}{}
+	return 8
 }

 func (sus *statsCountUniqHashSet) mergeState(src *statsCountUniqHashSet, stopCh <-chan struct{}) {
-	mergeUint64Set(&sus.timestamps, src.timestamps, stopCh)
-	mergeUint64Set(&sus.u64, src.u64, stopCh)
-	mergeUint64Set(&sus.negative64, src.negative64, stopCh)
-	mergeUint64Set(&sus.strings, src.strings, stopCh)
+	mergeUint64Set(sus.timestamps, src.timestamps, stopCh)
+	mergeUint64Set(sus.u64, src.u64, stopCh)
+	mergeUint64Set(sus.negative64, src.negative64, stopCh)
+	mergeUint64Set(sus.strings, src.strings, stopCh)
 }

 func (sup *statsCountUniqHashProcessor) updateStatsForAllRows(sf statsFunc, br *blockResult) int {
@@ -153,7 +186,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRows(sf statsFunc, br *
 				// Do not count empty values
 				continue
 			}
-			stateSizeIncrease += sup.updateStateString(keyBuf)
+			stateSizeIncrease += sup.m.updateStateString(keyBuf)
 		}
 		sup.keyBuf = keyBuf
 		return stateSizeIncrease
@@ -200,7 +233,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRows(sf statsFunc, br *
 			// Do not count empty values
 			continue
 		}
-		stateSizeIncrease += sup.updateStateString(keyBuf)
+		stateSizeIncrease += sup.m.updateStateString(keyBuf)
 	}
 	sup.keyBuf = keyBuf
 	return stateSizeIncrease
@@ -233,7 +266,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRow(sf statsFunc, br *bloc
 			// Do not count empty values
 			return 0
 		}
-		return sup.updateStateString(keyBuf)
+		return sup.m.updateStateString(keyBuf)
 	}
 	if len(fields) == 1 {
 		// Fast path for a single column.
@@ -257,7 +290,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRow(sf statsFunc, br *bloc
 		// Do not count empty values
 		return 0
 	}
-	return sup.updateStateString(keyBuf)
+	return sup.m.updateStateString(keyBuf)
 }

 func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *blockResult, columnName string) int {
@@ -266,12 +299,12 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 	if c.isTime {
 		// Count unique timestamps
 		timestamps := br.getTimestamps()
-		for i := range timestamps {
+		for i, timestamp := range timestamps {
 			if i > 0 && timestamps[i-1] == timestamps[i] {
 				// This timestamp has been already counted.
 				continue
 			}
-			stateSizeIncrease += sup.updateStateTimestamp(timestamps[i])
+			stateSizeIncrease += sup.m.updateStateTimestamp(timestamp)
 		}
 		return stateSizeIncrease
 	}
@@ -282,7 +315,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 			// Do not count empty values
 			return 0
 		}
-		return sup.updateStateGeneric(v)
+		return sup.m.updateStateGeneric(v)
 	}

 	switch c.valueType {
@@ -294,7 +327,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				// Do not count empty values
 				return
 			}
-			sup.tmpNum += sup.updateStateGeneric(v)
+			sup.tmpNum += sup.m.updateStateGeneric(v)
 		})
 		return sup.tmpNum
 	case valueTypeUint8:
@@ -304,7 +337,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				continue
 			}
 			n := unmarshalUint8(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint16:
@@ -314,7 +347,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				continue
 			}
 			n := unmarshalUint16(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint32:
@@ -324,7 +357,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				continue
 			}
 			n := unmarshalUint32(v)
-			stateSizeIncrease += sup.updateStateUint64(uint64(n))
+			stateSizeIncrease += sup.m.updateStateUint64(uint64(n))
 		}
 		return stateSizeIncrease
 	case valueTypeUint64:
@@ -334,7 +367,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				continue
 			}
 			n := unmarshalUint64(v)
-			stateSizeIncrease += sup.updateStateUint64(n)
+			stateSizeIncrease += sup.m.updateStateUint64(n)
 		}
 		return stateSizeIncrease
 	case valueTypeInt64:
@@ -344,7 +377,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				continue
 			}
 			n := unmarshalInt64(v)
-			stateSizeIncrease += sup.updateStateInt64(n)
+			stateSizeIncrease += sup.m.updateStateInt64(n)
 		}
 		return stateSizeIncrease
 	default:
@@ -359,7 +392,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForAllRowsSingleColumn(br *bl
 				// This value has been already counted.
 				continue
 			}
-			stateSizeIncrease += sup.updateStateGeneric(v)
+			stateSizeIncrease += sup.m.updateStateGeneric(v)
 		}
 		return stateSizeIncrease
 	}
@@ -370,7 +403,8 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRowSingleColumn(br *blockR
 	if c.isTime {
 		// Count unique timestamps
 		timestamps := br.getTimestamps()
-		return sup.updateStateTimestamp(timestamps[rowIdx])
+		timestamp := timestamps[rowIdx]
+		return sup.m.updateStateTimestamp(timestamp)
 	}
 	if c.isConst {
 		// count unique const values
@@ -379,7 +413,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRowSingleColumn(br *blockR
 			// Do not count empty values
 			return 0
 		}
-		return sup.updateStateGeneric(v)
+		return sup.m.updateStateGeneric(v)
 	}

 	switch c.valueType {
@@ -392,32 +426,32 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRowSingleColumn(br *blockR
 			// Do not count empty values
 			return 0
 		}
-		return sup.updateStateGeneric(v)
+		return sup.m.updateStateGeneric(v)
 	case valueTypeUint8:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint8(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint16:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint16(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint32:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint32(v)
-		return sup.updateStateUint64(uint64(n))
+		return sup.m.updateStateUint64(uint64(n))
 	case valueTypeUint64:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalUint64(v)
-		return sup.updateStateUint64(n)
+		return sup.m.updateStateUint64(n)
 	case valueTypeInt64:
 		values := c.getValuesEncoded(br)
 		v := values[rowIdx]
 		n := unmarshalInt64(v)
-		return sup.updateStateInt64(n)
+		return sup.m.updateStateInt64(n)
 	default:
 		// Count unique values for the given rowIdx
 		v := c.getValueAtRow(br, rowIdx)
@@ -425,7 +459,7 @@ func (sup *statsCountUniqHashProcessor) updateStatsForRowSingleColumn(br *blockR
 			// Do not count empty values
 			return 0
 		}
-		return sup.updateStateGeneric(v)
+		return sup.m.updateStateGeneric(v)
 	}
 }

@@ -436,57 +470,100 @@ func (sup *statsCountUniqHashProcessor) mergeState(sf statsFunc, sfp statsProces
 	}

 	src := sfp.(*statsCountUniqHashProcessor)
-
-	if sup.shards == nil {
-		if src.shards == nil {
-			sup.uniqValues.mergeState(&src.uniqValues, nil)
-			src.uniqValues.reset()
-			sup.probablyMoveUniqValuesToShards()
-			return
-		}
-		sup.moveUniqValuesToShards()
+	if src.m.entriesCount() > 100_000 {
+		// Postpone merging too big number of items in parallel
+		sup.ms = append(sup.ms, &src.m)
+		return
 	}

-	if src.shards == nil {
-		src.moveUniqValuesToShards()
-	}
-	sup.shardss = append(sup.shardss, src.shards)
-	src.shards = nil
+	sup.m.mergeState(&src.m, nil)
 }

 func (sup *statsCountUniqHashProcessor) finalizeStats(sf statsFunc, dst []byte, stopCh <-chan struct{}) []byte {
-	n := sup.entriesCount()
-	if len(sup.shardss) > 0 {
-		if sup.shards != nil {
-			sup.shardss = append(sup.shardss, sup.shards)
-			sup.shards = nil
-		}
-		n = countUniqHashParallel(sup.shardss, stopCh)
+	su := sf.(*statsCountUniqHash)
+	n := sup.m.entriesCount()
+	if len(sup.ms) > 0 {
+		sup.ms = append(sup.ms, &sup.m)
+		n = countUniqHashParallel(sup.ms, stopCh)
 	}

-	su := sf.(*statsCountUniqHash)
 	if limit := su.limit; limit > 0 && n > limit {
 		n = limit
 	}
 	return strconv.AppendUint(dst, n, 10)
 }

-func countUniqHashParallel(shardss [][]statsCountUniqHashSet, stopCh <-chan struct{}) uint64 {
-	cpusCount := len(shardss[0])
-	perCPUCounts := make([]uint64, cpusCount)
+func countUniqHashParallel(ms []*statsCountUniqHashSet, stopCh <-chan struct{}) uint64 {
+	shardsLen := len(ms)
+	cpusCount := cgroup.AvailableCPUs()
+
 	var wg sync.WaitGroup
+	msShards := make([][]statsCountUniqHashSet, shardsLen)
+	for i := range msShards {
+		wg.Add(1)
+		go func(idx int) {
+			defer wg.Done()
+
+			perCPU := make([]statsCountUniqHashSet, cpusCount)
+			for i := range perCPU {
+				perCPU[i].init()
+			}
+
+			sus := ms[idx]
+
+			for ts := range sus.timestamps {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&ts)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].timestamps[ts] = struct{}{}
+			}
+			for n := range sus.u64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].u64[n] = struct{}{}
+			}
+			for n := range sus.negative64 {
+				if needStop(stopCh) {
+					return
+				}
+				k := unsafe.Slice((*byte)(unsafe.Pointer(&n)), 8)
+				h := xxhash.Sum64(k)
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].negative64[n] = struct{}{}
+			}
+			for h := range sus.strings {
+				if needStop(stopCh) {
+					return
+				}
+				cpuIdx := h % uint64(len(perCPU))
+				perCPU[cpuIdx].strings[h] = struct{}{}
+			}
+
+			msShards[idx] = perCPU
+			ms[idx].reset()
+		}(i)
+	}
+	wg.Wait()
+
+	perCPUCounts := make([]uint64, cpusCount)
 	for i := range perCPUCounts {
 		wg.Add(1)
 		go func(cpuIdx int) {
 			defer wg.Done()

-			sus := &shardss[0][cpuIdx]
-			for _, perCPU := range shardss[1:] {
+			sus := &msShards[0][cpuIdx]
+			for _, perCPU := range msShards[1:] {
 				sus.mergeState(&perCPU[cpuIdx], stopCh)
 				perCPU[cpuIdx].reset()
 			}
 			perCPUCounts[cpuIdx] = sus.entriesCount()
-			sus.reset()
 		}(i)
 	}
 	wg.Wait()
@@ -498,142 +575,12 @@ func countUniqHashParallel(shardss [][]statsCountUniqHashSet, stopCh <-chan stru
 	return countTotal
 }

-func (sup *statsCountUniqHashProcessor) entriesCount() uint64 {
-	if sup.shards == nil {
-		return sup.uniqValues.entriesCount()
-	}
-	n := uint64(0)
-	shards := sup.shards
-	for i := range shards {
-		n += shards[i].entriesCount()
-	}
-	return n
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateGeneric(v string) int {
-	if n, ok := tryParseUint64(v); ok {
-		return sup.updateStateUint64(n)
-	}
-	if len(v) > 0 && v[0] == '-' {
-		if n, ok := tryParseInt64(v); ok {
-			return sup.updateStateNegativeInt64(n)
-		}
-	}
-	return sup.updateStateString(bytesutil.ToUnsafeBytes(v))
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateInt64(n int64) int {
-	if n >= 0 {
-		return sup.updateStateUint64(uint64(n))
-	}
-	return sup.updateStateNegativeInt64(n)
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateString(v []byte) int {
-	h := xxhash.Sum64(v)
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateStringHash(h)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	return sup.updateStateStringHash(h)
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateStringHash(h uint64) int {
-	sus := sup.getShardByStringHash(h)
-	return sus.updateStateStringHash(h)
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateTimestamp(ts int64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateTimestamp(ts)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(uint64(ts))
-	return sus.updateStateTimestamp(ts)
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateUint64(n uint64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateUint64(n)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(n)
-	return sus.updateStateUint64(n)
-}
-
-func (sup *statsCountUniqHashProcessor) updateStateNegativeInt64(n int64) int {
-	if sup.shards == nil {
-		stateSizeIncrease := sup.uniqValues.updateStateNegativeInt64(n)
-		if stateSizeIncrease > 0 {
-			stateSizeIncrease += sup.probablyMoveUniqValuesToShards()
-		}
-		return stateSizeIncrease
-	}
-	sus := sup.getShardByUint64(uint64(n))
-	return sus.updateStateNegativeInt64(n)
-}
-
-func (sup *statsCountUniqHashProcessor) probablyMoveUniqValuesToShards() int {
-	if sup.uniqValues.entriesCount() < statsCountUniqHashValuesMaxLen {
-		return 0
-	}
-	return sup.moveUniqValuesToShards()
-}
-
-func (sup *statsCountUniqHashProcessor) moveUniqValuesToShards() int {
-	cpusCount := sup.concurrency
-	bytesAllocatedPrev := sup.a.bytesAllocated
-	sup.shards = sup.a.newStatsCountUniqHashSets(cpusCount)
-	stateSizeIncrease := sup.a.bytesAllocated - bytesAllocatedPrev
-
-	for ts := range sup.uniqValues.timestamps {
-		sus := sup.getShardByUint64(ts)
-		setUint64Set(&sus.timestamps, ts)
-	}
-	for n := range sup.uniqValues.u64 {
-		sus := sup.getShardByUint64(n)
-		setUint64Set(&sus.u64, n)
-	}
-	for n := range sup.uniqValues.negative64 {
-		sus := sup.getShardByUint64(n)
-		setUint64Set(&sus.negative64, n)
-	}
-	for h := range sup.uniqValues.strings {
-		sus := sup.getShardByStringHash(h)
-		setUint64Set(&sus.strings, h)
-	}
-
-	sup.uniqValues.reset()
-
-	return stateSizeIncrease
-}
-
-func (sup *statsCountUniqHashProcessor) getShardByStringHash(h uint64) *statsCountUniqHashSet {
-	cpuIdx := h % uint64(len(sup.shards))
-	return &sup.shards[cpuIdx]
-}
-
-func (sup *statsCountUniqHashProcessor) getShardByUint64(n uint64) *statsCountUniqHashSet {
-	h := fastHashUint64(n)
-	cpuIdx := h % uint64(len(sup.shards))
-	return &sup.shards[cpuIdx]
-}
-
 func (sup *statsCountUniqHashProcessor) limitReached(su *statsCountUniqHash) bool {
 	limit := su.limit
 	if limit <= 0 {
 		return false
 	}
-	return sup.entriesCount() > limit
+	return sup.m.entriesCount() > limit
 }

 func parseStatsCountUniqHash(lex *lexer) (*statsCountUniqHash, error) {
@@ -655,10 +602,3 @@ func parseStatsCountUniqHash(lex *lexer) (*statsCountUniqHash, error) {
 	}
 	return su, nil
 }
-
-func fastHashUint64(x uint64) uint64 {
-	x ^= x >> 12 // a
-	x ^= x << 25 // b
-	x ^= x >> 27 // c
-	return x * 2685821657736338717
-}
--- a/lib/logstorage/stats_uniq_values.go
+++ b/lib/logstorage/stats_uniq_values.go
@@ -11,6 +11,7 @@ import (
 	"github.com/valyala/quicktemplate"

 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
 )

 type statsUniqValues struct {
@@ -40,11 +41,6 @@ func (su *statsUniqValues) newStatsProcessor(a *chunkedAllocator) statsProcessor
 type statsUniqValuesProcessor struct {
 	a *chunkedAllocator

-	// concurrency is the number of parallel workers to use when merging shards.
-	//
-	// this field must be updated by the caller before using statsUniqValuesProcessor.
-	concurrency uint
-
 	m  map[string]struct{}
 	ms []map[string]struct{}
 }
@@ -166,7 +162,7 @@ func (sup *statsUniqValuesProcessor) finalizeStats(sf statsFunc, dst []byte, sto
 	var items []string
 	if len(sup.ms) > 0 {
 		sup.ms = append(sup.ms, sup.m)
-		items = mergeSetsParallel(sup.ms, sup.concurrency, stopCh)
+		items = mergeSetsParallel(sup.ms, stopCh)
 	} else {
 		items = setToSortedSlice(sup.m)
 	}
@@ -178,9 +174,9 @@ func (sup *statsUniqValuesProcessor) finalizeStats(sf statsFunc, dst []byte, sto
 	return marshalJSONArray(dst, items)
 }

-func mergeSetsParallel(ms []map[string]struct{}, concurrency uint, stopCh <-chan struct{}) []string {
+func mergeSetsParallel(ms []map[string]struct{}, stopCh <-chan struct{}) []string {
 	shardsLen := len(ms)
-	cpusCount := concurrency
+	cpusCount := cgroup.AvailableCPUs()

 	var wg sync.WaitGroup
 	msShards := make([][]map[string]struct{}, shardsLen)