From 243037823ab7b760d6dd75f2a8617cdfcaf8b0a2 Mon Sep 17 00:00:00 2001 From: Max Kotliar Date: Tue, 12 May 2026 15:42:55 +0300 Subject: [PATCH] app/vmagent: fix rare hash collision in getLabelsHash (#10937) Add '=' separator between label name and value when computing the hash to prevent false collisions, like {a="bc"} and {ab="c"} hashing to the same value. getLabelsHashForShard is added to avoid sharding disruptions in vmagent (-remoteWrite.shardByURL=true mode). The function preserves previous behavior, without '=' between name and value. PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10937 --- app/vmagent/remotewrite/remotewrite.go | 19 ++++++++++++++++++- app/vmagent/remotewrite/remotewrite_test.go | 10 ++++++---- docs/victoriametrics/changelog/CHANGELOG.md | 1 + lib/promscrape/scrapework.go | 1 + lib/promscrape/scrapework_test.go | 2 +- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/app/vmagent/remotewrite/remotewrite.go b/app/vmagent/remotewrite/remotewrite.go index 1e0abbca73..7917993e46 100644 --- a/app/vmagent/remotewrite/remotewrite.go +++ b/app/vmagent/remotewrite/remotewrite.go @@ -699,7 +699,7 @@ func shardAmountRemoteWriteCtx(tssBlock []prompb.TimeSeries, shards [][]prompb.T } tmpLabels.Labels = hashLabels } - h := getLabelsHash(hashLabels) + h := getLabelsHashForShard(hashLabels) // Get the rwctxIdx through consistent hashing and then map it to the index in shards. // The rwctxIdx is not always equal to the shardIdx, for example, when some rwctx are not available. @@ -790,11 +790,28 @@ var ( dailySeriesLimitRowsDropped = metrics.NewCounter(`vmagent_daily_series_limit_rows_dropped_total`) ) +// getLabelsHashForShard is a separate function from getLabelsHash because +// it omits the '=' separator between label name and value for backward compatibility. +// Changing it would re-shard all series across remoteWrite targets. +func getLabelsHashForShard(labels []prompb.Label) uint64 { + bb := labelsHashBufPool.Get() + b := bb.B[:0] + for _, label := range labels { + b = append(b, label.Name...) + b = append(b, label.Value...) + } + h := xxhash.Sum64(b) + bb.B = b + labelsHashBufPool.Put(bb) + return h +} + func getLabelsHash(labels []prompb.Label) uint64 { bb := labelsHashBufPool.Get() b := bb.B[:0] for _, label := range labels { b = append(b, label.Name...) + b = append(b, '=') b = append(b, label.Value...) } h := xxhash.Sum64(b) diff --git a/app/vmagent/remotewrite/remotewrite_test.go b/app/vmagent/remotewrite/remotewrite_test.go index c0926a7ce4..f37ee18cd3 100644 --- a/app/vmagent/remotewrite/remotewrite_test.go +++ b/app/vmagent/remotewrite/remotewrite_test.go @@ -25,7 +25,7 @@ func TestGetLabelsHash_Distribution(t *testing.T) { t.Helper() // Distribute itemsCount hashes returned by getLabelsHash() across bucketsCount buckets. - itemsCount := 1_000 * bucketsCount + itemsCount := 10_000 * bucketsCount m := make([]int, bucketsCount) var labels []prompb.Label for i := range itemsCount { @@ -44,10 +44,12 @@ func TestGetLabelsHash_Distribution(t *testing.T) { } // Verify that the distribution is even - expectedItemsPerBucket := itemsCount / bucketsCount + expectedItemsPerBucket := float64(itemsCount / bucketsCount) + allowedDeviation := math.Round(float64(expectedItemsPerBucket) * 0.04) for _, n := range m { - if math.Abs(1-float64(n)/float64(expectedItemsPerBucket)) > 0.04 { - t.Fatalf("unexpected items in the bucket for %d buckets; got %d; want around %d", bucketsCount, n, expectedItemsPerBucket) + if math.Abs(expectedItemsPerBucket-float64(n)) > allowedDeviation { + t.Fatalf("unexpected items in the bucket for %d buckets; got %d; want in range [%.0f, %.0f]", + bucketsCount, n, expectedItemsPerBucket-allowedDeviation, expectedItemsPerBucket+allowedDeviation) } } } diff --git a/docs/victoriametrics/changelog/CHANGELOG.md b/docs/victoriametrics/changelog/CHANGELOG.md index 8a67a81290..228530b2a0 100644 --- a/docs/victoriametrics/changelog/CHANGELOG.md +++ b/docs/victoriametrics/changelog/CHANGELOG.md @@ -30,6 +30,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel * BUGFIX: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): stop emitting stale values for `quantiles(...)` outputs when a time series has no samples during the current aggregation interval. Thanks to @alexei38 for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10918). * BUGFIX: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): extend delay on aggregation windows flush by the biggest lag among pushed samples. Before, the delay was calculated as 95th percentile across samples, which could underrepresent outliers and reject them from aggregation as "too old". See [#10402](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10402). +* BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): fix a bug in [cardinality limiters](https://docs.victoriametrics.com/victoriametrics/vmagent/#cardinality-limiter) where series with different labels, like `{a="bc"}` and `{ab="c"}`, could be incorrectly treated as identical and dropped. See [#10937](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10937). ## [v1.143.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.143.0) diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go index b33ccb3567..2e201d5bde 100644 --- a/lib/promscrape/scrapework.go +++ b/lib/promscrape/scrapework.go @@ -961,6 +961,7 @@ func getLabelsHash(labels []prompb.Label) uint64 { for _, label := range labels { b = append(b, label.Name...) + b = append(b, '=') b = append(b, label.Value...) } h := xxhash.Sum64(b) diff --git a/lib/promscrape/scrapework_test.go b/lib/promscrape/scrapework_test.go index 9c0f971954..9fe329d14b 100644 --- a/lib/promscrape/scrapework_test.go +++ b/lib/promscrape/scrapework_test.go @@ -700,7 +700,7 @@ func TestScrapeWorkScrapeInternalStreamConcurrency(t *testing.T) { StreamParse: true, ScrapeTimeout: time.Second * 42, SeriesLimit: 4000, - }, 3, 4015, 2, 50) + }, 3, 4012, 2, 50) } func TestScrapeWorkScrapeInternalWithMaxScrapeSize(t *testing.T) {