mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-17 00:26:36 +03:00
app/vmagent: fix sharding correctness when disableOnDiskQueue is set
When -remoteWrite.shardByURL is enabled and one of the remote write targets has -remoteWrite.disableOnDiskQueue set and becomes blocked, samples could be rerouted to other shards, breaking the sharding guarantee. Fix this by always using rwctxsGlobal in sharding mode. Add a startup check that requires -remoteWrite.disableOnDiskQueue to be configured uniformly across all targets when shardByURL is enabled. Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10507
This commit is contained in:
@@ -167,6 +167,14 @@ func Init() {
|
||||
if len(*remoteWriteURLs) == 0 {
|
||||
logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set")
|
||||
}
|
||||
if *shardByURL {
|
||||
copyDisableOnDiskQueue := append([]bool{}, *disableOnDiskQueue...)
|
||||
if len(slices.Compact(copyDisableOnDiskQueue)) != 1 {
|
||||
logger.Fatalf("all -remoteWrite.url targets must have the same -remoteWrite.disableOnDiskQueue setting when -remoteWrite.shardByURL is enabled; " +
|
||||
"either enable or disable -remoteWrite.disableOnDiskQueue for all targets")
|
||||
}
|
||||
}
|
||||
|
||||
if limit := getMaxHourlySeries(); limit > 0 {
|
||||
hourlySeriesLimiter = bloomfilter.NewLimiter(limit, time.Hour)
|
||||
_ = metrics.NewGauge(`vmagent_hourly_series_limit_max_series`, func() float64 {
|
||||
@@ -499,6 +507,9 @@ func tryPush(at *auth.Token, wr *prompb.WriteRequest, forceDropSamplesOnFailure
|
||||
//
|
||||
// calculateHealthyRwctxIdx will rely on the order of rwctx to be in ascending order.
|
||||
func getEligibleRemoteWriteCtxs(tss []prompb.TimeSeries, forceDropSamplesOnFailure bool) ([]*remoteWriteCtx, bool) {
|
||||
if *shardByURL {
|
||||
return rwctxsGlobal, true
|
||||
}
|
||||
if !disableOnDiskQueueAny {
|
||||
return rwctxsGlobal, true
|
||||
}
|
||||
@@ -514,12 +525,6 @@ func getEligibleRemoteWriteCtxs(tss []prompb.TimeSeries, forceDropSamplesOnFailu
|
||||
return nil, false
|
||||
}
|
||||
rowsCount := getRowsCount(tss)
|
||||
if *shardByURL {
|
||||
// Todo: When shardByURL is enabled, the following metrics won't be 100% accurate. Because vmagent don't know
|
||||
// which rwctx should data be pushed to yet. Let's consider the hashing algorithm fair and will distribute
|
||||
// data to all rwctxs evenly.
|
||||
rowsCount = rowsCount / len(rwctxsGlobal)
|
||||
}
|
||||
rwctx.rowsDroppedOnPushFailure.Add(rowsCount)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,8 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
|
||||
* BUGFIX: [stream aggregation](https://docs.victoriametrics.com/victoriametrics/stream-aggregation/): extend delay on aggregation windows flush by the biggest lag among pushed samples. Before, the delay was calculated as 95th percentile across samples, which could underrepresent outliers and reject them from aggregation as "too old". See [#10402](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10402).
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): fix a bug in [cardinality limiters](https://docs.victoriametrics.com/victoriametrics/vmagent/#cardinality-limiter) where series with different labels, like `{a="bc"}` and `{ab="c"}`, could be incorrectly treated as identical and dropped. See [#10937](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10937).
|
||||
* BUGFIX: [vmrestore](https://docs.victoriametrics.com/victoriametrics/vmrestore/): fix a bug where specifying `-storageDataPath` with a trailing slash could cause `vmrestore` to panic. See [#10823](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10823). Thanks to @utafrali for the contribution.
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): prevent unintentional rerouting of samples to other sharding targets when one of the `-remoteWrite.url` targets with `-remoteWrite.disableOnDiskQueue` becomes blocked. Previously this could break the sharding guarantee by sending samples to the wrong targets instead of dropping or retrying them. See [#10507](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10507).
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): exit with an error on startup if `-remoteWrite.disableOnDiskQueue` is not configured uniformly across all `-remoteWrite.url` targets when `-remoteWrite.shardByURL` is enabled. Either all targets must have it enabled or all must have it disabled. See [#10507](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10507).
|
||||
|
||||
## [v1.143.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.143.0)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user