From 72c9e9377ca04cde139a261868c98326dc04e45c Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 9 Apr 2026 17:22:38 +0800 Subject: [PATCH] app/vmalert: expose remotewrite queue_size metrics This commit adds new metrics `vmalert_remotewrite_queue_capacity` and `vmalert_remotewrite_queue_size`. The latter is updated with each push, and its update frequency depends on `-remoteWrite.concurrency` and `-remoteWrite.flushInterval`. It doesn't account for the pending data within each pusher's request, but it should provide a general indication of the queue usage. Related PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10765 --- app/vmalert/remotewrite/client.go | 20 ++- dashboards/vm/vmalert.json | 142 +++++++++++++++++++- dashboards/vmalert.json | 142 +++++++++++++++++++- deployment/docker/rules/alerts-vmalert.yml | 15 +++ docs/victoriametrics/changelog/CHANGELOG.md | 1 + 5 files changed, 303 insertions(+), 17 deletions(-) diff --git a/app/vmalert/remotewrite/client.go b/app/vmalert/remotewrite/client.go index 7d6edb7e58..3f454d6ff8 100644 --- a/app/vmalert/remotewrite/client.go +++ b/app/vmalert/remotewrite/client.go @@ -233,13 +233,18 @@ var ( rwTotal = metrics.NewCounter(`vmalert_remotewrite_total`) // sentRows and sentBytes are historical counters that can now be replaced by flushedRows and flushedBytes histograms. They may be deprecated in the future after the new histograms have been adopted for some time. 
- sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`) - sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`) - flushedRows = metrics.NewHistogram(`vmalert_remotewrite_sent_rows`) - flushedBytes = metrics.NewHistogram(`vmalert_remotewrite_sent_bytes`) - droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`) - sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`) - bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`) + sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`) + sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`) + flushedRows = metrics.NewHistogram(`vmalert_remotewrite_sent_rows`) + flushedBytes = metrics.NewHistogram(`vmalert_remotewrite_sent_bytes`) + droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`) + sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`) + bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`) + remoteWriteQueueSize = metrics.NewHistogram(`vmalert_remotewrite_queue_size`) + + _ = metrics.NewGauge(`vmalert_remotewrite_queue_capacity`, func() float64 { + return float64(*maxQueueSize) + }) _ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 { return float64(*concurrency) @@ -253,6 +258,7 @@ func GetDroppedRows() int { return int(droppedRows.Get()) } // it to remote-write endpoint. Flush performs limited amount of retries // if request fails. 
func (c *Client) flush(ctx context.Context, wr *prompb.WriteRequest) { + remoteWriteQueueSize.Update(float64(len(c.input))) if len(wr.Timeseries) < 1 { return } diff --git a/dashboards/vm/vmalert.json b/dashboards/vm/vmalert.json index d141602036..afc1b5a1f5 100644 --- a/dashboards/vm/vmalert.json +++ b/dashboards/vm/vmalert.json @@ -4001,6 +4001,138 @@ "title": "Datapoints drop rate ($instance)", "type": "timeseries" }, + { + "datasource": { + "type": "victoriametrics-metrics-datasource", + "uid": "$ds" + }, + "description": "Displays the maximum 99th percentile of the number of time series pending in the remote write queue.\n\nThe maximum queue size is configured by remoteWrite.maxQueueSize. \nvmalert will begin dropping data if the queue has no room for newly generated data.\nThe queue can fill rapidly when heavy rules generate millions of series, or when remote write requests are unable to send data to the destination in a timely manner, causing data to accumulate in the queue. Consider tuning -remoteWrite.maxQueueSize or -remoteWrite.concurrency.\n\nSee also the Rows per request panel.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + 
}, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 68, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "victoriametrics-metrics-datasource", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (instance, vmrange))) > 1", + "legendFormat": "current", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "victoriametrics-metrics-datasource", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "min(vmalert_remotewrite_queue_capacity{job=~\"$job\", instance=~\"$instance\"})", + "hide": false, + "instant": false, + "legendFormat": "max", + "range": true, + "refId": "B" + } + ], + "title": "Remote write queue size ($instance)", + "type": "timeseries" + }, { "datasource": { "type": "victoriametrics-metrics-datasource", @@ -4066,10 +4198,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 38 + "x": 0, + "y": 46 }, - "id": 68, + "id": 69, "options": { "legend": { "calcs": [], @@ -4092,7 +4224,7 @@ }, "editorMode": "code", "expr": "max(histogram_quantile(0.99, sum(increase(vmalert_remotewrite_sent_rows_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (instance, vmrange)))", - "legendFormat": "__auto", + "legendFormat": "max", "range": true, "refId": "A" } @@ -4168,7 +4300,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 46 }, "id": 54, diff --git a/dashboards/vmalert.json 
b/dashboards/vmalert.json index 499b1223e0..a2e97de61c 100644 --- a/dashboards/vmalert.json +++ b/dashboards/vmalert.json @@ -4000,6 +4000,138 @@ "title": "Datapoints drop rate ($instance)", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Displays the maximum 99th percentile of the number of time series pending in the remote write queue.\n\nThe maximum queue size is configured by remoteWrite.maxQueueSize. \nvmalert will begin dropping data if the queue has no room for newly generated data.\nThe queue can fill rapidly when heavy rules generate millions of series, or when remote write requests are unable to send data to the destination in a timely manner, causing data to accumulate in the queue. Consider tuning -remoteWrite.maxQueueSize or -remoteWrite.concurrency.\n\nSee also the Rows per request panel.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } 
+ ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 68, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (instance, vmrange))) > 1", + "legendFormat": "current", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "min(vmalert_remotewrite_queue_capacity{job=~\"$job\", instance=~\"$instance\"})", + "hide": false, + "instant": false, + "legendFormat": "max", + "range": true, + "refId": "B" + } + ], + "title": "Remote write queue size ($instance)", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", @@ -4065,10 +4197,10 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 38 + "x": 0, + "y": 46 }, - "id": 68, + "id": 69, "options": { "legend": { "calcs": [], @@ -4091,7 +4223,7 @@ }, "editorMode": "code", "expr": "max(histogram_quantile(0.99, sum(increase(vmalert_remotewrite_sent_rows_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (instance, vmrange)))", - "legendFormat": "__auto", + "legendFormat": "max", "range": true, "refId": "A" } @@ -4167,7 +4299,7 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, + "x": 12, "y": 46 }, "id": 54, diff --git a/deployment/docker/rules/alerts-vmalert.yml b/deployment/docker/rules/alerts-vmalert.yml index eb8fccc6b8..3b3bdb084f 100644 --- a/deployment/docker/rules/alerts-vmalert.yml +++ b/deployment/docker/rules/alerts-vmalert.yml @@ -85,6 +85,20 @@ groups: to the configured remote write URL. This may result into gaps in recording rules or alerts state. 
Check vmalert's logs for detailed error message." + - alert: RemoteWriteQueueHighUsage + expr: histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket[5m])) by (job, instance, vmrange)) / vmalert_remotewrite_queue_capacity > 0.8 + for: 15m + labels: + severity: warning + annotations: + summary: "Remote write queue capacity on the vmalert instance {{ $labels.instance }} has exceeded 80% utilization" + description: "The remote write queue on vmalert instance {{ $labels.instance }} has consistently high utilization. + The queue acts as a buffer between rules generating series and remote-write client consuming and pushing these series. When queue overflows, vmalert will start dropping newly generated series. + Queue may overflow due to multiple reasons: + 1. Some bad rules produce too many series at once. This can be limited using the global `-rule.resultsLimit` flag or `limit` param at the rule group level. + 2. Remote write connection is slow. Increase `-remoteWrite.concurrency`, so vmalert could establish more concurrent connections. + 3. The queue size is too small. Increase `-remoteWrite.maxQueueSize` to extend the buffer size. Note that a larger queue will result in higher memory consumption when the queue is full." + - alert: AlertmanagerErrors expr: increase(vmalert_alerts_send_errors_total[5m]) > 0 for: 15m @@ -94,3 +108,4 @@ groups: summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager" description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\". Check vmalert's logs for detailed error message." 
+ diff --git a/docs/victoriametrics/changelog/CHANGELOG.md b/docs/victoriametrics/changelog/CHANGELOG.md index 447143cf49..7d92a044d5 100644 --- a/docs/victoriametrics/changelog/CHANGELOG.md +++ b/docs/victoriametrics/changelog/CHANGELOG.md @@ -37,6 +37,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel * FEATURE: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): allow setting `-1` value for `-remoteWrite.maxHourlySeries` and `-remoteWrite.maxDailySeries` command-line flags. This automatically sets limits to the highest possible value in order to enable tracking without enforcing any limits. This is helpful for estimating current usage before applying real limits. See [#9614](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/9614). * FEATURE: `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): optimize vminsert buffer size per vmstorage node based on available CPU, memory and storage node count to reduce OOM risk. See [#10725](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10725). * FEATURE: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): allow setting `-1` value for `-storage.maxHourlySeries` and `-storage.maxDailySeries` command-line flags. This automatically sets limits to the highest possible value in order to enable tracking without enforcing any limits. This is helpful for estimating current usage before applying real limits. See [#9614](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/9614). +* FEATURE: [vmalert](https://docs.victoriametrics.com/victoriametrics/vmalert/): expose `vmalert_remotewrite_queue_size` and `vmalert_remotewrite_queue_capacity` to facilitate monitoring of remote write queue usage. 
See [#10765](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10765). * BUGFIX: [vmbackup](https://docs.victoriametrics.com/vmbackup/), [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): retry the requests that failed with unexpected EOF due to unstable network to S3 service. See [#10699](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10699). * BUGFIX: All VictoriaMetrics components: Fix an issue where `unsupported` metric metadata type was exposed for summaries and quantiles if a summary wasn't updated within a certain time window. See [metrics#120](https://github.com/VictoriaMetrics/metrics/issues/120) and [metrics#121](https://github.com/VictoriaMetrics/metrics/pull/121).