VictoriaMetrics/deployment/docker/rules/alerts-vmalert.yml
Hui Wang 72c9e9377c app/vmalert: expose remotewrite queue_size metrics
This commit adds new metrics `vmalert_remotewrite_queue_capacity` and
`vmalert_remotewrite_queue_size`. The latter is updated on each push, so its
update frequency depends on `-remoteWrite.concurrency` and
`-remoteWrite.flushInterval`.

It doesn't account for the pending data within each pusher's request, but it
should provide a general indication of the queue usage.

Related PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10765
2026-04-09 11:22:38 +02:00


# This file contains the default list of alerts for the vmalert service.
# The alerts below are recommendations only and may require updates
# and threshold calibration for each specific setup.
groups:
# Alerts group for vmalert assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/14950 is installed.
# Please update the `dashboard` annotation according to your setup.
- name: vmalert
interval: 30s
rules:
- alert: ConfigurationReloadFailure
expr: vmalert_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmalert instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
Check vmalert's logs for detailed error message."
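# A minimal sketch of triggering the hot-reload manually, assuming vmalert
# listens on the default -httpListenAddr :8880 (adjust host/port to your setup):
#   kill -HUP $(pidof vmalert)
#   # or via HTTP:
#   curl http://vmalert:8880/-/reload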
- alert: AlertingRulesError
expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(id) > 0
for: 5m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}"
description: "Alerting rules execution is failing for \"{{ $labels.alertname }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
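# Rule files can be validated before deployment to catch such errors early.
# A sketch, assuming rules are mounted under /etc/vmalert/rules (the path is illustrative):
#   vmalert -rule=/etc/vmalert/rules/*.yml -dryRun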
- alert: RecordingRulesError
expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(id) > 0
for: 5m
labels:
severity: warning
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}"
description: "Recording rules execution is failing for \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
Check vmalert's logs for detailed error message."
- alert: RecordingRulesNoData
expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1
for: 30m
labels:
severity: info
annotations:
dashboard: "{{ $externalURL }}/d/LzldHAVnz?viewPanel=33&var-file={{ $labels.file }}&var-group={{ $labels.group }}"
summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data"
description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\ in file \"{{ $labels.file }}\"
produces 0 samples over the last 30min. It might be caused by a misconfiguration
or incorrect query expression."
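# An illustrative misconfiguration that would trigger this alert: a recording
# rule whose selector matches no series (the job name below is hypothetical):
#   - record: instance:up:avg
#     expr: avg(up{job="no-such-job"}) by (instance)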
- alert: TooManyMissedIterations
expr: increase(vmalert_iteration_missed_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations"
description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\".
The group evaluation takes longer than the configured evaluation interval. This may result in missed
alerting notifications or recording rules samples. Try increasing the evaluation interval or the concurrency of
group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/victoriametrics/vmalert/#groups.
If rule expressions are taking longer than expected, please see https://docs.victoriametrics.com/victoriametrics/troubleshooting/#slow-queries."
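# A sketch of group-level settings that can reduce missed iterations
# (the group name and values are illustrative and need calibration per setup):
#   - name: example-group
#     interval: 60s    # a longer interval leaves more headroom for slow queries
#     concurrency: 4   # evaluate up to 4 rules within the group in parallel
#     rules: [...]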
- alert: RemoteWriteErrors
expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL"
description: "vmalert instance {{ $labels.instance }} is failing to push metrics generated via alerting
or recording rules to the configured remote write URL. Check vmalert's logs for detailed error message."
- alert: RemoteWriteDroppingData
expr: increase(vmalert_remotewrite_dropped_rows_total[5m]) > 0
for: 5m
labels:
severity: critical
annotations:
summary: "vmalert instance {{ $labels.instance }} is dropping data sent to remote write URL"
description: "vmalert instance {{ $labels.instance }} is failing to send results of alerting or recording rules
to the configured remote write URL. This may result in gaps in recording rules or alerts state.
Check vmalert's logs for detailed error message."
- alert: RemoteWriteQueueHighUsage
expr: histogram_quantile(0.99, sum(increase(vmalert_remotewrite_queue_size_bucket[5m])) by (job, instance, vmrange)) / vmalert_remotewrite_queue_capacity > 0.8
for: 15m
labels:
severity: warning
annotations:
summary: "Remote write queue capacity on the vmalert instance {{ $labels.instance }} has exceeded 80% utilization"
description: "The remote write queue on vmalert instance {{ $labels.instance }} has consistently high utilization.
The queue acts as a buffer between rules generating series and remote-write client consuming and pushing these series. When queue overflows, vmalert will start dropping newly generated series.
Queue may overflow due to multiple reasons:
1. Some bad rules produce too many series at once. This can be limited using the global `-rule.resultsLimit` flag or `limit` param at the rule group level.
2. Remote write connection is slow. Increase `-remoteWrite.concurrency`, so vmalert could establish more concurrent connections.
3. The queue size is too small. Increase `-remoteWrite.maxQueueSize` to extend the buffer size. Note that a larger queue will result in higher memory consumption when the queue is full."
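# A sketch of the flags mentioned in the description above
# (the URL and values are illustrative, not recommendations):
#   vmalert -remoteWrite.url=http://victoriametrics:8428 \
#     -remoteWrite.concurrency=4 \
#     -remoteWrite.maxQueueSize=500000 \
#     -rule.resultsLimit=10000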
- alert: AlertmanagerErrors
expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager"
description: "vmalert instance {{ $labels.instance }} is failing to send alert notifications to \"{{ $labels.addr }}\".
Check vmalert's logs for detailed error message."
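# Alertmanager addresses are configured via -notifier.url; a sketch, assuming
# Alertmanager runs on its default port 9093:
#   vmalert -notifier.url=http://alertmanager:9093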