lib/logger: count both printed and suppressed logs at vm_log_messages_total metric

This simplifies troubleshooting by investigating the vm_log_messages_total metric
when logs are unavailable. The logs may be unavailable when the -loggerLevel command-line
flag is set to value other than INFO. The logs may be unavailable when clients
use Monitoring of Monitoring service ( https://victoriametrics.com/products/mom/ ),
which provides metrics, but doesn't provide logs from VictoriaMetrics components
running at the client side.

Add `is_printed` label to the `vm_log_messages_total` metric in order to detect whether
the given log has been suppressed or printed.

See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304

While at it, make more readable the description for the TooManyLogs alert,
which is based on the vm_log_messages_total metric.
Also return back the `level!="info"` instead of `level="error"` filter
in the query for this alerting rule, in order to be consistent with queries
at the official dashboards for VictoriaMetrics components.
TODO: investigate too high warnings rate at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2760
and fix it at the source of these warnings instead of modifying the query
for the TooManyLogs alert.
This commit is contained in:
Aliaksandr Valialkin
2026-01-25 17:43:17 +01:00
parent 446452857c
commit 5632ccc64a
7 changed files with 49 additions and 30 deletions

View File

@@ -627,7 +627,7 @@
"refId": "A"
}
],
"title": "Log errors (30m)",
"title": "Log messages (30m)",
"type": "stat"
},
{
@@ -8413,4 +8413,4 @@
"title": "VictoriaMetrics - vmagent (VM)",
"uid": "G7Z9GzMGz_vm",
"version": 1
}
}

View File

@@ -2573,13 +2573,13 @@
"uid": "$ds"
},
"editorMode": "code",
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Log errors",
"title": "Logging rate",
"type": "timeseries"
},
{

View File

@@ -626,7 +626,7 @@
"refId": "A"
}
],
"title": "Log errors (30m)",
"title": "Log messages (30m)",
"type": "stat"
},
{
@@ -8412,4 +8412,4 @@
"title": "VictoriaMetrics - vmagent",
"uid": "G7Z9GzMGz",
"version": 1
}
}

View File

@@ -2572,13 +2572,13 @@
"uid": "$ds"
},
"editorMode": "code",
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Log errors",
"title": "Logging rate",
"type": "timeseries"
},
{

View File

@@ -71,15 +71,16 @@ groups:
the service could work unreliably with delays in processing.
- alert: TooManyLogs
expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) without (app_version, location, is_printed) > 0
for: 15m
labels:
severity: warning
annotations:
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
summary: "Too many logs are generated for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
description: >
Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.
Worth to check logs for specific error messages.
The job \"{{ $labels.job }}\" ({{ $labels.instance }}) generated {{ $value }} log messages with the level higher than info for the last 5 minutes.
Check the logs for the given target. Check also the \"location\" label at the vm_log_messages_total metric if -loggerLevel command-line flag is set to value other than INFO.
This label contains code locations responsible for generating log messages suppressed by -loggerLevel.
- alert: TooManyTSIDMisses
expr: increase(vm_missing_tsids_for_metric_id_total[5m]) > 0

View File

@@ -26,6 +26,8 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
## tip
* FEATURE: [monitoring](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#monitoring): take into account all the generated log messages at `vm_log_messages_total` metric, including suppressed logs if the `-loggerLevel` command-line flag is set to values other than `INFO`. Add `is_printed` label to the `vm_log_messages_total` metric in order to understand whether the log at the given code `location` was suppressed or not. This simplifies troubleshooting of VictoriaMetrics components when logs aren't available. See [#10304](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304).
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): properly return [/api/v1/status/tsdb](https://docs.victoriametrics.com/victoriametrics/#tsdb-stats) response for time range outside [partition index](https://docs.victoriametrics.com/victoriametrics/#indexdb). See [#10315](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10315).
## [v1.134.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.134.0)

View File

@@ -148,11 +148,38 @@ func logLevel(level, format string, args []any) {
}
func logLevelSkipframes(skipframes int, level, format string, args []any) {
location := getLogLocation(3 + skipframes)
if shouldSkipLog(level) {
// Increment vm_log_messages_total even if the log is suppressed.
// This simplified troubleshooting when logs are suppressed.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304
countLogMessage(level, location, false)
return
}
msg := formatLogMessage(*maxLogArgLen, format, args)
logMessage(level, msg, 3+skipframes)
isPrinted := logMessageInternal(level, msg, location)
countLogMessage(level, location, isPrinted)
}
func countLogMessage(level, location string, isPrinted bool) {
levelLowercase := strings.ToLower(level)
counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q, is_printed="%v"}`, buildinfo.Version, levelLowercase, location, isPrinted)
metrics.GetOrCreateCounter(counterName).Inc()
}
func getLogLocation(skipframes int) string {
_, file, line, ok := runtime.Caller(skipframes)
if !ok {
file = "???"
line = 0
}
if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 {
// Strip /VictoriaMetrics/ prefix
file = file[n+len("/VictoriaMetrics/"):]
}
return fmt.Sprintf("%s:%d", file, line)
}
func formatLogMessage(maxArgLen int, format string, args []any) string {
@@ -235,22 +262,11 @@ func (lw *logWriter) Write(p []byte) (int, error) {
return len(p), nil
}
func logMessage(level, msg string, skipframes int) {
func logMessageInternal(level, msg, location string) bool {
timestamp := ""
if !*disableTimestamps {
timestamp = time.Now().In(timezone).Format("2006-01-02T15:04:05.000Z0700")
}
levelLowercase := strings.ToLower(level)
_, file, line, ok := runtime.Caller(skipframes)
if !ok {
file = "???"
line = 0
}
if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 {
// Strip /VictoriaMetrics/ prefix
file = file[n+len("/VictoriaMetrics/"):]
}
location := fmt.Sprintf("%s:%d", file, line)
// rate limit ERROR and WARN log messages with given limit.
if level == "ERROR" || level == "WARN" {
@@ -260,7 +276,7 @@ func logMessage(level, msg string, skipframes int) {
}
ok, suppressMessage := logLimiter.needSuppress(location, limit)
if ok {
return
return false
}
if len(suppressMessage) > 0 {
msg = suppressMessage + msg
@@ -270,6 +286,8 @@ func logMessage(level, msg string, skipframes int) {
for len(msg) > 0 && msg[len(msg)-1] == '\n' {
msg = msg[:len(msg)-1]
}
levelLowercase := strings.ToLower(level)
var logMsg string
switch *loggerFormat {
case "json":
@@ -302,10 +320,6 @@ func logMessage(level, msg string, skipframes int) {
fmt.Fprint(output, logMsg)
mu.Unlock()
// Increment vm_log_messages_total
counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q}`, buildinfo.Version, levelLowercase, location)
metrics.GetOrCreateCounter(counterName).Inc()
switch level {
case "PANIC":
if *loggerFormat == "json" {
@@ -316,6 +330,8 @@ func logMessage(level, msg string, skipframes int) {
case "FATAL":
os.Exit(-1)
}
return true
}
var mu sync.Mutex