mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-17 00:26:36 +03:00
lib/logger: count both printed and suppressed logs at vm_log_messages_total metric
This simplifies troubleshooting by investigating the vm_log_messages_total metric when logs are unavailable. The logs may be unavailable when the -loggerLevel command-line flag is set to value other than INFO. The logs may be unavailable when clients use Monitoring of Monitoring service ( https://victoriametrics.com/products/mom/ ), which provides metrics, but doesn't provide logs from VictoriaMetrics components running at the client side. Add `is_printed` label to the `vm_log_messages_total` metric in order to detect whether the given log has been suppressed or printed. See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304 While at it, make more readable the description for the TooManyLogs alert, which is based on the vm_log_messages_total metric. Also return back the `level!="info"` instead of `level="error"` filter in the query for this alerting rule, in order to be consistent with queries at the official dashboards for VictoriaMetrics components. TODO: investigate too high warnings rate at https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2760 and fix it at the source of these warnings instead of modifying the query for the TooManyLogs alert.
This commit is contained in:
@@ -627,7 +627,7 @@
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Log errors (30m)",
|
||||
"title": "Log messages (30m)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
@@ -8413,4 +8413,4 @@
|
||||
"title": "VictoriaMetrics - vmagent (VM)",
|
||||
"uid": "G7Z9GzMGz_vm",
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2573,13 +2573,13 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
|
||||
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Log errors",
|
||||
"title": "Logging rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -626,7 +626,7 @@
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Log errors (30m)",
|
||||
"title": "Log messages (30m)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
@@ -8412,4 +8412,4 @@
|
||||
"title": "VictoriaMetrics - vmagent",
|
||||
"uid": "G7Z9GzMGz",
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2572,13 +2572,13 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
|
||||
"expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)",
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Log errors",
|
||||
"title": "Logging rate",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -71,15 +71,16 @@ groups:
|
||||
the service could work unreliably with delays in processing.
|
||||
|
||||
- alert: TooManyLogs
|
||||
expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
|
||||
expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) without (app_version, location, is_printed) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
|
||||
summary: "Too many logs are generated for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
|
||||
description: >
|
||||
Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.
|
||||
Worth to check logs for specific error messages.
|
||||
The job \"{{ $labels.job }}\" ({{ $labels.instance }}) generated {{ $value }} log messages with the level higher than info for the last 5 minutes.
|
||||
Check the logs for the given target. Check also the \"location\" label at the vm_log_messages_total metric if -loggerLevel command-line flag is set to value other than INFO.
|
||||
This label contains code locations responsible for generating log messages suppressed by -loggerLevel.
|
||||
|
||||
- alert: TooManyTSIDMisses
|
||||
expr: increase(vm_missing_tsids_for_metric_id_total[5m]) > 0
|
||||
|
||||
@@ -26,6 +26,8 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
|
||||
|
||||
## tip
|
||||
|
||||
* FEATURE: [monitoring](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#monitoring): take into account all the generated log messages at `vm_log_messages_total` metric, including suppressed logs if the `-loggerLevel` command-line flag is set to values other than `INFO`. Add `is_printed` label to the `vm_log_messages_total` metric in order to understand whether the log at the given code `location` was suppressed or not. This simplifies troubleshooting of VictoriaMetrics components when logs aren't available. See [#10304](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304).
|
||||
|
||||
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): properly return [/api/v1/status/tsdb](https://docs.victoriametrics.com/victoriametrics/#tsdb-stats) response for time range outside [partition index](https://docs.victoriametrics.com/victoriametrics/#indexdb). See [#10315](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10315).
|
||||
|
||||
## [v1.134.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.134.0)
|
||||
|
||||
@@ -148,11 +148,38 @@ func logLevel(level, format string, args []any) {
|
||||
}
|
||||
|
||||
func logLevelSkipframes(skipframes int, level, format string, args []any) {
|
||||
location := getLogLocation(3 + skipframes)
|
||||
|
||||
if shouldSkipLog(level) {
|
||||
// Increment vm_log_messages_total even if the log is suppressed.
|
||||
// This simplified troubleshooting when logs are suppressed.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304
|
||||
countLogMessage(level, location, false)
|
||||
return
|
||||
}
|
||||
|
||||
msg := formatLogMessage(*maxLogArgLen, format, args)
|
||||
logMessage(level, msg, 3+skipframes)
|
||||
isPrinted := logMessageInternal(level, msg, location)
|
||||
countLogMessage(level, location, isPrinted)
|
||||
}
|
||||
|
||||
func countLogMessage(level, location string, isPrinted bool) {
|
||||
levelLowercase := strings.ToLower(level)
|
||||
counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q, is_printed="%v"}`, buildinfo.Version, levelLowercase, location, isPrinted)
|
||||
metrics.GetOrCreateCounter(counterName).Inc()
|
||||
}
|
||||
|
||||
func getLogLocation(skipframes int) string {
|
||||
_, file, line, ok := runtime.Caller(skipframes)
|
||||
if !ok {
|
||||
file = "???"
|
||||
line = 0
|
||||
}
|
||||
if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 {
|
||||
// Strip /VictoriaMetrics/ prefix
|
||||
file = file[n+len("/VictoriaMetrics/"):]
|
||||
}
|
||||
return fmt.Sprintf("%s:%d", file, line)
|
||||
}
|
||||
|
||||
func formatLogMessage(maxArgLen int, format string, args []any) string {
|
||||
@@ -235,22 +262,11 @@ func (lw *logWriter) Write(p []byte) (int, error) {
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
func logMessage(level, msg string, skipframes int) {
|
||||
func logMessageInternal(level, msg, location string) bool {
|
||||
timestamp := ""
|
||||
if !*disableTimestamps {
|
||||
timestamp = time.Now().In(timezone).Format("2006-01-02T15:04:05.000Z0700")
|
||||
}
|
||||
levelLowercase := strings.ToLower(level)
|
||||
_, file, line, ok := runtime.Caller(skipframes)
|
||||
if !ok {
|
||||
file = "???"
|
||||
line = 0
|
||||
}
|
||||
if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 {
|
||||
// Strip /VictoriaMetrics/ prefix
|
||||
file = file[n+len("/VictoriaMetrics/"):]
|
||||
}
|
||||
location := fmt.Sprintf("%s:%d", file, line)
|
||||
|
||||
// rate limit ERROR and WARN log messages with given limit.
|
||||
if level == "ERROR" || level == "WARN" {
|
||||
@@ -260,7 +276,7 @@ func logMessage(level, msg string, skipframes int) {
|
||||
}
|
||||
ok, suppressMessage := logLimiter.needSuppress(location, limit)
|
||||
if ok {
|
||||
return
|
||||
return false
|
||||
}
|
||||
if len(suppressMessage) > 0 {
|
||||
msg = suppressMessage + msg
|
||||
@@ -270,6 +286,8 @@ func logMessage(level, msg string, skipframes int) {
|
||||
for len(msg) > 0 && msg[len(msg)-1] == '\n' {
|
||||
msg = msg[:len(msg)-1]
|
||||
}
|
||||
|
||||
levelLowercase := strings.ToLower(level)
|
||||
var logMsg string
|
||||
switch *loggerFormat {
|
||||
case "json":
|
||||
@@ -302,10 +320,6 @@ func logMessage(level, msg string, skipframes int) {
|
||||
fmt.Fprint(output, logMsg)
|
||||
mu.Unlock()
|
||||
|
||||
// Increment vm_log_messages_total
|
||||
counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q}`, buildinfo.Version, levelLowercase, location)
|
||||
metrics.GetOrCreateCounter(counterName).Inc()
|
||||
|
||||
switch level {
|
||||
case "PANIC":
|
||||
if *loggerFormat == "json" {
|
||||
@@ -316,6 +330,8 @@ func logMessage(level, msg string, skipframes int) {
|
||||
case "FATAL":
|
||||
os.Exit(-1)
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
var mu sync.Mutex
|
||||
|
||||
Reference in New Issue
Block a user