diff --git a/dashboards/vm/vmagent.json b/dashboards/vm/vmagent.json index 666e6d7c7b..5198792f48 100644 --- a/dashboards/vm/vmagent.json +++ b/dashboards/vm/vmagent.json @@ -627,7 +627,7 @@ "refId": "A" } ], - "title": "Log errors (30m)", + "title": "Log messages (30m)", "type": "stat" }, { @@ -8413,4 +8413,4 @@ "title": "VictoriaMetrics - vmagent (VM)", "uid": "G7Z9GzMGz_vm", "version": 1 -} \ No newline at end of file +} diff --git a/dashboards/vm/vmauth.json b/dashboards/vm/vmauth.json index 1bb22665fd..b9edb6146b 100644 --- a/dashboards/vm/vmauth.json +++ b/dashboards/vm/vmauth.json @@ -2573,13 +2573,13 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)", + "expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Log errors", + "title": "Logging rate", "type": "timeseries" }, { diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index d91afbd08a..c33d4bfdad 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -626,7 +626,7 @@ "refId": "A" } ], - "title": "Log errors (30m)", + "title": "Log messages (30m)", "type": "stat" }, { @@ -8412,4 +8412,4 @@ "title": "VictoriaMetrics - vmagent", "uid": "G7Z9GzMGz", "version": 1 -} \ No newline at end of file +} diff --git a/dashboards/vmauth.json b/dashboards/vmauth.json index ca9595ce96..9cdee69ef6 100644 --- a/dashboards/vmauth.json +++ b/dashboards/vmauth.json @@ -2572,13 +2572,13 @@ "uid": "$ds" }, "editorMode": "code", - "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)", + "expr": "sum(rate(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Log errors", + "title": "Logging rate", "type": "timeseries" }, { diff --git a/deployment/docker/rules/alerts-health.yml b/deployment/docker/rules/alerts-health.yml index 2788404ff2..25edfb9eb0 100644 --- a/deployment/docker/rules/alerts-health.yml +++ b/deployment/docker/rules/alerts-health.yml @@ -71,15 +71,16 @@ groups: the service could work unreliably with delays in processing. - alert: TooManyLogs - expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 + expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) without (app_version, location, is_printed) > 0 for: 15m labels: severity: warning annotations: - summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})" + summary: "Too many logs are generated for job \"{{ $labels.job }}\" ({{ $labels.instance }})" description: > - Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m. - Worth to check logs for specific error messages. + The job \"{{ $labels.job }}\" ({{ $labels.instance }}) generated {{ $value }} log messages with the level higher than info for the last 5 minutes. + Check the logs for the given target. Check also the \"location\" label at the vm_log_messages_total metric if -loggerLevel command-line flag is set to value other than INFO. + This label contains code locations responsible for generating log messages suppressed by -loggerLevel. - alert: TooManyTSIDMisses expr: increase(vm_missing_tsids_for_metric_id_total[5m]) > 0 diff --git a/docs/victoriametrics/changelog/CHANGELOG.md b/docs/victoriametrics/changelog/CHANGELOG.md index e4e6e1e17f..166b3a2e2b 100644 --- a/docs/victoriametrics/changelog/CHANGELOG.md +++ b/docs/victoriametrics/changelog/CHANGELOG.md @@ -26,6 +26,8 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel ## tip +* FEATURE: [monitoring](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#monitoring): take into account all the generated log messages at `vm_log_messages_total` metric, including suppressed logs if the `-loggerLevel` command-line flag is set to values other than `INFO`. Add `is_printed` label to the `vm_log_messages_total` metric in order to understand whether the log at the given code `location` was suppressed or not. This simplifies troubleshooting of VictoriaMetrics components when logs aren't available. See [#10304](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304). + * BUGFIX: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): properly return [/api/v1/status/tsdb](https://docs.victoriametrics.com/victoriametrics/#tsdb-stats) response for time range outside [partition index](https://docs.victoriametrics.com/victoriametrics/#indexdb). See [#10315](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10315). ## [v1.134.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.134.0) diff --git a/lib/logger/logger.go b/lib/logger/logger.go index 08cb0ab684..4a384256ef 100644 --- a/lib/logger/logger.go +++ b/lib/logger/logger.go @@ -148,11 +148,38 @@ func logLevel(level, format string, args []any) { } func logLevelSkipframes(skipframes int, level, format string, args []any) { + location := getLogLocation(3 + skipframes) + if shouldSkipLog(level) { + // Increment vm_log_messages_total even if the log is suppressed. + // This simplified troubleshooting when logs are suppressed. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10304 + countLogMessage(level, location, false) return } + msg := formatLogMessage(*maxLogArgLen, format, args) - logMessage(level, msg, 3+skipframes) + isPrinted := logMessageInternal(level, msg, location) + countLogMessage(level, location, isPrinted) +} + +func countLogMessage(level, location string, isPrinted bool) { + levelLowercase := strings.ToLower(level) + counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q, is_printed="%v"}`, buildinfo.Version, levelLowercase, location, isPrinted) + metrics.GetOrCreateCounter(counterName).Inc() +} + +func getLogLocation(skipframes int) string { + _, file, line, ok := runtime.Caller(skipframes) + if !ok { + file = "???" + line = 0 + } + if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 { + // Strip /VictoriaMetrics/ prefix + file = file[n+len("/VictoriaMetrics/"):] + } + return fmt.Sprintf("%s:%d", file, line) } func formatLogMessage(maxArgLen int, format string, args []any) string { @@ -235,22 +262,11 @@ func (lw *logWriter) Write(p []byte) (int, error) { return len(p), nil } -func logMessage(level, msg string, skipframes int) { +func logMessageInternal(level, msg, location string) bool { timestamp := "" if !*disableTimestamps { timestamp = time.Now().In(timezone).Format("2006-01-02T15:04:05.000Z0700") } - levelLowercase := strings.ToLower(level) - _, file, line, ok := runtime.Caller(skipframes) - if !ok { - file = "???" - line = 0 - } - if n := strings.Index(file, "/VictoriaMetrics/"); n >= 0 { - // Strip /VictoriaMetrics/ prefix - file = file[n+len("/VictoriaMetrics/"):] - } - location := fmt.Sprintf("%s:%d", file, line) // rate limit ERROR and WARN log messages with given limit. if level == "ERROR" || level == "WARN" { @@ -260,7 +276,7 @@ func logMessage(level, msg string, skipframes int) { } ok, suppressMessage := logLimiter.needSuppress(location, limit) if ok { - return + return false } if len(suppressMessage) > 0 { msg = suppressMessage + msg @@ -270,6 +286,8 @@ func logMessage(level, msg string, skipframes int) { for len(msg) > 0 && msg[len(msg)-1] == '\n' { msg = msg[:len(msg)-1] } + + levelLowercase := strings.ToLower(level) var logMsg string switch *loggerFormat { case "json": @@ -302,10 +320,6 @@ func logMessage(level, msg string, skipframes int) { fmt.Fprint(output, logMsg) mu.Unlock() - // Increment vm_log_messages_total - counterName := fmt.Sprintf(`vm_log_messages_total{app_version=%q, level=%q, location=%q}`, buildinfo.Version, levelLowercase, location) - metrics.GetOrCreateCounter(counterName).Inc() - switch level { case "PANIC": if *loggerFormat == "json" { @@ -316,6 +330,8 @@ func logMessage(level, msg string, skipframes int) { case "FATAL": os.Exit(-1) } + + return true } var mu sync.Mutex