deployment/docker/rules: add VMSelectConcurrentQueriesExceedMemoryLimit alert

Warn users when the cluster is misconfigured to allow more concurrent selects than the available memory can serve.
2026-07-01 22:53:50 +03:00 · 2026-07-01 12:53:03 +02:00
7 changed files with 35 additions and 33 deletions
--- a/app/vmauth/auth_config.go
+++ b/app/vmauth/auth_config.go
@@ -13,6 +13,7 @@ import (
 	"net/url"
 	"os"
 	"regexp"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -28,6 +29,7 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs/fscore"
+	"github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
@@ -57,11 +59,6 @@ type AuthConfig struct {
 	Users            []UserInfo `yaml:"users,omitempty"`
 	UnauthorizedUser *UserInfo  `yaml:"unauthorized_user,omitempty"`

-	// UnauthorizedAccessLog defines access log settings for requests with missing or invalid auth tokens.
-	// When unauthorized_user is configured, its access_log setting takes precedence over this one.
-	// It is useful for detecting brute-force attacks by logging the remote_addr of rejected requests.
-	UnauthorizedAccessLog *AccessLog `yaml:"unauthorized_access_log,omitempty"`
-
 	// ms holds all the metrics for the given AuthConfig
 	ms *metrics.Set
 }
@@ -121,7 +118,20 @@ type AccessLogFilters struct {
 }

 func (ui *UserInfo) logRequest(r *http.Request, userName string, statusCode int, duration time.Duration) {
-	logRequest(ui.AccessLog, r, userName, statusCode, duration)
+	if ui.AccessLog == nil {
+		return
+	}
+	filters := ui.AccessLog.Filters
+	if filters != nil && len(filters.SkipStatusCodes) > 0 {
+		if slices.Contains(filters.SkipStatusCodes, statusCode) {
+			return
+		}
+	}
+
+	remoteAddr := httpserver.GetQuotedRemoteAddr(r)
+	requestURI := httpserver.GetRequestURI(r)
+	logger.Infof("access_log request_host=%q request_uri=%q status_code=%d remote_addr=%s user_agent=%q referer=%q duration_ms=%d username=%q",
+		r.Host, requestURI, statusCode, remoteAddr, r.UserAgent(), r.Referer(), duration.Milliseconds(), userName)
 }

 // HeadersConf represents config for request and response headers.
--- a/app/vmauth/main.go
+++ b/app/vmauth/main.go
@@ -180,7 +180,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
 			return true
 		}

-		logRequest(authConfig.Load().UnauthorizedAccessLog, r, ``, http.StatusUnauthorized, 0)
 		handleMissingAuthorizationError(w)
 		return true
 	}
@@ -194,7 +193,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
 			logger.Panicf("BUG: unexpected nil jwt token for user %q", ui.name())
 		}
 		if !tkn.HasVMAccessClaim() && ui.JWT.DefaultVMAccessClaim == nil {
-			logRequest(authConfig.Load().UnauthorizedAccessLog, r, ``, http.StatusUnauthorized, 0)
 			http.Error(w, "Unauthorized", http.StatusUnauthorized)
 			return true
 		}
@@ -211,7 +209,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {

 	invalidAuthTokenRequests.Inc()
 	slowdownUnauthorizedResponse(r)
-	logRequest(authConfig.Load().UnauthorizedAccessLog, r, ``, http.StatusUnauthorized, 0)
 	if *logInvalidAuthTokens {
 		err := fmt.Errorf("cannot authorize request with auth tokens %q", ats)
 		err = &httpserver.ErrorWithStatusCode{
@@ -225,23 +222,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
 	return true
 }

-func logRequest(ac *AccessLog, r *http.Request, userName string, statusCode int, duration time.Duration) {
-	if ac == nil {
-		return
-	}
-	filters := ac.Filters
-	if filters != nil && len(filters.SkipStatusCodes) > 0 {
-		if slices.Contains(filters.SkipStatusCodes, statusCode) {
-			return
-		}
-	}
-
-	remoteAddr := httpserver.GetQuotedRemoteAddr(r)
-	requestURI := httpserver.GetRequestURI(r)
-	logger.Infof("access_log request_host=%q request_uri=%q status_code=%d remote_addr=%s user_agent=%q referer=%q duration_ms=%d username=%q",
-		r.Host, requestURI, statusCode, remoteAddr, r.UserAgent(), r.Referer(), duration.Milliseconds(), userName)
-}
-
 func getUserInfoByAuthTokens(ats []string) *UserInfo {
 	ac := *authUsers.Load()
 	for _, at := range ats {
--- a/app/vmselect/promql/eval.go
+++ b/app/vmselect/promql/eval.go
@@ -1687,6 +1687,10 @@ func assertInstantValues(tss []*timeseries) {

 var memoryIntensiveQueries = metrics.NewCounter(`vm_memory_intensive_queries_total`)

+var _ = metrics.NewGauge(`vm_max_memory_per_query`, func() float64 {
+	return float64(maxMemoryPerQuery.N)
+})
+
 func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcName string, rf rollupFunc,
 	expr metricsql.Expr, me *metricsql.MetricExpr, iafc *incrementalAggrFuncContext, windowExpr *metricsql.DurationExpr,
 ) ([]*timeseries, error) {
--- a/auth.yaml
+++ b/auth.yaml
--- a/deployment/docker/rules/alerts-cluster.yml
+++ b/deployment/docker/rules/alerts-cluster.yml
@@ -223,4 +223,16 @@ groups:
            Unexpected TSID misses for \"{{ $labels.job }}\" ({{ $labels.instance }}) for the last 15 minutes.
            If this happens after unclean shutdown of VictoriaMetrics process (via \"kill -9\", OOM or power off),
            then this is OK - the alert must go away in a few minutes after the restart.
-            Otherwise this may point to the corruption of index data.
+            Otherwise this may point to the corruption of index data.
+
+      - alert: VMSelectConcurrentQueriesExceedMemoryLimit
+        expr: (vm_max_memory_per_query * on(job, instance) vm_concurrent_select_capacity) > on(job, instance) vm_available_memory_bytes
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "vmselect ({{ $labels.instance }}) concurrent query memory may exceed pod limit"
+          description: "Configured concurrent query capacity ({{ $value | humanize1024 }} combined max memory) exceeds
+            the available memory on instance {{ $labels.instance }}.
+            This may result in OOM kills. Consider reducing -search.maxConcurrentRequests,
+            lowering -search.maxMemoryPerQuery, or scaling up pod memory limits."
--- a/docs/victoriametrics/changelog/CHANGELOG.md
+++ b/docs/victoriametrics/changelog/CHANGELOG.md
@@ -28,11 +28,12 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel

 * SECURITY: upgrade base docker image (Alpine) from 3.23.4 to 3.24.1. See [Alpine 3.24.1 release notes](https://www.alpinelinux.org/posts/Alpine-3.24.1-released.html).

+* FEATURE: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): expose `vm_max_memory_per_query` metric, which reflects the `-search.maxMemoryPerQuery` limit. Add `VMSelectConcurrentQueriesExceedMemoryLimit` alert, which warns when OOMs are possible because `-search.maxMemoryPerQuery` multiplied by the max number of concurrent queries (`-search.maxConcurrentRequests`) exceeds the available memory.
+
 * FEATURE: [vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/): add `default_vm_access_claim` field into `jwt` section of auth config. It could be used at [JWT claim placeholders](https://docs.victoriametrics.com/victoriametrics/vmauth/#jwt-claim-based-request-templating), if `JWT` token doesn't have `vm_access` claim. See [#11054](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11054).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/): reduces CPU usage by 10% at [sharding among remote storages](https://docs.victoriametrics.com/victoriametrics/vmagent/#sharding-among-remote-storages). See [#11113](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11113). Thanks to @bennf for contribution.
 * FEATURE: [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) and `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): add `optimize_repeated_binary_op_subexprs=1` query arg to [/api/v1/query_range](https://docs.victoriametrics.com/victoriametrics/keyconcepts/#range-query) for executing binary operator sides sequentially when they share the same optimized aggregate rollup result expression. This allows the second side to reuse rollup result cache populated by the first side. See [#10575](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10575).
 * FEATURE: [vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/): prevent possible password brute-force attacks with an artificial 2-3 second delay as recommended by [OWASP](https://owasp.org/Top10/2025/A07_2025-Authentication_Failures). See [#11180](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11180).
-* FEATURE: [vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/): add `unauthorized_access_log` top-level config option for logging requests with missing or invalid auth tokens. This is useful for identifying IPs performing brute-force attacks. See [#11180](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/11180).

 * BUGFIX: all VictoriaMetrics components: cancel in-flight HTTP requests shortly before `-http.maxGracefulShutdownDuration` elapses during graceful shutdown, so they can drain and the shutdown completes cleanly within that window instead of timing out and exiting via `logger.Fatalf` -> `os.Exit`. This prevents skipping the storage flush and losing in-memory data when long-lived requests are in flight (such as VictoriaLogs live tailing). See [#1502](https://github.com/VictoriaMetrics/VictoriaLogs/issues/1502).
 * BUGFIX: `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): fixes unexpected rare rerouting. See [#11162](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/11162).
--- a/docs/victoriametrics/vmauth.md
+++ b/docs/victoriametrics/vmauth.md
@@ -1329,11 +1329,6 @@ unauthorized_user:
  access_log: {}
 ```

-If you do not have `unauthorized_user` but want to log requests with missing or invalid auth tokens, use the top-level `unauthorized_access_log` option{{% available_from "#" %}}:
-```yaml
-unauthorized_access_log: {}
-```
-
 Access logs contain limited information to prevent exposing sensitive data. See an example of the printed access log below:
 ```bash
 2026-02-26T15:00:00.207Z        info    VictoriaMetrics/app/vmauth/auth_config.go:134   access_log request_host="localhost:8427" request_uri="/prometheus/api/v1/query_range?query=1&start=1772116199.897&end=1772117999.897&step=5s" status_code=200 remote_addr="127.0.0.1:63425" user_agent="Mozilla/5.0..." referer="http://localhost:8427/vmui/?" duration_ms=8 username="unauthorized"