app/vmauth: pick first backend to process request when all backends are unavailable (#10886)

The commit restores the previous behavior where the first backend is still selected and the request is sent to it. This behavior existed before commit 9c36f0931a, but was later changed to return no backends. Hence, vmauth would reject all requests for the next 3s if all backends are unavailable. In some rare cases, it leads to an increase in error responses. 

The commit restores the original behavior, adds comments explaining why it is important, and introduces tests covering the logic.

Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10837
PR https://github.com/VictoriaMetrics/VictoriaMetrics/pull/10886

---------

Signed-off-by: JAYICE <1185430411@qq.com>
Signed-off-by: Max Kotliar <kotlyar.maksim@gmail.com>
Co-authored-by: Max Kotliar <mkotlyar@victoriametrics.com>
Co-authored-by: Hui Wang <haley@victoriametrics.com>
This commit is contained in:
JAYICE
2026-05-08 01:44:50 +08:00
committed by GitHub
parent 76e0bcdf45
commit 0c7928b0ff
4 changed files with 47 additions and 11 deletions

View File

@@ -610,6 +610,7 @@ func areEqualBackendURLs(a, b []*backendURL) bool {
}
// getFirstAvailableBackendURL returns the first available backendURL, which isn't broken.
// If all backendURLs are broken, then returns the first backendURL.
//
// backendURL.put() must be called on the returned backendURL after the request is complete.
func getFirstAvailableBackendURL(bus []*backendURL) *backendURL {
@@ -628,21 +629,22 @@ func getFirstAvailableBackendURL(bus []*backendURL) *backendURL {
return bu
}
}
return nil
// All backend urls are unavailable, then returning a first one, it could help increase the success rate of the requests。
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10837#issuecomment-4307050980.
bu.get()
return bu
}
// getLeastLoadedBackendURL returns a non-broken backendURL with the lowest number of concurrent requests.
// If all backendURLs are broken, then returns the first backendURL.
//
// backendURL.put() must be called on the returned backendURL after the request is complete.
func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) *backendURL {
firstBu := bus[0]
if len(bus) == 1 {
// Fast path - return the only backend url.
bu := bus[0]
if bu.isBroken() {
return nil
}
bu.get()
return bu
firstBu.get()
return firstBu
}
// Slow path - select other backend urls.
@@ -680,7 +682,10 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) *
}
buMin := bus[buMinIdx]
if buMin.isBroken() {
return nil
// If all backendURLs are broken, then returns the first backendURL.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10837#issuecomment-4307050980.
firstBu.get()
return firstBu
}
buMin.get()
atomicCounter.CompareAndSwap(n+1, buMinIdx+1)

View File

@@ -1031,6 +1031,33 @@ func TestLogRequest(t *testing.T) {
f("foo", 404, 10*time.Millisecond, `access_log request_host="localhost:8080" request_uri="" status_code=404 remote_addr="" user_agent="" referer="" duration_ms=10 username="foo"`)
}
func TestGetFirstAvailableBackend(t *testing.T) {
f := func(broken []bool, expectedIdx int) {
t.Helper()
bus := make([]*backendURL, len(broken))
for i := range broken {
bus[i] = &backendURL{
url: &url.URL{Host: fmt.Sprintf("server-%d", i)},
}
bus[i].broken.Store(broken[i])
}
bu := getFirstAvailableBackendURL(bus)
if bu == nil {
t.Fatalf("unexpected nil backend")
}
if bu.url.Host != fmt.Sprintf("server-%d", expectedIdx) {
t.Fatalf("unexpected backend, expected server-%d, got %s", expectedIdx, bu.url.Host)
}
}
f([]bool{false, false, false}, 0)
f([]bool{true, true, false}, 2)
// all backend are broken, then return the first one.
f([]bool{true, true, true}, 0)
f([]bool{true}, 0)
}
func getRegexs(paths []string) []*Regex {
var sps []*Regex
for _, path := range paths {

View File

@@ -30,6 +30,7 @@ See also [LTS releases](https://docs.victoriametrics.com/victoriametrics/lts-rel
* BUGFIX: `vminsert` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/): properly discover addresses of storage nodes with [enterprise automatic-vmstorage-discovery](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/#automatic-vmstorage-discovery). Bug was introduced at [v1.141.0](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/docs/victoriametrics/changelog/CHANGELOG.md#v11410).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/): now correctly respects disabling retries via `-maxRequestBodySizeToRetry=0`. Previously, disabling retries required setting both `-maxRequestBodySizeToRetry` and `-requestBufferSize` to `0`. See [#10857](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10857). Thanks to @andriibeee for the contribution.
* BUGFIX: [vmbackupmanager](https://docs.victoriametrics.com/victoriametrics/vmbackupmanager/): explicitly set the MD5 checksum for `DeleteObjects` requests. Starting with [aws-sdk-go-v2/service/s3 v1.73.0](https://github.com/aws/aws-sdk-go-v2/blob/release-2025-01-15/service/s3/CHANGELOG.md#v1730-2025-01-15), the SDK switched the checksum algorithm from MD5 to CRC32, which [can break compatibility](https://github.com/aws/aws-sdk-go-v2/discussions/2960) with some third-party S3 implementations, including [Dell ECS](https://www.dell.com/en-us/lp/dt/elastic-cloud-storage). See [#10907](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10907).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/victoriametrics/vmauth/): attempt to route requests to the first backend when all backends are marked as broken. Previously, `vmauth` returned an error immediately without attempting any backend. This change may improve success rate in rare edge cases. See [#10837](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10837).
* FEATURE: all VictoriaMetrics components: suppress TCP health check errors when `-tls` flag is set. See [#10538](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10538).
* FEATURE: [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) and [vmsingle](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/): add `__meta_hetzner_robot_datacenter` label for `robot` role in [hetzner_sd_configs](https://docs.victoriametrics.com/victoriametrics/sd_configs/#hetzner_sd_configs). See [#10909](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10909). Thanks to @juliusrickert for contribution.

View File

@@ -189,10 +189,12 @@ See also [authorization](#authorization) and [routing](#routing) docs.
### High availability
`vmauth` automatically switches from a temporarily unavailable backend to other hot standby backends listed in `url_prefix`
if it runs with the `-loadBalancingPolicy=first_available` command-line flag. The load balancing policy can be overridden at `user` and `url_map` sections of [`-auth.config`](#auth-config) via `load_balancing_policy` option. For example, the following config instructs `vmauth` to proxy requests to `http://victoria-metrics-main:8428/` backend.
if it runs with the `-loadBalancingPolicy=first_available` command-line flag. The load balancing policy can be overridden at `user` and `url_map` sections of [`-auth.config`](#auth-config) via `load_balancing_policy` option.
For example, the following config instructs `vmauth` to proxy requests to `http://victoria-metrics-main:8428/` backend.
If this backend becomes unavailable, then `vmauth` starts proxying requests to `http://victoria-metrics-standby1:8428/`.
If this backend also becomes unavailable, then requests are proxied to the last specified backend - `http://victoria-metrics-standby2:8428/`:
If all backends are marked as unavailable, requests are proxied to the first configured backend `http://victoria-metrics-main:8428/` instead of failing immediately.
```yaml
unauthorized_user:
url_prefix:
@@ -889,8 +891,9 @@ Each `url_prefix` in the [-auth.config](#auth-config) can be specified in the fo
This guarantees that incoming load is shared uniformly among the specified backends.
See also [discovering backend IPs](#discovering-backend-ips).
`vmauth` automatically detects temporarily unavailable backends and spreads incoming queries among the remaining available backends.
`vmauth` automatically detects temporarily unavailable backends and spreads incoming requests among the remaining available backends.
This allows restarting and performing maintenance on backends without removing them from the `url_prefix` list.
If all backends are marked as unavailable, requests are proxied to the first configured backend in the list instead of failing immediately.
By default, `vmauth` returns backend responses with all the HTTP status codes to the client. It is possible to configure automatic retry of requests at other backends if the backend responds with a status code specified in the `-retryStatusCodes` command-line flag.
It is possible to customize the list of HTTP response status codes to retry via the `retry_status_codes` list at the `user` and `url_map` level of [`-auth.config`](#auth-config).