app/vmauth: properly start backend healths

Previously, backend url health check start could produce a data race
and a race condition.

 The following panic could be produced:
`panic: sync: WaitGroup is reused before previous Wait has returned`

 It happened because concurrent goroutine could process request, while
 configuration was reloaded and stopHealthChecks method was called.

 This commit adds a dedicated structure for backend health checks.
Which protects from data race with mutex guard. And prevents race
condition with a boolean flag.

Fixes: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10806
This commit is contained in:
Nikolay
2026-04-23 11:05:06 +02:00
committed by GitHub
parent 1dcf0f6826
commit 9a74f71a5f
2 changed files with 41 additions and 18 deletions

View File

@@ -362,40 +362,62 @@ func (up *URLPrefix) setLoadBalancingPolicy(loadBalancingPolicy string) error {
}
type backendURLs struct {
healthChecksContext context.Context
healthChecksCancel func()
healthChecksWG sync.WaitGroup
bhc backendHealthCheck
bus []*backendURL
}
type backendHealthCheck struct {
ctx context.Context
// mu protects fields below
cancel func()
mu sync.Mutex
isStopped bool
wg sync.WaitGroup
}
func (bhc *backendHealthCheck) run(hc func()) {
bhc.mu.Lock()
defer bhc.mu.Unlock()
if bhc.isStopped {
return
}
bhc.wg.Go(hc)
}
func (bhc *backendHealthCheck) stop() {
bhc.mu.Lock()
bhc.cancel()
bhc.isStopped = true
bhc.mu.Unlock()
bhc.wg.Wait()
}
func newBackendURLs() *backendURLs {
ctx, cancel := context.WithCancel(context.Background())
return &backendURLs{
healthChecksContext: ctx,
healthChecksCancel: cancel,
bhc: backendHealthCheck{
ctx: ctx,
cancel: cancel,
},
}
}
func (bus *backendURLs) add(u *url.URL) {
bus.bus = append(bus.bus, &backendURL{
url: u,
healthCheckContext: bus.healthChecksContext,
healthCheckWG: &bus.healthChecksWG,
hasPlaceHolders: hasAnyPlaceholders(u),
url: u,
bhc: &bus.bhc,
hasPlaceHolders: hasAnyPlaceholders(u),
})
}
func (bus *backendURLs) stopHealthChecks() {
bus.healthChecksCancel()
bus.healthChecksWG.Wait()
bus.bhc.stop()
}
type backendURL struct {
broken atomic.Bool
healthCheckContext context.Context
healthCheckWG *sync.WaitGroup
bhc *backendHealthCheck
concurrentRequests atomic.Int32
@@ -410,7 +432,7 @@ func (bu *backendURL) isBroken() bool {
func (bu *backendURL) setBroken() {
if bu.broken.CompareAndSwap(false, true) {
bu.healthCheckWG.Go(func() {
bu.bhc.run(func() {
bu.runHealthCheck()
bu.broken.Store(false)
})
@@ -432,11 +454,11 @@ func (bu *backendURL) runHealthCheck() {
case <-t.C:
// Verify network connectivity via TCP dial before marking backend healthy.
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/9997
ctx, cancel := context.WithTimeout(bu.healthCheckContext, time.Second)
ctx, cancel := context.WithTimeout(bu.bhc.ctx, time.Second)
c, err := netutil.Dialer.DialContext(ctx, "tcp", addr)
cancel()
if err != nil {
if errors.Is(bu.healthCheckContext.Err(), context.Canceled) {
if errors.Is(bu.bhc.ctx.Err(), context.Canceled) {
return
}
logger.Warnf("ignoring the backend at %s for %s because of dial error: %s", addr, *failTimeout, err)
@@ -445,7 +467,7 @@ func (bu *backendURL) runHealthCheck() {
_ = c.Close()
return
case <-bu.healthCheckContext.Done():
case <-bu.bhc.ctx.Done():
return
}
}