mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-17 08:36:55 +03:00
app/vmauth: properly start backend healths
Previously, backend url health check start could produce a data race and a race condition. The following panic could be produced: `panic: sync: WaitGroup is reused before previous Wait has returned` It happened because concurrent goroutine could process request, while configuration was reloaded and stopHealthChecks method was called. This commit adds a dedicated structure for backend health checks. Which protects from data race with mutex guard. And prevents race condition with a boolean flag. Fixes: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/10806
This commit is contained in:
@@ -362,40 +362,62 @@ func (up *URLPrefix) setLoadBalancingPolicy(loadBalancingPolicy string) error {
|
||||
}
|
||||
|
||||
type backendURLs struct {
|
||||
healthChecksContext context.Context
|
||||
healthChecksCancel func()
|
||||
healthChecksWG sync.WaitGroup
|
||||
|
||||
bhc backendHealthCheck
|
||||
bus []*backendURL
|
||||
}
|
||||
|
||||
type backendHealthCheck struct {
|
||||
ctx context.Context
|
||||
// mu protects fields below
|
||||
cancel func()
|
||||
mu sync.Mutex
|
||||
isStopped bool
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
func (bhc *backendHealthCheck) run(hc func()) {
|
||||
bhc.mu.Lock()
|
||||
defer bhc.mu.Unlock()
|
||||
if bhc.isStopped {
|
||||
return
|
||||
}
|
||||
bhc.wg.Go(hc)
|
||||
}
|
||||
|
||||
func (bhc *backendHealthCheck) stop() {
|
||||
bhc.mu.Lock()
|
||||
bhc.cancel()
|
||||
bhc.isStopped = true
|
||||
bhc.mu.Unlock()
|
||||
bhc.wg.Wait()
|
||||
}
|
||||
|
||||
func newBackendURLs() *backendURLs {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
return &backendURLs{
|
||||
healthChecksContext: ctx,
|
||||
healthChecksCancel: cancel,
|
||||
bhc: backendHealthCheck{
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (bus *backendURLs) add(u *url.URL) {
|
||||
bus.bus = append(bus.bus, &backendURL{
|
||||
url: u,
|
||||
healthCheckContext: bus.healthChecksContext,
|
||||
healthCheckWG: &bus.healthChecksWG,
|
||||
hasPlaceHolders: hasAnyPlaceholders(u),
|
||||
url: u,
|
||||
bhc: &bus.bhc,
|
||||
hasPlaceHolders: hasAnyPlaceholders(u),
|
||||
})
|
||||
}
|
||||
|
||||
func (bus *backendURLs) stopHealthChecks() {
|
||||
bus.healthChecksCancel()
|
||||
bus.healthChecksWG.Wait()
|
||||
bus.bhc.stop()
|
||||
}
|
||||
|
||||
type backendURL struct {
|
||||
broken atomic.Bool
|
||||
|
||||
healthCheckContext context.Context
|
||||
healthCheckWG *sync.WaitGroup
|
||||
bhc *backendHealthCheck
|
||||
|
||||
concurrentRequests atomic.Int32
|
||||
|
||||
@@ -410,7 +432,7 @@ func (bu *backendURL) isBroken() bool {
|
||||
|
||||
func (bu *backendURL) setBroken() {
|
||||
if bu.broken.CompareAndSwap(false, true) {
|
||||
bu.healthCheckWG.Go(func() {
|
||||
bu.bhc.run(func() {
|
||||
bu.runHealthCheck()
|
||||
bu.broken.Store(false)
|
||||
})
|
||||
@@ -432,11 +454,11 @@ func (bu *backendURL) runHealthCheck() {
|
||||
case <-t.C:
|
||||
// Verify network connectivity via TCP dial before marking backend healthy.
|
||||
// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/9997
|
||||
ctx, cancel := context.WithTimeout(bu.healthCheckContext, time.Second)
|
||||
ctx, cancel := context.WithTimeout(bu.bhc.ctx, time.Second)
|
||||
c, err := netutil.Dialer.DialContext(ctx, "tcp", addr)
|
||||
cancel()
|
||||
if err != nil {
|
||||
if errors.Is(bu.healthCheckContext.Err(), context.Canceled) {
|
||||
if errors.Is(bu.bhc.ctx.Err(), context.Canceled) {
|
||||
return
|
||||
}
|
||||
logger.Warnf("ignoring the backend at %s for %s because of dial error: %s", addr, *failTimeout, err)
|
||||
@@ -445,7 +467,7 @@ func (bu *backendURL) runHealthCheck() {
|
||||
|
||||
_ = c.Close()
|
||||
return
|
||||
case <-bu.healthCheckContext.Done():
|
||||
case <-bu.bhc.ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user