app/vmalert/utils: unregister metrics only if there is no refs left

Currently, when performing rules reload vmalert treats interval change as a new group. This leads to previous group being closed.
When group is closed it also unregisters metrics related to the group.

The problem is that newly created group will still use metrics with the same names as name only includes "file" and "group name" as labels and these are the same.

This commit introduces a "reference tracking" for metric names and prevents unregistering metrics if metric name is still in use.

See: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8229
Signed-off-by: Zakhar Bessarab <z.bessarab@victoriametrics.com>
This commit is contained in:
Zakhar Bessarab
2025-02-06 14:43:36 +04:00
parent 171d4019cd
commit b1e998f7ad
3 changed files with 100 additions and 2 deletions

View File

@@ -1,14 +1,56 @@
package utils
import "github.com/VictoriaMetrics/metrics"
import (
"sync"
"sync/atomic"
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
)
type namedMetric struct {
Name string
}
var usedMetrics map[string]*atomic.Int64
var usedMetricMu sync.Mutex
func trackUsedMetric(name string) {
usedMetricMu.Lock()
defer usedMetricMu.Unlock()
if usedMetrics == nil {
usedMetrics = make(map[string]*atomic.Int64)
}
if _, ok := usedMetrics[name]; !ok {
usedMetrics[name] = &atomic.Int64{}
}
usedMetrics[name].Add(1)
}
// Unregister removes the metric by name from default registry
func (nm namedMetric) Unregister() {
metrics.UnregisterMetric(nm.Name)
if usedMetrics == nil {
logger.Fatalf("BUG: unregistered metric %q before registering", nm.Name)
}
usedMetricMu.Lock()
counter, ok := usedMetrics[nm.Name]
if !ok {
logger.Fatalf("BUG: unregistered metric %q before registering", nm.Name)
}
current := counter.Add(-1)
usedMetricMu.Unlock()
if current < 0 {
logger.Fatalf("BUG: negative metric counter for %q", nm.Name)
}
if current == 0 {
metrics.UnregisterMetric(nm.Name)
}
}
// Gauge is a metrics.Gauge with Name
@@ -19,6 +61,7 @@ type Gauge struct {
// GetOrCreateGauge creates a new Gauge with the given name
func GetOrCreateGauge(name string, f func() float64) *Gauge {
trackUsedMetric(name)
return &Gauge{
namedMetric: namedMetric{Name: name},
Gauge: metrics.GetOrCreateGauge(name, f),
@@ -33,6 +76,7 @@ type Counter struct {
// GetOrCreateCounter creates a new Counter with the given name
func GetOrCreateCounter(name string) *Counter {
trackUsedMetric(name)
return &Counter{
namedMetric: namedMetric{Name: name},
Counter: metrics.GetOrCreateCounter(name),
@@ -47,6 +91,7 @@ type Summary struct {
// GetOrCreateSummary creates a new Summary with the given name
func GetOrCreateSummary(name string) *Summary {
trackUsedMetric(name)
return &Summary{
namedMetric: namedMetric{Name: name},
Summary: metrics.GetOrCreateSummary(name),

View File

@@ -0,0 +1,52 @@
package utils
import (
"testing"
"github.com/VictoriaMetrics/metrics"
)
func isMetricRegistered(name string) bool {
metricNames := metrics.GetDefaultSet().ListMetricNames()
for _, mn := range metricNames {
if mn == name {
return true
}
}
return false
}
func TestMetricIsUnregistered(t *testing.T) {
metricName := "example_runs_total"
c := GetOrCreateCounter(metricName)
if !isMetricRegistered(metricName) {
t.Errorf("Expected metric %s to be present", metricName)
}
c.Unregister()
if isMetricRegistered(metricName) {
t.Errorf("Expected metric %s to be unregistered", metricName)
}
}
func TestMetricIsRemovedIfNoUses(t *testing.T) {
metricName := "example_runs_total"
c := GetOrCreateCounter(metricName)
c2 := GetOrCreateCounter(metricName)
if !isMetricRegistered(metricName) {
t.Errorf("Expected metric %s to be present", metricName)
}
c.Unregister()
// metric should still be registered since c2 is using it
if !isMetricRegistered(metricName) {
t.Errorf("Expected metric %s to be present", metricName)
}
c2.Unregister()
if isMetricRegistered(metricName) {
t.Errorf("Expected metric %s to be unregistered", metricName)
}
}

View File

@@ -29,6 +29,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui) for [VictoriaMetrics enterprise](https://docs.victoriametrics.com/enterprise.html) components: properly display enterprise features when the enterprise version is used.
* BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and [vmselect](https://docs.victoriametrics.com/cluster-victoriametrics/): fix discrepancies when using `or` binary operator. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7759) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7640) issues for details.
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly update number of unique series for [cardinality limiter](https://docs.victoriametrics.com/#cardinality-limiter) on ingestion. Previously, limit could undercount the real number of the ingested unique series.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/): do not unregister group metrics if the group is still in use. Previously, this could lead to group metrics being absent even though rules group is still running. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8229) for details.
## [v1.102.12](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.12)