mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2026-05-29 14:51:27 +03:00
Compare commits
1 Commits
v1.129.0
...
debug-dock
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29858bc0bb |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@@ -47,8 +47,6 @@ jobs:
|
||||
arch: arm
|
||||
- os: linux
|
||||
arch: ppc64le
|
||||
- os: linux
|
||||
arch: s390x
|
||||
- os: darwin
|
||||
arch: amd64
|
||||
- os: darwin
|
||||
|
||||
17
Makefile
17
Makefile
@@ -125,15 +125,6 @@ vmutils-linux-ppc64le: \
|
||||
vmrestore-linux-ppc64le \
|
||||
vmctl-linux-ppc64le
|
||||
|
||||
vmutils-linux-s390x: \
|
||||
vmagent-linux-s390x \
|
||||
vmalert-linux-s390x \
|
||||
vmalert-tool-linux-s390x \
|
||||
vmauth-linux-s390x \
|
||||
vmbackup-linux-s390x \
|
||||
vmrestore-linux-s390x \
|
||||
vmctl-linux-s390x
|
||||
|
||||
vmutils-darwin-amd64: \
|
||||
vmagent-darwin-amd64 \
|
||||
vmalert-darwin-amd64 \
|
||||
@@ -266,7 +257,6 @@ release-victoria-metrics: \
|
||||
release-victoria-metrics-linux-amd64 \
|
||||
release-victoria-metrics-linux-arm \
|
||||
release-victoria-metrics-linux-arm64 \
|
||||
release-victoria-metrics-linux-s390x \
|
||||
release-victoria-metrics-darwin-amd64 \
|
||||
release-victoria-metrics-darwin-arm64 \
|
||||
release-victoria-metrics-freebsd-amd64 \
|
||||
@@ -285,9 +275,6 @@ release-victoria-metrics-linux-arm:
|
||||
release-victoria-metrics-linux-arm64:
|
||||
GOOS=linux GOARCH=arm64 $(MAKE) release-victoria-metrics-goos-goarch
|
||||
|
||||
release-victoria-metrics-linux-s390x:
|
||||
GOOS=linux GOARCH=s390x $(MAKE) release-victoria-metrics-goos-goarch
|
||||
|
||||
release-victoria-metrics-darwin-amd64:
|
||||
GOOS=darwin GOARCH=amd64 $(MAKE) release-victoria-metrics-goos-goarch
|
||||
|
||||
@@ -327,7 +314,6 @@ release-vmutils: \
|
||||
release-vmutils-linux-amd64 \
|
||||
release-vmutils-linux-arm64 \
|
||||
release-vmutils-linux-arm \
|
||||
release-vmutils-linux-s390x \
|
||||
release-vmutils-darwin-amd64 \
|
||||
release-vmutils-darwin-arm64 \
|
||||
release-vmutils-freebsd-amd64 \
|
||||
@@ -346,9 +332,6 @@ release-vmutils-linux-arm64:
|
||||
release-vmutils-linux-arm:
|
||||
GOOS=linux GOARCH=arm $(MAKE) release-vmutils-goos-goarch
|
||||
|
||||
release-vmutils-linux-s390x:
|
||||
GOOS=linux GOARCH=s390x $(MAKE) release-vmutils-goos-goarch
|
||||
|
||||
release-vmutils-darwin-amd64:
|
||||
GOOS=darwin GOARCH=amd64 $(MAKE) release-vmutils-goos-goarch
|
||||
|
||||
|
||||
@@ -27,9 +27,6 @@ victoria-metrics-linux-ppc64le-prod:
|
||||
victoria-metrics-linux-386-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-linux-386
|
||||
|
||||
victoria-metrics-linux-s390x-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
victoria-metrics-darwin-amd64-prod:
|
||||
APP_NAME=victoria-metrics $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -27,9 +27,6 @@ vmagent-linux-ppc64le-prod:
|
||||
vmagent-linux-386-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmagent-linux-s390x-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmagent-darwin-amd64-prod:
|
||||
APP_NAME=vmagent $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -74,7 +74,7 @@ var (
|
||||
"See also -opentsdbHTTPListenAddr.useProxyProtocol")
|
||||
opentsdbHTTPUseProxyProtocol = flag.Bool("opentsdbHTTPListenAddr.useProxyProtocol", false, "Whether to use proxy protocol for connections accepted "+
|
||||
"at -opentsdbHTTPListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt")
|
||||
configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config and /remotewrite-.*-config pages. It must be passed via authKey query arg. It overrides -httpAuth.*")
|
||||
configAuthKey = flagutil.NewPassword("configAuthKey", "Authorization key for accessing /config page. It must be passed via authKey query arg. It overrides -httpAuth.*")
|
||||
reloadAuthKey = flagutil.NewPassword("reloadAuthKey", "Auth key for /-/reload http endpoint. It must be passed via authKey query arg. It overrides -httpAuth.*")
|
||||
dryRun = flag.Bool("dryRun", false, "Whether to check config files without running vmagent. The following files are checked: "+
|
||||
"-promscrape.config, -remoteWrite.relabelConfig, -remoteWrite.urlRelabelConfig, -remoteWrite.streamAggr.config . "+
|
||||
@@ -253,8 +253,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
{"metric-relabel-debug", "debug metric relabeling"},
|
||||
{"api/v1/targets", "advanced information about discovered targets in JSON format"},
|
||||
{"config", "-promscrape.config contents"},
|
||||
{"remotewrite-relabel-config", "-remoteWrite.relabelConfig contents"},
|
||||
{"remotewrite-url-relabel-config", "-remoteWrite.urlRelabelConfig contents"},
|
||||
{"metrics", "available service metrics"},
|
||||
{"flags", "command-line flags"},
|
||||
{"-/reload", "reload configuration"},
|
||||
@@ -480,42 +478,6 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
|
||||
promscrape.WriteConfigData(&bb)
|
||||
fmt.Fprintf(w, `{"status":"success","data":{"yaml":%s}}`, stringsutil.JSONString(string(bb.B)))
|
||||
return true
|
||||
case "/remotewrite-relabel-config":
|
||||
if !httpserver.CheckAuthFlag(w, r, configAuthKey) {
|
||||
return true
|
||||
}
|
||||
remoteWriteRelabelConfigRequests.Inc()
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
remotewrite.WriteRelabelConfigData(w)
|
||||
return true
|
||||
case "/api/v1/status/remotewrite-relabel-config":
|
||||
if !httpserver.CheckAuthFlag(w, r, configAuthKey) {
|
||||
return true
|
||||
}
|
||||
remoteWriteStatusRelabelConfigRequests.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
var bb bytesutil.ByteBuffer
|
||||
remotewrite.WriteRelabelConfigData(&bb)
|
||||
fmt.Fprintf(w, `{"status":"success","data":{"yaml":%s}}`, stringsutil.JSONString(string(bb.B)))
|
||||
return true
|
||||
case "/remotewrite-url-relabel-config":
|
||||
if !httpserver.CheckAuthFlag(w, r, configAuthKey) {
|
||||
return true
|
||||
}
|
||||
remoteWriteURLRelabelConfigRequests.Inc()
|
||||
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
|
||||
remotewrite.WriteURLRelabelConfigData(w)
|
||||
return true
|
||||
case "/api/v1/status/remotewrite-url-relabel-config":
|
||||
if !httpserver.CheckAuthFlag(w, r, configAuthKey) {
|
||||
return true
|
||||
}
|
||||
remoteWriteStatusURLRelabelConfigRequests.Inc()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
var bb bytesutil.ByteBuffer
|
||||
remotewrite.WriteURLRelabelConfigData(&bb)
|
||||
fmt.Fprintf(w, `{"status":"success","data":{"yaml":%s}}`, stringsutil.JSONString(string(bb.B)))
|
||||
return true
|
||||
case "/prometheus/-/reload", "/-/reload":
|
||||
if !httpserver.CheckAuthFlag(w, r, reloadAuthKey) {
|
||||
return true
|
||||
@@ -786,12 +748,6 @@ var (
|
||||
promscrapeConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/config"}`)
|
||||
promscrapeStatusConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/status/config"}`)
|
||||
|
||||
remoteWriteRelabelConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/remotewrite-relabel-config"}`)
|
||||
remoteWriteStatusRelabelConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/status/remotewrite-relabel-config"}`)
|
||||
|
||||
remoteWriteURLRelabelConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/remotewrite-url-relabel-config"}`)
|
||||
remoteWriteStatusURLRelabelConfigRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/api/v1/status/remotewrite-url-relabel-config"}`)
|
||||
|
||||
promscrapeConfigReloadRequests = metrics.NewCounter(`vmagent_http_requests_total{path="/-/reload"}`)
|
||||
)
|
||||
|
||||
|
||||
@@ -3,18 +3,15 @@ package remotewrite
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"go.yaml.in/yaml/v3"
|
||||
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
@@ -35,12 +32,9 @@ var (
|
||||
"See https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels")
|
||||
)
|
||||
|
||||
var labelsGlobal []prompb.Label
|
||||
|
||||
var (
|
||||
labelsGlobal []prompb.Label
|
||||
|
||||
remoteWriteRelabelConfigData atomic.Pointer[[]byte]
|
||||
remoteWriteURLRelabelConfigData atomic.Pointer[[]interface{}]
|
||||
|
||||
relabelConfigReloads *metrics.Counter
|
||||
relabelConfigReloadErrors *metrics.Counter
|
||||
relabelConfigSuccess *metrics.Gauge
|
||||
@@ -73,42 +67,6 @@ func initRelabelConfigs() {
|
||||
}
|
||||
}
|
||||
|
||||
// WriteRelabelConfigData writes -remoteWrite.relabelConfig contents to w
|
||||
func WriteRelabelConfigData(w io.Writer) {
|
||||
p := remoteWriteRelabelConfigData.Load()
|
||||
if p == nil {
|
||||
// Nothing to write to w
|
||||
return
|
||||
}
|
||||
_, _ = w.Write(*p)
|
||||
}
|
||||
|
||||
// WriteURLRelabelConfigData writes -remoteWrite.urlRelabelConfig contents to w
|
||||
func WriteURLRelabelConfigData(w io.Writer) {
|
||||
p := remoteWriteURLRelabelConfigData.Load()
|
||||
if p == nil {
|
||||
// Nothing to write to w
|
||||
return
|
||||
}
|
||||
type urlRelabelCfg struct {
|
||||
Url string `yaml:"url"`
|
||||
RelabelConfig interface{} `yaml:"relabel_config"`
|
||||
}
|
||||
var cs []urlRelabelCfg
|
||||
for i, url := range *remoteWriteURLs {
|
||||
cfgData := (*p)[i]
|
||||
if !*showRemoteWriteURL {
|
||||
url = fmt.Sprintf("%d:secret-url", i+1)
|
||||
}
|
||||
cs = append(cs, urlRelabelCfg{
|
||||
Url: url,
|
||||
RelabelConfig: cfgData,
|
||||
})
|
||||
}
|
||||
d, _ := yaml.Marshal(cs)
|
||||
_, _ = w.Write(d)
|
||||
}
|
||||
|
||||
func reloadRelabelConfigs() {
|
||||
rcs := allRelabelConfigs.Load()
|
||||
if !rcs.isSet() {
|
||||
@@ -132,42 +90,28 @@ func reloadRelabelConfigs() {
|
||||
func loadRelabelConfigs() (*relabelConfigs, error) {
|
||||
var rcs relabelConfigs
|
||||
if *relabelConfigPathGlobal != "" {
|
||||
global, rawCfg, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
global, err := promrelabel.LoadRelabelConfigs(*relabelConfigPathGlobal)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load -remoteWrite.relabelConfig=%q: %w", *relabelConfigPathGlobal, err)
|
||||
}
|
||||
remoteWriteRelabelConfigData.Store(&rawCfg)
|
||||
rcs.global = global
|
||||
}
|
||||
if len(*relabelConfigPaths) > len(*remoteWriteURLs) {
|
||||
return nil, fmt.Errorf("too many -remoteWrite.urlRelabelConfig args: %d; it mustn't exceed the number of -remoteWrite.url args: %d",
|
||||
len(*relabelConfigPaths), (len(*remoteWriteURLs)))
|
||||
}
|
||||
|
||||
var urlRelabelCfgs []interface{}
|
||||
rcs.perURL = make([]*promrelabel.ParsedConfigs, len(*remoteWriteURLs))
|
||||
for i, path := range *relabelConfigPaths {
|
||||
if len(path) == 0 {
|
||||
urlRelabelCfgs = append(urlRelabelCfgs, nil)
|
||||
// Skip empty relabel config.
|
||||
continue
|
||||
}
|
||||
prc, rawCfg, err := promrelabel.LoadRelabelConfigs(path)
|
||||
prc, err := promrelabel.LoadRelabelConfigs(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("cannot load relabel configs from -remoteWrite.urlRelabelConfig=%q: %w", path, err)
|
||||
}
|
||||
rcs.perURL[i] = prc
|
||||
|
||||
var parsedCfg interface{}
|
||||
_ = yaml.Unmarshal(rawCfg, &parsedCfg)
|
||||
urlRelabelCfgs = append(urlRelabelCfgs, parsedCfg)
|
||||
}
|
||||
if len(*remoteWriteURLs) > len(*relabelConfigPaths) {
|
||||
// fill the urlRelabelCfgs with empty relabel configs if not set
|
||||
for i := len(*relabelConfigPaths); i < len(*remoteWriteURLs); i++ {
|
||||
urlRelabelCfgs = append(urlRelabelCfgs, nil)
|
||||
}
|
||||
}
|
||||
remoteWriteURLRelabelConfigData.Store(&urlRelabelCfgs)
|
||||
return &rcs, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -27,7 +27,6 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/ratelimiter"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/slicesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timeserieslimits"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -486,9 +485,6 @@ func tryPush(at *auth.Token, wr *prompb.WriteRequest, forceDropSamplesOnFailure
|
||||
matchIdxs.B = sas.Push(tssBlock, matchIdxs.B)
|
||||
if !*streamAggrGlobalKeepInput {
|
||||
tssBlock = dropAggregatedSeries(tssBlock, matchIdxs.B, *streamAggrGlobalDropInput)
|
||||
} else if *streamAggrGlobalDropInput {
|
||||
// if both keep_input and drop_input are true, we keep only the aggregated series
|
||||
tssBlock = dropUnaggregatedSeries(tssBlock, matchIdxs.B)
|
||||
}
|
||||
matchIdxsPool.Put(matchIdxs)
|
||||
}
|
||||
@@ -992,17 +988,7 @@ func (rwctx *remoteWriteCtx) TryPushTimeSeries(tss []prompb.TimeSeries, forceDro
|
||||
tss = append(*v, tss...)
|
||||
}
|
||||
tss = dropAggregatedSeries(tss, matchIdxs.B, rwctx.streamAggrDropInput)
|
||||
} else if rwctx.streamAggrDropInput {
|
||||
// if both keep_input and drop_input are true, we keep only the aggregated series
|
||||
if rctx == nil {
|
||||
rctx = getRelabelCtx()
|
||||
// Make a copy of tss before dropping aggregated series
|
||||
v = tssPool.Get().(*[]prompb.TimeSeries)
|
||||
tss = append(*v, tss...)
|
||||
}
|
||||
tss = dropUnaggregatedSeries(tss, matchIdxs.B)
|
||||
}
|
||||
|
||||
matchIdxsPool.Put(matchIdxs)
|
||||
}
|
||||
if rwctx.deduplicator != nil {
|
||||
@@ -1025,10 +1011,9 @@ func (rwctx *remoteWriteCtx) TryPushTimeSeries(tss []prompb.TimeSeries, forceDro
|
||||
return false
|
||||
}
|
||||
|
||||
var matchIdxsPool slicesutil.BufferPool[uint32]
|
||||
var matchIdxsPool bytesutil.ByteBufferPool
|
||||
|
||||
// dropAggregatedSeries drops matched series, also the unmatched if dropInput is true.
|
||||
func dropAggregatedSeries(src []prompb.TimeSeries, matchIdxs []uint32, dropInput bool) []prompb.TimeSeries {
|
||||
func dropAggregatedSeries(src []prompb.TimeSeries, matchIdxs []byte, dropInput bool) []prompb.TimeSeries {
|
||||
dst := src[:0]
|
||||
if !dropInput {
|
||||
for i, match := range matchIdxs {
|
||||
@@ -1043,20 +1028,6 @@ func dropAggregatedSeries(src []prompb.TimeSeries, matchIdxs []uint32, dropInput
|
||||
return dst
|
||||
}
|
||||
|
||||
// dropUnaggregatedSeries drops unmatched series.
|
||||
func dropUnaggregatedSeries(src []prompb.TimeSeries, matchIdxs []uint32) []prompb.TimeSeries {
|
||||
dst := src[:0]
|
||||
for i, match := range matchIdxs {
|
||||
if match == 0 {
|
||||
continue
|
||||
}
|
||||
dst = append(dst, src[i])
|
||||
}
|
||||
tail := src[len(dst):]
|
||||
clear(tail)
|
||||
return dst
|
||||
}
|
||||
|
||||
func (rwctx *remoteWriteCtx) pushInternalTrackDropped(tss []prompb.TimeSeries) {
|
||||
if rwctx.tryPushTimeSeriesInternal(tss) {
|
||||
return
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/consistenthash"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/protoparser/prometheus"
|
||||
@@ -59,8 +57,8 @@ func TestGetLabelsHash_Distribution(t *testing.T) {
|
||||
f(10)
|
||||
}
|
||||
|
||||
func TestRemoteWriteContext_TryPushTimeSeries(t *testing.T) {
|
||||
f := func(streamAggrConfig, relabelConfig string, enableWindows bool, dedupInterval time.Duration, keepInput, dropInput bool, input string, expectedRowsPushedAfterRelabel, expectedPushedSample int) {
|
||||
func TestRemoteWriteContext_TryPush_ImmutableTimeseries(t *testing.T) {
|
||||
f := func(streamAggrConfig, relabelConfig string, enableWindows bool, dedupInterval time.Duration, keepInput, dropInput bool, input string) {
|
||||
t.Helper()
|
||||
perURLRelabel, err := promrelabel.ParseRelabelConfigsData([]byte(relabelConfig))
|
||||
if err != nil {
|
||||
@@ -73,16 +71,10 @@ func TestRemoteWriteContext_TryPushTimeSeries(t *testing.T) {
|
||||
}
|
||||
allRelabelConfigs.Store(rcs)
|
||||
|
||||
path := "fast-queue-write-test"
|
||||
fs.MustRemoveDir(path)
|
||||
fq := persistentqueue.MustOpenFastQueue(path, "test", 100, 0, false)
|
||||
defer fs.MustRemoveDir(path)
|
||||
defer fq.MustClose()
|
||||
|
||||
pss := make([]*pendingSeries, 1)
|
||||
isVMProto := &atomic.Bool{}
|
||||
isVMProto.Store(true)
|
||||
pss[0] = newPendingSeries(fq, isVMProto, 0, 100)
|
||||
pss[0] = newPendingSeries(nil, isVMProto, 0, 100)
|
||||
rwctx := &remoteWriteCtx{
|
||||
idx: 0,
|
||||
streamAggrKeepInput: keepInput,
|
||||
@@ -91,8 +83,6 @@ func TestRemoteWriteContext_TryPushTimeSeries(t *testing.T) {
|
||||
rowsPushedAfterRelabel: metrics.GetOrCreateCounter(`foo`),
|
||||
rowsDroppedByRelabel: metrics.GetOrCreateCounter(`bar`),
|
||||
}
|
||||
defer metrics.UnregisterAllMetrics()
|
||||
|
||||
if dedupInterval > 0 {
|
||||
rwctx.deduplicator = streamaggr.NewDeduplicator(nil, enableWindows, dedupInterval, nil, "dedup-global")
|
||||
}
|
||||
@@ -114,27 +104,23 @@ func TestRemoteWriteContext_TryPushTimeSeries(t *testing.T) {
|
||||
inputTss := prometheus.MustParsePromMetrics(input, offsetMsecs)
|
||||
expectedTss := make([]prompb.TimeSeries, len(inputTss))
|
||||
|
||||
// check inputTss is not modified after TryPushTimeSeries
|
||||
// copy inputTss to make sure it is not mutated during TryPush call
|
||||
copy(expectedTss, inputTss)
|
||||
if !rwctx.TryPushTimeSeries(inputTss, false) {
|
||||
t.Fatalf("cannot push samples to rwctx")
|
||||
}
|
||||
|
||||
if int(rwctx.rowsPushedAfterRelabel.Get()) != expectedRowsPushedAfterRelabel {
|
||||
t.Fatalf("unexpected number of rows after relabel; got %d; want %d", rwctx.rowsPushedAfterRelabel.Get(), expectedRowsPushedAfterRelabel)
|
||||
}
|
||||
|
||||
if len(pss[0].wr.tss) != expectedPushedSample {
|
||||
t.Fatalf("unexpected number of pushed samples; got %d; want %d", len(pss[0].wr.tss), expectedPushedSample)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(expectedTss, inputTss) {
|
||||
t.Fatalf("unexpected samples;\ngot\n%v\nwant\n%v", inputTss, expectedTss)
|
||||
}
|
||||
}
|
||||
|
||||
// relabeling
|
||||
f(``, `
|
||||
f(`
|
||||
- interval: 1m
|
||||
outputs: [sum_samples]
|
||||
- interval: 2m
|
||||
outputs: [count_series]
|
||||
`, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dev"
|
||||
@@ -143,66 +129,53 @@ metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`, 2, 2)
|
||||
|
||||
// relabeling + aggregation
|
||||
f(`
|
||||
- match: '{env="dev"}'
|
||||
interval: 1m
|
||||
outputs: [sum_samples]
|
||||
`, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: ".*"
|
||||
`, false, 0, false, false, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`, 4, 2)
|
||||
|
||||
// aggregation + keepInput
|
||||
f(`
|
||||
- match: '{env="dev"}'
|
||||
interval: 1m
|
||||
outputs: [sum_samples]
|
||||
`, ``, false, 0, true, false, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`, 4, 4)
|
||||
|
||||
// aggregation + dropInput
|
||||
f(`
|
||||
- match: '{env="dev"}'
|
||||
interval: 1m
|
||||
outputs: [sum_samples]
|
||||
`, ``, false, 0, false, true, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`, 4, 0)
|
||||
|
||||
// aggregation + keepInput + dropInput
|
||||
f(`
|
||||
- match: '{env="dev"}'
|
||||
interval: 1m
|
||||
outputs: [sum_samples]
|
||||
`, ``, false, 0, true, true, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="bar"} 25
|
||||
`, 3, 1)
|
||||
|
||||
// aggregation + deduplication
|
||||
`)
|
||||
f(``, ``, true, time.Hour, false, false, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="foo"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="foo"} 25
|
||||
`, 4, 0)
|
||||
`)
|
||||
f(``, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dev"
|
||||
`, true, time.Hour, false, false, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="bar"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`)
|
||||
f(``, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dev"
|
||||
`, true, time.Hour, true, false, `
|
||||
metric{env="test"} 10
|
||||
metric{env="dev"} 20
|
||||
metric{env="foo"} 15
|
||||
metric{env="dev"} 25
|
||||
`)
|
||||
f(``, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dev"
|
||||
`, true, time.Hour, false, true, `
|
||||
metric{env="foo"} 10
|
||||
metric{env="dev"} 20
|
||||
metric{env="foo"} 15
|
||||
metric{env="dev"} 25
|
||||
`)
|
||||
f(``, `
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dev"
|
||||
`, true, time.Hour, true, true, `
|
||||
metric{env="dev"} 10
|
||||
metric{env="test"} 20
|
||||
metric{env="dev"} 15
|
||||
metric{env="bar"} 25
|
||||
`)
|
||||
}
|
||||
|
||||
func TestShardAmountRemoteWriteCtx(t *testing.T) {
|
||||
|
||||
@@ -18,12 +18,12 @@ var (
|
||||
streamAggrGlobalConfig = flag.String("streamAggr.config", "", "Optional path to file with stream aggregation config. "+
|
||||
"See https://docs.victoriametrics.com/victoriametrics/stream-aggregation/ . "+
|
||||
"See also -streamAggr.keepInput, -streamAggr.dropInput and -streamAggr.dedupInterval")
|
||||
streamAggrGlobalKeepInput = flag.Bool("streamAggr.keepInput", false, "Whether to keep input samples that match any rule in "+
|
||||
"-streamAggr.config. By default, matched raw samples are aggregated and dropped, while unmatched samples "+
|
||||
"are written to the remote storage. See also -streamAggr.dropInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrGlobalDropInput = flag.Bool("streamAggr.dropInput", false, "Whether to drop input samples that not matching any rule in "+
|
||||
"-streamAggr.config. By default, only matched raw samples are dropped, while unmatched samples "+
|
||||
"are written to the remote storage. See also -streamAggr.keepInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrGlobalKeepInput = flag.Bool("streamAggr.keepInput", false, "Whether to keep all the input samples after the aggregation "+
|
||||
"with -streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples "+
|
||||
"are written to remote storages write. See also -streamAggr.dropInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrGlobalDropInput = flag.Bool("streamAggr.dropInput", false, "Whether to drop all the input samples after the aggregation "+
|
||||
"with -remoteWrite.streamAggr.config. By default, only aggregates samples are dropped, while the remaining samples "+
|
||||
"are written to remote storages write. See also -streamAggr.keepInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrGlobalDedupInterval = flag.Duration("streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval on "+
|
||||
"aggregator before optional aggregation with -streamAggr.config . "+
|
||||
"See also -dedup.minScrapeInterval and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#deduplication")
|
||||
@@ -43,11 +43,11 @@ var (
|
||||
streamAggrConfig = flagutil.NewArrayString("remoteWrite.streamAggr.config", "Optional path to file with stream aggregation config for the corresponding -remoteWrite.url. "+
|
||||
"See https://docs.victoriametrics.com/victoriametrics/stream-aggregation/ . "+
|
||||
"See also -remoteWrite.streamAggr.keepInput, -remoteWrite.streamAggr.dropInput and -remoteWrite.streamAggr.dedupInterval")
|
||||
streamAggrDropInput = flagutil.NewArrayBool("remoteWrite.streamAggr.dropInput", "Whether to drop input samples that not matching any rule in "+
|
||||
"the corresponding -remoteWrite.streamAggr.config. By default, only matched raw samples are dropped, while unmatched samples "+
|
||||
streamAggrDropInput = flagutil.NewArrayBool("remoteWrite.streamAggr.dropInput", "Whether to drop all the input samples after the aggregation "+
|
||||
"with -remoteWrite.streamAggr.config at the corresponding -remoteWrite.url. By default, only aggregates samples are dropped, while the remaining samples "+
|
||||
"are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.keepInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrKeepInput = flagutil.NewArrayBool("remoteWrite.streamAggr.keepInput", "Whether to keep input samples that match any rule in "+
|
||||
"the corresponding -remoteWrite.streamAggr.config. By default, matched raw samples are aggregated and dropped, while unmatched samples "+
|
||||
streamAggrKeepInput = flagutil.NewArrayBool("remoteWrite.streamAggr.keepInput", "Whether to keep all the input samples after the aggregation "+
|
||||
"with -remoteWrite.streamAggr.config at the corresponding -remoteWrite.url. By default, only aggregates samples are dropped, while the remaining samples "+
|
||||
"are written to the corresponding -remoteWrite.url . See also -remoteWrite.streamAggr.dropInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrDedupInterval = flagutil.NewArrayDuration("remoteWrite.streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval before optional aggregation "+
|
||||
"with -remoteWrite.streamAggr.config at the corresponding -remoteWrite.url. See also -dedup.minScrapeInterval and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#deduplication")
|
||||
|
||||
@@ -27,9 +27,6 @@ vmalert-tool-linux-ppc64le-prod:
|
||||
vmalert-tool-linux-386-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmalert-tool-linux-s390x-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmalert-tool-darwin-amd64-prod:
|
||||
APP_NAME=vmalert-tool $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ func UnitTest(files []string, disableGroupLabel bool, externalLabels []string, e
|
||||
}
|
||||
labels[s[:n]] = s[n+1:]
|
||||
}
|
||||
err = notifier.Init(labels, externalURL)
|
||||
_, err = notifier.Init(labels, externalURL)
|
||||
if err != nil {
|
||||
logger.Fatalf("failed to init notifier: %v", err)
|
||||
}
|
||||
@@ -379,7 +379,7 @@ func (tg *testGroup) test(evalInterval time.Duration, groupOrderMap map[string]i
|
||||
if len(g.Rules) == 0 {
|
||||
continue
|
||||
}
|
||||
errs := g.ExecOnce(context.Background(), rw, ts)
|
||||
errs := g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, rw, ts)
|
||||
for err := range errs {
|
||||
if err != nil {
|
||||
checkErrs = append(checkErrs, fmt.Errorf("\nfailed to exec group: %q, time: %s, err: %w", g.Name,
|
||||
|
||||
@@ -27,9 +27,6 @@ vmalert-linux-ppc64le-prod:
|
||||
vmalert-linux-386-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmalert-linux-s390x-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmalert-darwin-amd64-prod:
|
||||
APP_NAME=vmalert $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -179,11 +179,11 @@ func (c *Client) Query(ctx context.Context, query string, ts time.Time) (Result,
|
||||
var parseFn func(resp *http.Response) (Result, error)
|
||||
switch c.dataSourceType {
|
||||
case datasourcePrometheus:
|
||||
parseFn = parsePrometheusInstantResponse
|
||||
parseFn = parsePrometheusResponse
|
||||
case datasourceGraphite:
|
||||
parseFn = parseGraphiteResponse
|
||||
case datasourceVLogs:
|
||||
parseFn = parseVLogsInstantResponse
|
||||
parseFn = parseVLogsResponse
|
||||
default:
|
||||
logger.Panicf("BUG: unsupported datasource type %q to parse query response", c.dataSourceType)
|
||||
}
|
||||
@@ -239,9 +239,9 @@ func (c *Client) QueryRange(ctx context.Context, query string, start, end time.T
|
||||
var parseFn func(resp *http.Response) (Result, error)
|
||||
switch c.dataSourceType {
|
||||
case datasourcePrometheus:
|
||||
parseFn = parsePrometheusRangeResponse
|
||||
parseFn = parsePrometheusResponse
|
||||
case datasourceVLogs:
|
||||
parseFn = parseVLogsRangeResponse
|
||||
parseFn = parseVLogsResponse
|
||||
default:
|
||||
logger.Panicf("BUG: unsupported datasource type %q to parse query range response", c.dataSourceType)
|
||||
}
|
||||
|
||||
@@ -172,26 +172,17 @@ const (
|
||||
rtVector, rtMatrix, rScalar = "vector", "matrix", "scalar"
|
||||
)
|
||||
|
||||
func parsePromResponse(resp *http.Response) (*promResponse, error) {
|
||||
func parsePrometheusResponse(resp *http.Response) (res Result, err error) {
|
||||
r := &promResponse{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode response: %w", err)
|
||||
if err = json.NewDecoder(resp.Body).Decode(r); err != nil {
|
||||
return res, fmt.Errorf("failed to decode response: %w", err)
|
||||
}
|
||||
if r.Status == statusError {
|
||||
return nil, fmt.Errorf("response error %q: %s", r.ErrorType, r.Error)
|
||||
return res, fmt.Errorf("response error %q: %s", r.ErrorType, r.Error)
|
||||
}
|
||||
if r.Status != statusSuccess {
|
||||
return nil, fmt.Errorf("unknown response status %q", r.Status)
|
||||
return res, fmt.Errorf("unknown response status %q", r.Status)
|
||||
}
|
||||
return r, nil
|
||||
}
|
||||
|
||||
func parsePrometheusInstantResponse(resp *http.Response) (res Result, err error) {
|
||||
r, err := parsePromResponse(resp)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
|
||||
var parseFn func() ([]Metric, error)
|
||||
switch r.Data.ResultType {
|
||||
case rtVector:
|
||||
@@ -200,6 +191,12 @@ func parsePrometheusInstantResponse(resp *http.Response) (res Result, err error)
|
||||
return res, fmt.Errorf("unmarshal err %w; \n %#v", err, string(r.Data.Result))
|
||||
}
|
||||
parseFn = pi.metrics
|
||||
case rtMatrix:
|
||||
var pr promRange
|
||||
if err := json.Unmarshal(r.Data.Result, &pr.Result); err != nil {
|
||||
return res, err
|
||||
}
|
||||
parseFn = pr.metrics
|
||||
case rScalar:
|
||||
var ps promScalar
|
||||
if err := json.Unmarshal(r.Data.Result, &ps); err != nil {
|
||||
@@ -209,6 +206,7 @@ func parsePrometheusInstantResponse(resp *http.Response) (res Result, err error)
|
||||
default:
|
||||
return res, fmt.Errorf("unknown result type %q", r.Data.ResultType)
|
||||
}
|
||||
|
||||
ms, err := parseFn()
|
||||
if err != nil {
|
||||
return res, err
|
||||
@@ -224,34 +222,6 @@ func parsePrometheusInstantResponse(resp *http.Response) (res Result, err error)
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func parsePrometheusRangeResponse(resp *http.Response) (res Result, err error) {
|
||||
r, err := parsePromResponse(resp)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
if r.Data.ResultType != rtMatrix {
|
||||
return res, fmt.Errorf("unexpected result type %q; expected result type %q", r.Data.ResultType, rtMatrix)
|
||||
}
|
||||
|
||||
var pr promRange
|
||||
if err := json.Unmarshal(r.Data.Result, &pr.Result); err != nil {
|
||||
return res, err
|
||||
}
|
||||
ms, err := pr.metrics()
|
||||
if err != nil {
|
||||
return res, err
|
||||
}
|
||||
res = Result{Data: ms, IsPartial: r.IsPartial}
|
||||
if r.Stats.SeriesFetched != nil {
|
||||
intV, err := strconv.Atoi(*r.Stats.SeriesFetched)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("failed to convert stats.seriesFetched to int: %w", err)
|
||||
}
|
||||
res.SeriesFetched = &intV
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func (c *Client) setPrometheusInstantReqParams(r *http.Request, query string, timestamp time.Time) {
|
||||
if c.appendTypePrefix {
|
||||
r.URL.Path += "/prometheus"
|
||||
|
||||
@@ -65,23 +65,21 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
case 3:
|
||||
w.Write([]byte(`{"status":"unknown"}`))
|
||||
case 4:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector"}}`))
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix"}}`))
|
||||
case 5:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"vm_rows"},"values":[[1583786142,"13763"]]}]}}`))
|
||||
case 6:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"vm_rows","foo":"bar"},"value":[1583786142,"13763"]},{"metric":{"__name__":"vm_requests","foo":"baz"},"value":[1583786140,"2000"]}]}}`))
|
||||
case 7:
|
||||
case 6:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "1"]}}`))
|
||||
case 8:
|
||||
case 7:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"scalar","result":[1583786142, "1"]},"stats":{"seriesFetched": "42"}}`))
|
||||
case 9:
|
||||
case 8:
|
||||
w.Write([]byte(`{"status":"success", "isPartial":true, "data":{"resultType":"scalar","result":[1583786142, "1"]}}`))
|
||||
}
|
||||
})
|
||||
mux.HandleFunc("/render", func(w http.ResponseWriter, _ *http.Request) {
|
||||
c++
|
||||
switch c {
|
||||
case 10:
|
||||
case 9:
|
||||
w.Write([]byte(`[{"target":"constantLine(10)","tags":{"name":"constantLine(10)"},"datapoints":[[10,1611758343],[10,1611758373],[10,1611758403]]}]`))
|
||||
}
|
||||
})
|
||||
@@ -104,9 +102,9 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
t.Fatalf("failed to parse 'time' query param %q: %s", timeParam, err)
|
||||
}
|
||||
switch c {
|
||||
case 11:
|
||||
case 10:
|
||||
w.Write([]byte("[]"))
|
||||
case 12:
|
||||
case 11:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[{"metric":{"__name__":"total","foo":"bar"},"value":[1583786142,"13763"]},{"metric":{"__name__":"total","foo":"baz"},"value":[1583786140,"2000"]}]}}`))
|
||||
}
|
||||
})
|
||||
@@ -125,7 +123,6 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
ts := time.Now()
|
||||
|
||||
expErr := func(query, err string) {
|
||||
t.Helper()
|
||||
_, _, gotErr := pq.Query(ctx, query, ts)
|
||||
if gotErr == nil {
|
||||
t.Fatalf("expected %q got nil", err)
|
||||
@@ -140,9 +137,8 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
expErr(vmQuery, "response error") // 2
|
||||
expErr(vmQuery, "unknown response status") // 3
|
||||
expErr(vmQuery, "unexpected end of JSON input") // 4
|
||||
expErr(vmQuery, "unknown result type") // 5
|
||||
|
||||
res, _, err := pq.Query(ctx, vmQuery, ts) // 6 - vector
|
||||
res, _, err := pq.Query(ctx, vmQuery, ts) // 5 - vector
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -163,7 +159,7 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
}
|
||||
metricsEqual(t, res.Data, expected)
|
||||
|
||||
res, req, err := pq.Query(ctx, vmQuery, ts) // 7 - scalar
|
||||
res, req, err := pq.Query(ctx, vmQuery, ts) // 6 - scalar
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -188,7 +184,7 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
res.SeriesFetched)
|
||||
}
|
||||
|
||||
res, _, err = pq.Query(ctx, vmQuery, ts) // 8 - scalar with stats
|
||||
res, _, err = pq.Query(ctx, vmQuery, ts) // 7 - scalar with stats
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -209,7 +205,7 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
*res.SeriesFetched)
|
||||
}
|
||||
|
||||
res, _, err = pq.Query(ctx, vmQuery, ts) // 9
|
||||
res, _, err = pq.Query(ctx, vmQuery, ts) // 8
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -220,7 +216,7 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
// test graphite
|
||||
gq := s.BuildWithParams(QuerierParams{DataSourceType: string(datasourceGraphite)})
|
||||
|
||||
res, _, err = gq.Query(ctx, queryRender, ts) // 10 - graphite
|
||||
res, _, err = gq.Query(ctx, queryRender, ts) // 9 - graphite
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -240,9 +236,9 @@ func TestVMInstantQuery(t *testing.T) {
|
||||
vlogs := datasourceVLogs
|
||||
pq = s.BuildWithParams(QuerierParams{DataSourceType: string(vlogs), EvaluationInterval: 15 * time.Second})
|
||||
|
||||
expErr(vlogsQuery, "error parsing response") // 11
|
||||
expErr(vlogsQuery, "error parsing response") // 10
|
||||
|
||||
res, _, err = pq.Query(ctx, vlogsQuery, ts) // 12
|
||||
res, _, err = pq.Query(ctx, vlogsQuery, ts) // 11
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
@@ -394,8 +390,6 @@ func TestVMRangeQuery(t *testing.T) {
|
||||
switch c {
|
||||
case 0:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"vm_rows"},"values":[[1583786142,"13763"]]}]}}`))
|
||||
case 1:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"vector","result":[1583786142, "1"]}}`))
|
||||
}
|
||||
})
|
||||
mux.HandleFunc("/select/logsql/stats_query_range", func(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -428,7 +422,7 @@ func TestVMRangeQuery(t *testing.T) {
|
||||
t.Fatalf("expected 'step' query param to be 60s; got %q instead", step)
|
||||
}
|
||||
switch c {
|
||||
case 2:
|
||||
case 1:
|
||||
w.Write([]byte(`{"status":"success","data":{"resultType":"matrix","result":[{"metric":{"__name__":"total"},"values":[[1583786142,"10"]]}]}}`))
|
||||
}
|
||||
})
|
||||
@@ -452,13 +446,13 @@ func TestVMRangeQuery(t *testing.T) {
|
||||
|
||||
start, end := time.Now().Add(-time.Minute), time.Now()
|
||||
|
||||
res, err := pq.QueryRange(ctx, vmQuery, start, end) // case 0
|
||||
res, err := pq.QueryRange(ctx, vmQuery, start, end)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected %s", err)
|
||||
}
|
||||
m := res.Data
|
||||
if len(m) != 1 {
|
||||
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
|
||||
t.Fatalf("expected 1 metric got %d in %+v", len(m), m)
|
||||
}
|
||||
expected := Metric{
|
||||
Labels: []prompb.Label{{Value: "vm_rows", Name: "__name__"}},
|
||||
@@ -469,9 +463,6 @@ func TestVMRangeQuery(t *testing.T) {
|
||||
t.Fatalf("unexpected metric %+v want %+v", m[0], expected)
|
||||
}
|
||||
|
||||
_, err = pq.QueryRange(ctx, vmQuery, start, end) // case 1
|
||||
expectError(t, err, "unexpected result type")
|
||||
|
||||
// test unsupported graphite
|
||||
gq := s.BuildWithParams(QuerierParams{DataSourceType: string(datasourceGraphite)})
|
||||
|
||||
|
||||
@@ -40,28 +40,8 @@ func (c *Client) setVLogsRangeReqParams(r *http.Request, query string, start, en
|
||||
c.setReqParams(r, query)
|
||||
}
|
||||
|
||||
func parseVLogsInstantResponse(resp *http.Response) (res Result, err error) {
|
||||
res, err = parsePrometheusInstantResponse(resp)
|
||||
if err != nil {
|
||||
return Result{}, err
|
||||
}
|
||||
for i := range res.Data {
|
||||
m := &res.Data[i]
|
||||
for j := range m.Labels {
|
||||
// reserve the stats func result name with a new label `stats_result` instead of dropping it,
|
||||
// since there could be multiple stats results in a single query, for instance:
|
||||
// _time:5m | stats quantile(0.5, request_duration_seconds) p50, quantile(0.9, request_duration_seconds) p90
|
||||
if m.Labels[j].Name == "__name__" {
|
||||
m.Labels[j].Name = "stats_result"
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func parseVLogsRangeResponse(resp *http.Response) (res Result, err error) {
|
||||
res, err = parsePrometheusRangeResponse(resp)
|
||||
func parseVLogsResponse(resp *http.Response) (res Result, err error) {
|
||||
res, err = parsePrometheusResponse(resp)
|
||||
if err != nil {
|
||||
return Result{}, err
|
||||
}
|
||||
|
||||
@@ -227,13 +227,14 @@ func newManager(ctx context.Context) (*manager, error) {
|
||||
labels[s[:n]] = s[n+1:]
|
||||
}
|
||||
|
||||
err = notifier.Init(labels, *externalURL)
|
||||
nts, err := notifier.Init(labels, *externalURL)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to init notifier: %w", err)
|
||||
}
|
||||
manager := &manager{
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: q,
|
||||
notifiers: nts,
|
||||
labels: labels,
|
||||
}
|
||||
rw, err := remotewrite.Init(ctx)
|
||||
|
||||
@@ -96,10 +96,9 @@ groups:
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
labels: map[string]string{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
|
||||
rw: &remotewrite.Client{},
|
||||
}
|
||||
_, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
|
||||
syncCh := make(chan struct{})
|
||||
sighupCh := procutil.NewSighupChan()
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
// manager controls group states
|
||||
type manager struct {
|
||||
querierBuilder datasource.QuerierBuilder
|
||||
notifiers func() []notifier.Notifier
|
||||
|
||||
rw remotewrite.RWClient
|
||||
// remote read builder.
|
||||
@@ -93,16 +94,17 @@ func (m *manager) close() {
|
||||
}
|
||||
|
||||
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
|
||||
m.wg.Add(1)
|
||||
id := g.GetID()
|
||||
g.Init()
|
||||
m.wg.Go(func() {
|
||||
go func() {
|
||||
defer m.wg.Done()
|
||||
if restore {
|
||||
g.Start(ctx, m.rw, m.rr)
|
||||
g.Start(ctx, m.notifiers, m.rw, m.rr)
|
||||
} else {
|
||||
g.Start(ctx, m.rw, nil)
|
||||
g.Start(ctx, m.notifiers, m.rw, nil)
|
||||
}
|
||||
})
|
||||
|
||||
}()
|
||||
m.groups[id] = g
|
||||
return nil
|
||||
}
|
||||
@@ -129,7 +131,7 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
if rrPresent && m.rw == nil {
|
||||
return fmt.Errorf("config contains recording rules but `-remoteWrite.url` isn't set")
|
||||
}
|
||||
if arPresent && notifier.GetTargets() == nil {
|
||||
if arPresent && m.notifiers == nil {
|
||||
return fmt.Errorf("config contains alerting rules but neither `-notifier.url` nor `-notifier.config` nor `-notifier.blackhole` aren't set")
|
||||
}
|
||||
|
||||
@@ -166,15 +168,15 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
if len(toUpdate) > 0 {
|
||||
var wg sync.WaitGroup
|
||||
for _, item := range toUpdate {
|
||||
oldG := item.old
|
||||
newG := item.new
|
||||
wg.Go(func() {
|
||||
// cancel evaluation so the Update will be applied as fast as possible.
|
||||
// it is important to call InterruptEval before the update, because cancel fn
|
||||
// can be re-assigned during the update.
|
||||
oldG.InterruptEval()
|
||||
oldG.UpdateWith(newG)
|
||||
})
|
||||
wg.Add(1)
|
||||
// cancel evaluation so the Update will be applied as fast as possible.
|
||||
// it is important to call InterruptEval before the update, because cancel fn
|
||||
// can be re-assigned during the update.
|
||||
item.old.InterruptEval()
|
||||
go func(oldGroup *rule.Group, newGroup *rule.Group) {
|
||||
oldGroup.UpdateWith(newGroup)
|
||||
wg.Done()
|
||||
}(item.old, item.new)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
@@ -40,11 +40,10 @@ func TestManagerEmptyRulesDir(t *testing.T) {
|
||||
// execution of configuration update.
|
||||
// Should be executed with -race flag
|
||||
func TestManagerUpdateConcurrent(t *testing.T) {
|
||||
_, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
|
||||
}
|
||||
paths := []string{
|
||||
"config/testdata/dir/rules0-good.rules",
|
||||
@@ -128,9 +127,8 @@ func TestManagerUpdate_Success(t *testing.T) {
|
||||
m := &manager{
|
||||
groups: make(map[uint64]*rule.Group),
|
||||
querierBuilder: &datasource.FakeQuerier{},
|
||||
notifiers: func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} },
|
||||
}
|
||||
_, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
|
||||
cfgInit := loadCfg(t, []string{initPath}, true, true)
|
||||
if err := m.update(ctx, cfgInit, false); err != nil {
|
||||
@@ -279,8 +277,7 @@ func TestManagerUpdate_Failure(t *testing.T) {
|
||||
rw: rw,
|
||||
}
|
||||
if notifiers != nil {
|
||||
_, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
m.notifiers = func() []notifier.Notifier { return notifiers }
|
||||
}
|
||||
err := m.update(context.Background(), []config.Group{cfg}, false)
|
||||
if err == nil {
|
||||
|
||||
@@ -20,7 +20,7 @@ func TestAlertExecTemplate(t *testing.T) {
|
||||
)
|
||||
extLabels["cluster"] = extCluster
|
||||
extLabels["dc"] = extDC
|
||||
err := Init(extLabels, extURL)
|
||||
_, err := Init(extLabels, extURL)
|
||||
checkErr(t, err)
|
||||
|
||||
f := func(alert *Alert, annotations map[string]string, tplExpected map[string]string) {
|
||||
|
||||
@@ -77,13 +77,10 @@ func (am *AlertManager) LastError() string {
|
||||
}
|
||||
|
||||
// Send an alert or resolve message
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert, alertLabels [][]prompb.Label, headers map[string]string) error {
|
||||
if len(alerts) != len(alertLabels) {
|
||||
return fmt.Errorf("mismatched number of alerts and label sets after global alert relabeling")
|
||||
}
|
||||
func (am *AlertManager) Send(ctx context.Context, alerts []Alert, headers map[string]string) error {
|
||||
am.metrics.alertsSent.Add(len(alerts))
|
||||
startTime := time.Now()
|
||||
err := am.send(ctx, alerts, alertLabels, headers)
|
||||
err := am.send(ctx, alerts, headers)
|
||||
am.metrics.alertsSendDuration.UpdateDuration(startTime)
|
||||
if err != nil {
|
||||
am.metrics.alertsSendErrors.Add(len(alerts))
|
||||
@@ -94,15 +91,12 @@ func (am *AlertManager) Send(ctx context.Context, alerts []Alert, alertLabels []
|
||||
return err
|
||||
}
|
||||
|
||||
func (am *AlertManager) send(ctx context.Context, alerts []Alert, alertLabels [][]prompb.Label, headers map[string]string) error {
|
||||
func (am *AlertManager) send(ctx context.Context, alerts []Alert, headers map[string]string) error {
|
||||
b := &bytes.Buffer{}
|
||||
alertsToSend := make([]Alert, 0, len(alerts))
|
||||
lblss := make([][]prompb.Label, 0, len(alerts))
|
||||
for i, a := range alerts {
|
||||
lbls := alertLabels[i]
|
||||
if am.relabelConfigs != nil {
|
||||
lbls = am.relabelConfigs.Apply(lbls, 0)
|
||||
}
|
||||
for _, a := range alerts {
|
||||
lbls := a.applyRelabelingIfNeeded(am.relabelConfigs)
|
||||
if len(lbls) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
)
|
||||
|
||||
@@ -146,11 +145,11 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
t.Fatalf("unexpected error: %s", err)
|
||||
}
|
||||
|
||||
if err := am.Send(context.Background(), []Alert{{Labels: map[string]string{"a": "b"}}}, [][]prompb.Label{{{Name: "a", Value: "b"}}}, nil); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{{Labels: map[string]string{"a": "b"}}}, nil); err == nil {
|
||||
t.Fatalf("expected connection error got nil")
|
||||
}
|
||||
|
||||
if err := am.Send(context.Background(), []Alert{{Labels: map[string]string{"a": "b"}}}, [][]prompb.Label{{{Name: "a", Value: "b"}}}, nil); err == nil {
|
||||
if err := am.Send(context.Background(), []Alert{{Labels: map[string]string{"a": "b"}}}, nil); err == nil {
|
||||
t.Fatalf("expected wrong http code error got nil")
|
||||
}
|
||||
|
||||
@@ -161,7 +160,7 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
End: time.Now().UTC(),
|
||||
Labels: map[string]string{"alertname": "alert0"},
|
||||
Annotations: map[string]string{"a": "b", "c": "d"},
|
||||
}}, [][]prompb.Label{{{Name: "alertname", Value: "alert0"}}}, map[string]string{headerKey: "bar"}); err != nil {
|
||||
}}, map[string]string{headerKey: "bar"}); err != nil {
|
||||
t.Fatalf("unexpected error %s", err)
|
||||
}
|
||||
|
||||
@@ -175,7 +174,7 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
Name: "alert2",
|
||||
Labels: map[string]string{"rule": "test", "tenant": "1"},
|
||||
},
|
||||
}, [][]prompb.Label{{{Name: "rule", Value: "test"}, {Name: "tenant", Value: "0"}}, {{Name: "rule", Value: "test"}, {Name: "tenant", Value: "1"}}}, map[string]string{headerKey: "bar"}); err != nil {
|
||||
}, map[string]string{headerKey: "bar"}); err != nil {
|
||||
t.Fatalf("unexpected error %s", err)
|
||||
}
|
||||
|
||||
@@ -188,7 +187,7 @@ func TestAlertManager_Send(t *testing.T) {
|
||||
Name: "alert2",
|
||||
Labels: map[string]string{},
|
||||
},
|
||||
}, [][]prompb.Label{{{Name: "rule", Value: "test"}}, {{}}}, map[string]string{}); err != nil {
|
||||
}, map[string]string{}); err != nil {
|
||||
t.Fatalf("unexpected error %s", err)
|
||||
}
|
||||
|
||||
|
||||
@@ -27,9 +27,15 @@ type Config struct {
|
||||
// PathPrefix is added to URL path before adding alertManagerPath value
|
||||
PathPrefix string `yaml:"path_prefix,omitempty"`
|
||||
|
||||
ConsulSDConfigs []ConsulSDConfigs `yaml:"consul_sd_configs,omitempty"`
|
||||
DNSSDConfigs []DNSSDConfigs `yaml:"dns_sd_configs,omitempty"`
|
||||
StaticConfigs []StaticConfig `yaml:"static_configs,omitempty"`
|
||||
// ConsulSDConfigs contains list of settings for service discovery via Consul
|
||||
// see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config
|
||||
ConsulSDConfigs []consul.SDConfig `yaml:"consul_sd_configs,omitempty"`
|
||||
// DNSSDConfigs contains list of settings for service discovery via DNS.
|
||||
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config
|
||||
DNSSDConfigs []dns.SDConfig `yaml:"dns_sd_configs,omitempty"`
|
||||
|
||||
// StaticConfigs contains list of static targets
|
||||
StaticConfigs []StaticConfig `yaml:"static_configs,omitempty"`
|
||||
|
||||
// HTTPClientConfig contains HTTP configuration for Notifier clients
|
||||
HTTPClientConfig promauth.HTTPClientConfig `yaml:",inline"`
|
||||
@@ -56,29 +62,14 @@ type Config struct {
|
||||
parsedAlertRelabelConfigs *promrelabel.ParsedConfigs
|
||||
}
|
||||
|
||||
// staticConfig contains list of static targets in the following form:
|
||||
// StaticConfig contains list of static targets in the following form:
|
||||
//
|
||||
// targets:
|
||||
// [ - '<host>' ]
|
||||
type StaticConfig struct {
|
||||
Targets []string `yaml:"targets"`
|
||||
// HTTPClientConfig contains HTTP configuration for the Targets
|
||||
HTTPClientConfig promauth.HTTPClientConfig `yaml:",inline"`
|
||||
AlertRelabelConfigs []promrelabel.RelabelConfig `yaml:"alert_relabel_configs,omitempty"`
|
||||
}
|
||||
|
||||
// ConsulSDConfigs contains list of settings for service discovery via Consul,
|
||||
// see https://prometheus.io/docs/prometheus/latest/configuration/configuration/#consul_sd_config
|
||||
type ConsulSDConfigs struct {
|
||||
consul.SDConfig `yaml:",inline"`
|
||||
AlertRelabelConfigs []promrelabel.RelabelConfig `yaml:"alert_relabel_configs,omitempty"`
|
||||
}
|
||||
|
||||
// DNSSDConfigs contains list of settings for service discovery via DNS,
|
||||
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#dns_sd_config
|
||||
type DNSSDConfigs struct {
|
||||
dns.SDConfig `yaml:",inline"`
|
||||
AlertRelabelConfigs []promrelabel.RelabelConfig `yaml:"alert_relabel_configs,omitempty"`
|
||||
HTTPClientConfig promauth.HTTPClientConfig `yaml:",inline"`
|
||||
}
|
||||
|
||||
// UnmarshalYAML implements the yaml.Unmarshaler interface.
|
||||
@@ -104,31 +95,6 @@ func (cfg *Config) UnmarshalYAML(unmarshal func(any) error) error {
|
||||
}
|
||||
cfg.parsedAlertRelabelConfigs = arCfg
|
||||
|
||||
for _, s := range cfg.StaticConfigs {
|
||||
if len(s.AlertRelabelConfigs) > 0 {
|
||||
_, err := promrelabel.ParseRelabelConfigs(s.AlertRelabelConfigs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse alert_relabel_configs in static_config: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, s := range cfg.ConsulSDConfigs {
|
||||
if len(s.AlertRelabelConfigs) > 0 {
|
||||
_, err := promrelabel.ParseRelabelConfigs(s.AlertRelabelConfigs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse alert_relabel_configs in consul_sd_config: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, s := range cfg.DNSSDConfigs {
|
||||
if len(s.AlertRelabelConfigs) > 0 {
|
||||
_, err := promrelabel.ParseRelabelConfigs(s.AlertRelabelConfigs)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse alert_relabel_configs in dns_sd_config: %w", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
b, err := yaml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal configuration for checksum: %w", err)
|
||||
|
||||
@@ -35,6 +35,4 @@ func TestParseConfig_Failure(t *testing.T) {
|
||||
|
||||
f("testdata/unknownFields.bad.yaml", "unknown field")
|
||||
f("non-existing-file", "error reading")
|
||||
f("testdata/consul.bad.yaml", "failed to parse alert_relabel_configs in consul_sd_config")
|
||||
f("testdata/dns.bad.yaml", "failed to parse alert relabeling config")
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/consul"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discovery/dns"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutil"
|
||||
@@ -29,7 +28,11 @@ type configWatcher struct {
|
||||
targets map[TargetType][]Target
|
||||
}
|
||||
|
||||
func newWatcher(cfg *Config, gen AlertURLGenerator) (*configWatcher, error) {
|
||||
func newWatcher(path string, gen AlertURLGenerator) (*configWatcher, error) {
|
||||
cfg, err := parseConfig(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
cw := &configWatcher{
|
||||
cfg: cfg,
|
||||
wg: sync.WaitGroup{},
|
||||
@@ -85,15 +88,18 @@ func (cw *configWatcher) reload(path string) error {
|
||||
return cw.start()
|
||||
}
|
||||
|
||||
func (cw *configWatcher) add(typeK TargetType, interval time.Duration, targetsFn getTargets) error {
|
||||
targetMetadata, errors := getTargetMetadata(targetsFn, cw.cfg)
|
||||
func (cw *configWatcher) add(typeK TargetType, interval time.Duration, labelsFn getLabels) error {
|
||||
targetMetadata, errors := getTargetMetadata(labelsFn, cw.cfg)
|
||||
for _, err := range errors {
|
||||
return fmt.Errorf("failed to init notifier for %q: %w", typeK, err)
|
||||
}
|
||||
|
||||
cw.updateTargets(typeK, targetMetadata, cw.cfg, cw.genFn)
|
||||
|
||||
cw.wg.Go(func() {
|
||||
cw.wg.Add(1)
|
||||
go func() {
|
||||
defer cw.wg.Done()
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
@@ -103,77 +109,62 @@ func (cw *configWatcher) add(typeK TargetType, interval time.Duration, targetsFn
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
targetMetadata, errors := getTargetMetadata(targetsFn, cw.cfg)
|
||||
targetMetadata, errors := getTargetMetadata(labelsFn, cw.cfg)
|
||||
for _, err := range errors {
|
||||
logger.Errorf("failed to init notifier for %q: %w", typeK, err)
|
||||
}
|
||||
cw.updateTargets(typeK, targetMetadata, cw.cfg, cw.genFn)
|
||||
}
|
||||
})
|
||||
}()
|
||||
return nil
|
||||
}
|
||||
|
||||
type targetMetadata struct {
|
||||
*promutil.Labels
|
||||
alertRelabelConfigs *promrelabel.ParsedConfigs
|
||||
}
|
||||
|
||||
func getTargetMetadata(targetsFn getTargets, cfg *Config) (map[string]targetMetadata, []error) {
|
||||
metaLabelsList, alertRelabelCfgs, err := targetsFn()
|
||||
func getTargetMetadata(labelsFn getLabels, cfg *Config) (map[string]*promutil.Labels, []error) {
|
||||
metaLabels, err := labelsFn()
|
||||
if err != nil {
|
||||
return nil, []error{fmt.Errorf("failed to get labels: %w", err)}
|
||||
}
|
||||
targetMts := make(map[string]targetMetadata, len(metaLabelsList))
|
||||
targetMetadata := make(map[string]*promutil.Labels, len(metaLabels))
|
||||
var errors []error
|
||||
duplicates := make(map[string]struct{})
|
||||
for i := range metaLabelsList {
|
||||
metaLabels := metaLabelsList[i]
|
||||
alertRelabelCfg := alertRelabelCfgs[i]
|
||||
for _, labels := range metaLabels {
|
||||
target := labels.Get("__address__")
|
||||
u, processedLabels, err := parseLabels(target, labels, cfg)
|
||||
if err != nil {
|
||||
errors = append(errors, err)
|
||||
continue
|
||||
}
|
||||
if len(u) == 0 {
|
||||
continue
|
||||
}
|
||||
// check for duplicated targets
|
||||
// targets with same address but different alert_relabel_configs are still considered duplicates since it's mostly due to misconfiguration and could cause duplicated notifications.
|
||||
if _, ok := duplicates[u]; ok {
|
||||
if !*suppressDuplicateTargetErrors {
|
||||
logger.Errorf("skipping duplicate target with identical address %q; "+
|
||||
"make sure service discovery and relabeling is set up properly; "+
|
||||
"original labels: %s; resulting labels: %s",
|
||||
u, labels, processedLabels)
|
||||
}
|
||||
continue
|
||||
}
|
||||
duplicates[u] = struct{}{}
|
||||
targetMts[u] = targetMetadata{
|
||||
Labels: processedLabels,
|
||||
alertRelabelConfigs: alertRelabelCfg,
|
||||
}
|
||||
for _, labels := range metaLabels {
|
||||
target := labels.Get("__address__")
|
||||
u, processedLabels, err := parseLabels(target, labels, cfg)
|
||||
if err != nil {
|
||||
errors = append(errors, err)
|
||||
continue
|
||||
}
|
||||
if len(u) == 0 {
|
||||
continue
|
||||
}
|
||||
if _, ok := duplicates[u]; ok { // check for duplicates
|
||||
if !*suppressDuplicateTargetErrors {
|
||||
logger.Errorf("skipping duplicate target with identical address %q; "+
|
||||
"make sure service discovery and relabeling is set up properly; "+
|
||||
"original labels: %s; resulting labels: %s",
|
||||
u, labels, processedLabels)
|
||||
}
|
||||
continue
|
||||
}
|
||||
duplicates[u] = struct{}{}
|
||||
targetMetadata[u] = processedLabels
|
||||
}
|
||||
return targetMts, errors
|
||||
return targetMetadata, errors
|
||||
}
|
||||
|
||||
type getTargets func() ([][]*promutil.Labels, []*promrelabel.ParsedConfigs, error)
|
||||
type getLabels func() ([]*promutil.Labels, error)
|
||||
|
||||
func (cw *configWatcher) start() error {
|
||||
if len(cw.cfg.StaticConfigs) > 0 {
|
||||
var targets []Target
|
||||
for i, cfg := range cw.cfg.StaticConfigs {
|
||||
alertRelabelConfig, _ := promrelabel.ParseRelabelConfigs(cw.cfg.StaticConfigs[i].AlertRelabelConfigs)
|
||||
for _, cfg := range cw.cfg.StaticConfigs {
|
||||
httpCfg := mergeHTTPClientConfigs(cw.cfg.HTTPClientConfig, cfg.HTTPClientConfig)
|
||||
for _, target := range cfg.Targets {
|
||||
address, labels, err := parseLabels(target, nil, cw.cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse labels for target %q: %w", target, err)
|
||||
}
|
||||
notifier, err := NewAlertManager(address, cw.genFn, httpCfg, alertRelabelConfig, cw.cfg.Timeout.Duration())
|
||||
notifier, err := NewAlertManager(address, cw.genFn, httpCfg, cw.cfg.parsedAlertRelabelConfigs, cw.cfg.Timeout.Duration())
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init alertmanager for addr %q: %w", address, err)
|
||||
}
|
||||
@@ -187,20 +178,17 @@ func (cw *configWatcher) start() error {
|
||||
}
|
||||
|
||||
if len(cw.cfg.ConsulSDConfigs) > 0 {
|
||||
err := cw.add(TargetConsul, *consul.SDCheckInterval, func() ([][]*promutil.Labels, []*promrelabel.ParsedConfigs, error) {
|
||||
var labels [][]*promutil.Labels
|
||||
var alertRelabelConfigs []*promrelabel.ParsedConfigs
|
||||
err := cw.add(TargetConsul, *consul.SDCheckInterval, func() ([]*promutil.Labels, error) {
|
||||
var labels []*promutil.Labels
|
||||
for i := range cw.cfg.ConsulSDConfigs {
|
||||
alertRelabelConfig, _ := promrelabel.ParseRelabelConfigs(cw.cfg.ConsulSDConfigs[i].AlertRelabelConfigs)
|
||||
sdc := &cw.cfg.ConsulSDConfigs[i]
|
||||
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("got labels err: %w", err)
|
||||
return nil, fmt.Errorf("got labels err: %w", err)
|
||||
}
|
||||
labels = append(labels, targetLabels)
|
||||
alertRelabelConfigs = append(alertRelabelConfigs, alertRelabelConfig)
|
||||
labels = append(labels, targetLabels...)
|
||||
}
|
||||
return labels, alertRelabelConfigs, nil
|
||||
return labels, nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start consulSD discovery: %w", err)
|
||||
@@ -208,21 +196,17 @@ func (cw *configWatcher) start() error {
|
||||
}
|
||||
|
||||
if len(cw.cfg.DNSSDConfigs) > 0 {
|
||||
err := cw.add(TargetDNS, *dns.SDCheckInterval, func() ([][]*promutil.Labels, []*promrelabel.ParsedConfigs, error) {
|
||||
var labels [][]*promutil.Labels
|
||||
var alertRelabelConfigs []*promrelabel.ParsedConfigs
|
||||
err := cw.add(TargetDNS, *dns.SDCheckInterval, func() ([]*promutil.Labels, error) {
|
||||
var labels []*promutil.Labels
|
||||
for i := range cw.cfg.DNSSDConfigs {
|
||||
alertRelabelConfig, _ := promrelabel.ParseRelabelConfigs(cw.cfg.DNSSDConfigs[i].AlertRelabelConfigs)
|
||||
sdc := &cw.cfg.DNSSDConfigs[i]
|
||||
targetLabels, err := sdc.GetLabels(cw.cfg.baseDir)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("got labels err: %w", err)
|
||||
return nil, fmt.Errorf("got labels err: %w", err)
|
||||
}
|
||||
labels = append(labels, targetLabels)
|
||||
alertRelabelConfigs = append(alertRelabelConfigs, alertRelabelConfig)
|
||||
|
||||
labels = append(labels, targetLabels...)
|
||||
}
|
||||
return labels, alertRelabelConfigs, nil
|
||||
return labels, nil
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to start DNSSD discovery: %w", err)
|
||||
@@ -256,30 +240,30 @@ func (cw *configWatcher) setTargets(key TargetType, targets []Target) {
|
||||
cw.targetsMu.Unlock()
|
||||
}
|
||||
|
||||
func (cw *configWatcher) updateTargets(key TargetType, targetMts map[string]targetMetadata, cfg *Config, genFn AlertURLGenerator) {
|
||||
func (cw *configWatcher) updateTargets(key TargetType, targetMetadata map[string]*promutil.Labels, cfg *Config, genFn AlertURLGenerator) {
|
||||
cw.targetsMu.Lock()
|
||||
defer cw.targetsMu.Unlock()
|
||||
oldTargets := cw.targets[key]
|
||||
var updatedTargets []Target
|
||||
for _, ot := range oldTargets {
|
||||
if _, ok := targetMts[ot.Addr()]; !ok {
|
||||
if _, ok := targetMetadata[ot.Addr()]; !ok {
|
||||
// if target not exists in currentTargets, close it
|
||||
ot.Close()
|
||||
} else {
|
||||
updatedTargets = append(updatedTargets, ot)
|
||||
delete(targetMts, ot.Addr())
|
||||
delete(targetMetadata, ot.Addr())
|
||||
}
|
||||
}
|
||||
// create new resources for the new targets
|
||||
for addr, metadata := range targetMts {
|
||||
am, err := NewAlertManager(addr, genFn, cfg.HTTPClientConfig, metadata.alertRelabelConfigs, cfg.Timeout.Duration())
|
||||
for addr, labels := range targetMetadata {
|
||||
am, err := NewAlertManager(addr, genFn, cfg.HTTPClientConfig, cfg.parsedAlertRelabelConfigs, cfg.Timeout.Duration())
|
||||
if err != nil {
|
||||
logger.Errorf("failed to init %s notifier with addr %q: %w", key, addr, err)
|
||||
continue
|
||||
}
|
||||
updatedTargets = append(updatedTargets, Target{
|
||||
Notifier: am,
|
||||
Labels: metadata.Labels,
|
||||
Labels: labels,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ import (
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -29,11 +28,7 @@ static_configs:
|
||||
- localhost:9093
|
||||
- localhost:9094
|
||||
`)
|
||||
cfg, err := parseConfig(f.Name())
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse config: %s", err)
|
||||
}
|
||||
cw, err := newWatcher(cfg, nil)
|
||||
cw, err := newWatcher(f.Name(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
@@ -88,64 +83,33 @@ consul_sd_configs:
|
||||
- server: %s
|
||||
services:
|
||||
- alertmanager
|
||||
- server: %s
|
||||
services:
|
||||
- alertmanager
|
||||
alert_relabel_configs:
|
||||
- target_label: "foo"
|
||||
replacement: "tar"
|
||||
`, consulSDServer.URL, consulSDServer.URL))
|
||||
`, consulSDServer.URL))
|
||||
|
||||
cfg, err := parseConfig(consulSDFile.Name())
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse config: %s", err)
|
||||
}
|
||||
cw, err := newWatcher(cfg, nil)
|
||||
cw, err := newWatcher(consulSDFile.Name(), nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
defer cw.mustStop()
|
||||
|
||||
if len(cw.notifiers()) != 3 {
|
||||
t.Fatalf("expected to get 3 notifiers; got %d", len(cw.notifiers()))
|
||||
if len(cw.notifiers()) != 2 {
|
||||
t.Fatalf("expected to get 2 notifiers; got %d", len(cw.notifiers()))
|
||||
}
|
||||
|
||||
expAddr1 := fmt.Sprintf("https://%s/proxy/api/v2/alerts", fakeConsulService1)
|
||||
expAddr2 := fmt.Sprintf("https://%s/proxy/api/v2/alerts", fakeConsulService2)
|
||||
expAddr3 := fmt.Sprintf("https://%s/proxy/api/v2/alerts", fakeConsulService3)
|
||||
|
||||
n1, n2, n3 := cw.notifiers()[0], cw.notifiers()[1], cw.notifiers()[2]
|
||||
n1, n2 := cw.notifiers()[0], cw.notifiers()[1]
|
||||
if n1.Addr() != expAddr1 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr1, n1.Addr())
|
||||
}
|
||||
if n2.Addr() != expAddr2 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr2, n2.Addr())
|
||||
}
|
||||
if n3.Addr() != expAddr3 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr3, n3.Addr())
|
||||
}
|
||||
|
||||
if n1.(*AlertManager).relabelConfigs.String() != "" {
|
||||
t.Fatalf("unexpected relabel configs: %q", n1.(*AlertManager).relabelConfigs.String())
|
||||
}
|
||||
if n2.(*AlertManager).relabelConfigs.String() != "" {
|
||||
t.Fatalf("unexpected relabel configs: %q", n2.(*AlertManager).relabelConfigs.String())
|
||||
}
|
||||
if n3.(*AlertManager).relabelConfigs.String() != "- target_label: foo\n replacement: tar\n" {
|
||||
t.Fatalf("unexpected relabel configs: %q", n3.(*AlertManager).relabelConfigs.String())
|
||||
}
|
||||
|
||||
f := func() bool { return len(cw.notifiers()) == 1 }
|
||||
if !waitFor(f, time.Second) {
|
||||
t.Fatalf("expected to get 1 notifiers; got %d", len(cw.notifiers()))
|
||||
}
|
||||
n3 = cw.notifiers()[0]
|
||||
if n3.Addr() != expAddr3 {
|
||||
t.Fatalf("exp address %q; got %q", expAddr3, n3.Addr())
|
||||
}
|
||||
if n3.(*AlertManager).relabelConfigs.String() != "- target_label: foo\n replacement: tar\n" {
|
||||
t.Fatalf("unexpected relabel configs: %q", n3.(*AlertManager).relabelConfigs.String())
|
||||
}
|
||||
}
|
||||
|
||||
// TestConfigWatcherReloadConcurrent is supposed to test concurrent
|
||||
@@ -200,11 +164,7 @@ consul_sd_configs:
|
||||
"unknownFields.bad.yaml",
|
||||
}
|
||||
|
||||
cfg, err := parseConfig(paths[0])
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse config: %s", err)
|
||||
}
|
||||
cw, err := newWatcher(cfg, nil)
|
||||
cw, err := newWatcher(paths[0], nil)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to start config watcher: %s", err)
|
||||
}
|
||||
@@ -242,11 +202,10 @@ func checkErr(t *testing.T, err error) {
|
||||
const (
|
||||
fakeConsulService1 = "127.0.0.1:9093"
|
||||
fakeConsulService2 = "127.0.0.1:9095"
|
||||
fakeConsulService3 = "127.0.0.1:9097"
|
||||
)
|
||||
|
||||
func newFakeConsulServer() *httptest.Server {
|
||||
var requestCount atomic.Int32
|
||||
requestCount := 0
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/v1/agent/self", func(rw http.ResponseWriter, _ *http.Request) {
|
||||
rw.Write([]byte(`{"Config": {"Datacenter": "dc1"}}`))
|
||||
@@ -261,7 +220,7 @@ func newFakeConsulServer() *httptest.Server {
|
||||
}`))
|
||||
})
|
||||
mux.HandleFunc("/v1/health/service/alertmanager", func(rw http.ResponseWriter, _ *http.Request) {
|
||||
if requestCount.Load() == 0 {
|
||||
if requestCount == 0 {
|
||||
rw.Header().Set("X-Consul-Index", "1")
|
||||
rw.Write([]byte(`
|
||||
[
|
||||
@@ -401,7 +360,7 @@ func newFakeConsulServer() *httptest.Server {
|
||||
}
|
||||
]`))
|
||||
}
|
||||
requestCount.Add(1)
|
||||
requestCount++
|
||||
})
|
||||
|
||||
return httptest.NewServer(mux)
|
||||
|
||||
@@ -5,8 +5,6 @@ import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
)
|
||||
|
||||
// FakeNotifier is a mock notifier
|
||||
@@ -17,19 +15,6 @@ type FakeNotifier struct {
|
||||
counter int
|
||||
}
|
||||
|
||||
// InitFakeNotifier initializes global notifier to FakeNotifier,
|
||||
// and returns a cleanup function to restore the original getActiveNotifiers.
|
||||
func InitFakeNotifier() (*FakeNotifier, func()) {
|
||||
originalGetActiveNotifiers := getActiveNotifiers
|
||||
fn := &FakeNotifier{}
|
||||
getActiveNotifiers = func() []Notifier {
|
||||
return []Notifier{fn}
|
||||
}
|
||||
return fn, func() {
|
||||
getActiveNotifiers = originalGetActiveNotifiers
|
||||
}
|
||||
}
|
||||
|
||||
// Close does nothing
|
||||
func (*FakeNotifier) Close() {}
|
||||
|
||||
@@ -42,7 +27,7 @@ func (*FakeNotifier) LastError() string {
|
||||
func (*FakeNotifier) Addr() string { return "" }
|
||||
|
||||
// Send sets alerts and increases counter
|
||||
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ [][]prompb.Label, _ map[string]string) error {
|
||||
func (fn *FakeNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error {
|
||||
fn.Lock()
|
||||
defer fn.Unlock()
|
||||
fn.counter += len(alerts)
|
||||
|
||||
@@ -1,22 +1,17 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/vmalertutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutil"
|
||||
)
|
||||
|
||||
@@ -101,25 +96,11 @@ func InitAlertURLGeneratorFn(externalURL *url.URL, externalAlertSource string, v
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
// getActiveNotifiers returns the current list of Notifier objects.
|
||||
getActiveNotifiers func() []Notifier
|
||||
// globalRelabelCfg stores the parsed alert relabeling config from the config file, if any
|
||||
globalRelabelCfg *promrelabel.ParsedConfigs
|
||||
|
||||
// cw holds a configWatcher for configPath configuration file
|
||||
// configWatcher provides a list of Notifier objects discovered
|
||||
// from static config or via service discovery.
|
||||
// cw is not nil only if configPath is provided.
|
||||
cw *configWatcher
|
||||
|
||||
// externalLabels is a global variable for holding external labels configured via flags
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalLabels map[string]string
|
||||
// externalURL is a global variable for holding external URL value configured via flag
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalURL string
|
||||
)
|
||||
// cw holds a configWatcher for configPath configuration file
|
||||
// configWatcher provides a list of Notifier objects discovered
|
||||
// from static config or via service discovery.
|
||||
// cw is not nil only if configPath is provided.
|
||||
var cw *configWatcher
|
||||
|
||||
// Reload checks the changes in configPath configuration file
|
||||
// and applies changes if any.
|
||||
@@ -130,62 +111,66 @@ func Reload() error {
|
||||
return cw.reload(*configPath)
|
||||
}
|
||||
|
||||
var staticNotifiersFn func() []Notifier
|
||||
|
||||
var (
|
||||
// externalLabels is a global variable for holding external labels configured via flags
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalLabels map[string]string
|
||||
// externalURL is a global variable for holding external URL value configured via flag
|
||||
// It is supposed to be inited via Init function only.
|
||||
externalURL string
|
||||
)
|
||||
|
||||
// Init returns a function for retrieving actual list of Notifier objects.
|
||||
// Init works in two modes:
|
||||
// - configuration via flags (for backward compatibility). Is always static
|
||||
// and doesn't support live reloads.
|
||||
// - configuration via file. Supports live reloads and service discovery.
|
||||
//
|
||||
// Init returns an error if both modes are used.
|
||||
func Init(extLabels map[string]string, extURL string) error {
|
||||
func Init(extLabels map[string]string, extURL string) (func() []Notifier, error) {
|
||||
externalURL = extURL
|
||||
externalLabels = extLabels
|
||||
_, err := url.Parse(externalURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse external URL: %w", err)
|
||||
return nil, fmt.Errorf("failed to parse external URL: %w", err)
|
||||
}
|
||||
|
||||
if *blackHole {
|
||||
if len(*addrs) > 0 || *configPath != "" {
|
||||
return fmt.Errorf("only one of -notifier.blackhole, -notifier.url and -notifier.config flags must be specified")
|
||||
return nil, fmt.Errorf("only one of -notifier.blackhole, -notifier.url and -notifier.config flags must be specified")
|
||||
}
|
||||
notifier := newBlackHoleNotifier()
|
||||
getActiveNotifiers = func() []Notifier {
|
||||
staticNotifiersFn = func() []Notifier {
|
||||
return []Notifier{notifier}
|
||||
}
|
||||
return nil
|
||||
return staticNotifiersFn, nil
|
||||
}
|
||||
|
||||
if *configPath == "" && len(*addrs) == 0 {
|
||||
return nil
|
||||
return nil, nil
|
||||
}
|
||||
if *configPath != "" && len(*addrs) > 0 {
|
||||
return fmt.Errorf("only one of -notifier.config or -notifier.url flags must be specified")
|
||||
return nil, fmt.Errorf("only one of -notifier.config or -notifier.url flags must be specified")
|
||||
}
|
||||
|
||||
if len(*addrs) > 0 {
|
||||
notifiers, err := notifiersFromFlags(AlertURLGeneratorFn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create notifier from flag values: %w", err)
|
||||
return nil, fmt.Errorf("failed to create notifier from flag values: %w", err)
|
||||
}
|
||||
getActiveNotifiers = func() []Notifier {
|
||||
staticNotifiersFn = func() []Notifier {
|
||||
return notifiers
|
||||
}
|
||||
return nil
|
||||
return staticNotifiersFn, nil
|
||||
}
|
||||
|
||||
cfg, err := parseConfig(*configPath)
|
||||
cw, err = newWatcher(*configPath, AlertURLGeneratorFn)
|
||||
if err != nil {
|
||||
return err
|
||||
return nil, fmt.Errorf("failed to init config watcher: %w", err)
|
||||
}
|
||||
if cfg.AlertRelabelConfigs != nil {
|
||||
globalRelabelCfg = cfg.parsedAlertRelabelConfigs
|
||||
}
|
||||
cw, err = newWatcher(cfg, AlertURLGeneratorFn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to init config watcher: %w", err)
|
||||
}
|
||||
getActiveNotifiers = cw.notifiers
|
||||
return nil
|
||||
return cw.notifiers, nil
|
||||
}
|
||||
|
||||
// InitSecretFlags must be called after flag.Parse and before any logging
|
||||
@@ -260,57 +245,23 @@ const (
|
||||
|
||||
// GetTargets returns list of static or discovered targets
|
||||
// via notifier configuration.
|
||||
//
|
||||
// Must be called after Init.
|
||||
func GetTargets() map[TargetType][]Target {
|
||||
if getActiveNotifiers == nil {
|
||||
return nil
|
||||
}
|
||||
var targets = make(map[TargetType][]Target)
|
||||
// use cached targets from configWatcher instead of getActiveNotifiers for the extra target labels
|
||||
|
||||
if staticNotifiersFn != nil {
|
||||
for _, ns := range staticNotifiersFn() {
|
||||
targets[TargetStatic] = append(targets[TargetStatic], Target{
|
||||
Notifier: ns,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if cw != nil {
|
||||
cw.targetsMu.RLock()
|
||||
for key, ns := range cw.targets {
|
||||
targets[key] = append(targets[key], ns...)
|
||||
}
|
||||
cw.targetsMu.RUnlock()
|
||||
return targets
|
||||
}
|
||||
|
||||
// static notifiers don't have labels
|
||||
for _, ns := range getActiveNotifiers() {
|
||||
targets[TargetStatic] = append(targets[TargetStatic], Target{
|
||||
Notifier: ns,
|
||||
})
|
||||
}
|
||||
return targets
|
||||
}
|
||||
|
||||
// Send sends alerts to all active notifiers
|
||||
func Send(ctx context.Context, alerts []Alert, notifierHeaders map[string]string) *vmalertutil.ErrGroup {
|
||||
alertsToSend := make([]Alert, 0, len(alerts))
|
||||
lblss := make([][]prompb.Label, 0, len(alerts))
|
||||
// apply global relabel config first without modifying original alerts in alerts
|
||||
for _, a := range alerts {
|
||||
lbls := a.applyRelabelingIfNeeded(globalRelabelCfg)
|
||||
if len(lbls) == 0 {
|
||||
continue
|
||||
}
|
||||
alertsToSend = append(alertsToSend, a)
|
||||
lblss = append(lblss, lbls)
|
||||
}
|
||||
|
||||
errGr := new(vmalertutil.ErrGroup)
|
||||
wg := sync.WaitGroup{}
|
||||
activeNotifiers := getActiveNotifiers()
|
||||
for i := range activeNotifiers {
|
||||
nt := activeNotifiers[i]
|
||||
wg.Go(func() {
|
||||
if err := nt.Send(ctx, alertsToSend, lblss, notifierHeaders); err != nil {
|
||||
errGr.Add(fmt.Errorf("failed to send alerts to addr %q: %w", nt.Addr(), err))
|
||||
}
|
||||
})
|
||||
}
|
||||
wg.Wait()
|
||||
return errGr
|
||||
}
|
||||
|
||||
@@ -1,17 +1,11 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"net/url"
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fs"
|
||||
)
|
||||
|
||||
func TestInit(t *testing.T) {
|
||||
@@ -20,13 +14,14 @@ func TestInit(t *testing.T) {
|
||||
|
||||
*addrs = flagutil.ArrayString{"127.0.0.1", "127.0.0.2"}
|
||||
|
||||
err := Init(nil, "")
|
||||
fn, err := Init(nil, "")
|
||||
if err != nil {
|
||||
t.Fatalf("%s", err)
|
||||
}
|
||||
|
||||
if len(getActiveNotifiers()) != 2 {
|
||||
t.Fatalf("expected to get 2 notifiers; got %d", len(getActiveNotifiers()))
|
||||
nfs := fn()
|
||||
if len(nfs) != 2 {
|
||||
t.Fatalf("expected to get 2 notifiers; got %d", len(nfs))
|
||||
}
|
||||
|
||||
targets := GetTargets()
|
||||
@@ -59,7 +54,7 @@ func TestInitNegative(t *testing.T) {
|
||||
*configPath = path
|
||||
*addrs = flagutil.ArrayString{addr}
|
||||
*blackHole = bh
|
||||
if err := Init(nil, ""); err == nil {
|
||||
if _, err := Init(nil, ""); err == nil {
|
||||
t.Fatalf("expected to get error; got nil instead")
|
||||
}
|
||||
}
|
||||
@@ -76,13 +71,14 @@ func TestBlackHole(t *testing.T) {
|
||||
|
||||
*blackHole = true
|
||||
|
||||
err := Init(nil, "")
|
||||
fn, err := Init(nil, "")
|
||||
if err != nil {
|
||||
t.Fatalf("%s", err)
|
||||
}
|
||||
|
||||
if len(getActiveNotifiers()) != 1 {
|
||||
t.Fatalf("expected to get 1 notifier; got %d", len(getActiveNotifiers()))
|
||||
nfs := fn()
|
||||
if len(nfs) != 1 {
|
||||
t.Fatalf("expected to get 1 notifier; got %d", len(nfs))
|
||||
}
|
||||
|
||||
targets := GetTargets()
|
||||
@@ -124,85 +120,3 @@ func TestGetAlertURLGenerator(t *testing.T) {
|
||||
t.Fatalf("unexpected url want %s, got %s", exp, AlertURLGeneratorFn(testAlert))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSendAlerts(t *testing.T) {
|
||||
oldAlertURLGeneratorFn := AlertURLGeneratorFn
|
||||
defer func() { AlertURLGeneratorFn = oldAlertURLGeneratorFn }()
|
||||
AlertURLGeneratorFn = func(alert Alert) string {
|
||||
return ""
|
||||
}
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/", func(_ http.ResponseWriter, _ *http.Request) {
|
||||
t.Fatalf("should not be called")
|
||||
})
|
||||
mux.HandleFunc(alertManagerPath, func(w http.ResponseWriter, r *http.Request) {
|
||||
var a []struct {
|
||||
Labels map[string]string `json:"labels"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&a); err != nil {
|
||||
t.Fatalf("can not unmarshal data into alert %s", err)
|
||||
}
|
||||
if len(a) != 2 {
|
||||
t.Fatalf("expected 2 alert in array got %d", len(a))
|
||||
}
|
||||
if len(a[0].Labels) != 4 {
|
||||
t.Fatalf("expected 4 labels got %d", len(a[0].Labels))
|
||||
}
|
||||
if a[0].Labels["env"] != "prod" {
|
||||
t.Fatalf("expected env label to be prod during relabeling, got %s", a[0].Labels["env"])
|
||||
}
|
||||
if a[0].Labels["c"] != "baz" {
|
||||
t.Fatalf("expected c label to be baz during relabeling, got %s", a[0].Labels["c"])
|
||||
}
|
||||
if len(a[1].Labels) != 1 {
|
||||
t.Fatalf("expected 1 labels got %d", len(a[1].Labels))
|
||||
}
|
||||
})
|
||||
srv := httptest.NewServer(mux)
|
||||
defer srv.Close()
|
||||
|
||||
f, err := os.CreateTemp("", "")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer fs.MustRemovePath(f.Name())
|
||||
|
||||
rawConfig := `
|
||||
static_configs:
|
||||
- targets:
|
||||
- %s
|
||||
alert_relabel_configs:
|
||||
- source_labels: [b]
|
||||
target_label: "c"
|
||||
alert_relabel_configs:
|
||||
- source_labels: [a]
|
||||
target_label: "b"
|
||||
- target_label: "env"
|
||||
replacement: "prod"
|
||||
`
|
||||
config := fmt.Sprintf(rawConfig, srv.URL+alertManagerPath)
|
||||
writeToFile(f.Name(), config)
|
||||
|
||||
oldConfigPath := configPath
|
||||
defer func() { configPath = oldConfigPath }()
|
||||
*configPath = f.Name()
|
||||
err = Init(nil, "")
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error when parse notifier config: %s", err)
|
||||
}
|
||||
|
||||
firingAlerts := []Alert{
|
||||
{
|
||||
Name: "alert1",
|
||||
Labels: map[string]string{"a": "baz"},
|
||||
},
|
||||
{
|
||||
Name: "alert2",
|
||||
Labels: map[string]string{},
|
||||
},
|
||||
}
|
||||
errG := Send(context.Background(), firingAlerts, nil)
|
||||
if errG.Err() != nil {
|
||||
t.Fatalf("unexpected error when sending alerts: %s", err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,13 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
)
|
||||
import "context"
|
||||
|
||||
// Notifier is a common interface for alert manager provider
|
||||
type Notifier interface {
|
||||
// Send sends the given list of alerts.
|
||||
// Returns an error if fails to send the alerts.
|
||||
// Must unblock if the given ctx is cancelled.
|
||||
Send(ctx context.Context, alerts []Alert, alertLabels [][]prompb.Label, notifierHeaders map[string]string) error
|
||||
Send(ctx context.Context, alerts []Alert, notifierHeaders map[string]string) error
|
||||
// Addr returns address where alerts are sent.
|
||||
Addr() string
|
||||
// LastError returns the error that occurred during the last attempt to send data
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
package notifier
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
)
|
||||
import "context"
|
||||
|
||||
// blackHoleNotifier is a Notifier stub, used when no notifications need
|
||||
// to be sent.
|
||||
@@ -14,7 +10,7 @@ type blackHoleNotifier struct {
|
||||
}
|
||||
|
||||
// Send will send no notifications, but increase the metric.
|
||||
func (bh *blackHoleNotifier) Send(_ context.Context, alerts []Alert, _ [][]prompb.Label, _ map[string]string) error { //nolint:revive
|
||||
func (bh *blackHoleNotifier) Send(_ context.Context, alerts []Alert, _ map[string]string) error { //nolint:revive
|
||||
bh.metrics.alertsSent.Add(len(alerts))
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
metricset "github.com/VictoriaMetrics/metrics"
|
||||
)
|
||||
|
||||
@@ -17,7 +16,7 @@ func TestBlackHoleNotifier_Send(t *testing.T) {
|
||||
Start: time.Now().UTC(),
|
||||
End: time.Now().UTC(),
|
||||
Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
|
||||
}}, [][]prompb.Label{{}}, nil); err != nil {
|
||||
}}, nil); err != nil {
|
||||
t.Fatalf("unexpected error %s", err)
|
||||
}
|
||||
|
||||
@@ -35,7 +34,7 @@ func TestBlackHoleNotifier_Close(t *testing.T) {
|
||||
Start: time.Now().UTC(),
|
||||
End: time.Now().UTC(),
|
||||
Annotations: map[string]string{"a": "b", "c": "d", "e": "f"},
|
||||
}}, [][]prompb.Label{{}}, nil); err != nil {
|
||||
}}, nil); err != nil {
|
||||
t.Fatalf("unexpected error %s", err)
|
||||
}
|
||||
|
||||
|
||||
19
app/vmalert/notifier/testdata/consul.bad.yaml
vendored
19
app/vmalert/notifier/testdata/consul.bad.yaml
vendored
@@ -1,19 +0,0 @@
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
scheme: http
|
||||
services:
|
||||
- alertmanager
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "prod"
|
||||
- server: localhost:8500
|
||||
services:
|
||||
- consul
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "(abc"
|
||||
alert_relabel_configs:
|
||||
- target_label: "foo"
|
||||
replacement: "aaa"
|
||||
13
app/vmalert/notifier/testdata/dns.bad.yaml
vendored
13
app/vmalert/notifier/testdata/dns.bad.yaml
vendored
@@ -1,13 +0,0 @@
|
||||
dns_sd_configs:
|
||||
- names:
|
||||
- cloudflare.com
|
||||
type: 'A'
|
||||
port: 9093
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_dns_name]
|
||||
replacement: '${1}'
|
||||
target_label: dns_name
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "(abc"
|
||||
15
app/vmalert/notifier/testdata/mixed.good.yaml
vendored
15
app/vmalert/notifier/testdata/mixed.good.yaml
vendored
@@ -2,19 +2,12 @@ static_configs:
|
||||
- targets:
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "static"
|
||||
|
||||
consul_sd_configs:
|
||||
- server: localhost:8500
|
||||
scheme: http
|
||||
services:
|
||||
- alertmanager
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "consul"
|
||||
- server: localhost:8500
|
||||
services:
|
||||
- consul
|
||||
@@ -24,10 +17,6 @@ dns_sd_configs:
|
||||
- cloudflare.com
|
||||
type: 'A'
|
||||
port: 9093
|
||||
alert_relabel_configs:
|
||||
- action: keep
|
||||
source_labels: [env]
|
||||
regex: "dns"
|
||||
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_consul_tags]
|
||||
@@ -36,4 +25,4 @@ relabel_configs:
|
||||
target_label: __scheme__
|
||||
- source_labels: [__meta_dns_name]
|
||||
replacement: '${1}'
|
||||
target_label: dns_name
|
||||
target_label: dns_name
|
||||
26
app/vmalert/notifier/testdata/static.good.yaml
vendored
26
app/vmalert/notifier/testdata/static.good.yaml
vendored
@@ -1,14 +1,22 @@
|
||||
headers:
|
||||
- 'CustomHeader: foo'
|
||||
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://192.168.0.101:9093
|
||||
alert_relabel_configs:
|
||||
- target_label: "foo"
|
||||
replacement: "aaa"
|
||||
- localhost:9093
|
||||
- localhost:9095
|
||||
- https://localhost:9093/test/api/v2/alerts
|
||||
basic_auth:
|
||||
username: foo
|
||||
password: bar
|
||||
|
||||
- targets:
|
||||
- http://192.168.0.101:9093
|
||||
alert_relabel_configs:
|
||||
- target_label: "foo"
|
||||
replacement: "ccc"
|
||||
|
||||
- localhost:9096
|
||||
- localhost:9097
|
||||
basic_auth:
|
||||
username: foo
|
||||
password: baz
|
||||
|
||||
alert_relabel_configs:
|
||||
- target_label: "foo"
|
||||
replacement: "aaa"
|
||||
|
||||
@@ -173,8 +173,9 @@ func (c *Client) run(ctx context.Context) {
|
||||
|
||||
cancel()
|
||||
}
|
||||
|
||||
c.wg.Go(func() {
|
||||
c.wg.Add(1)
|
||||
go func() {
|
||||
defer c.wg.Done()
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
@@ -196,7 +197,7 @@ func (c *Client) run(ctx context.Context) {
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}()
|
||||
}
|
||||
|
||||
var (
|
||||
|
||||
@@ -827,9 +827,12 @@ func TestGroup_Restore(t *testing.T) {
|
||||
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
|
||||
fg.Init()
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Go(func() {
|
||||
fg.Start(context.Background(), nil, fqr)
|
||||
})
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
|
||||
fg.Start(context.Background(), nts, nil, fqr)
|
||||
wg.Done()
|
||||
}()
|
||||
fg.Close()
|
||||
wg.Wait()
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/vmalertutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
)
|
||||
@@ -38,8 +39,6 @@ var (
|
||||
disableAlertGroupLabel = flag.Bool("disableAlertgroupLabel", false, "Whether to disable adding group's Name as label to generated alerts and time series.")
|
||||
remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries. "+
|
||||
"For example, if lookback=1h then range from now() to now()-1h will be scanned.")
|
||||
maxStartDelay = flag.Duration("group.maxStartDelay", 5*time.Minute, "Defines the max delay before starting the group evaluation. Group's start is artificially delayed for random duration on interval"+
|
||||
" [0..min(--group.maxStartDelay, group.interval)]. This helps smoothing out the load on the configured datasource, so evaluations aren't executed too close to each other.")
|
||||
)
|
||||
|
||||
// Group is an entity for grouping rules
|
||||
@@ -331,13 +330,13 @@ func (g *Group) Init() {
|
||||
}
|
||||
|
||||
// Start starts group's evaluation
|
||||
func (g *Group) Start(ctx context.Context, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
|
||||
func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, rr datasource.QuerierBuilder) {
|
||||
defer func() { close(g.finishedCh) }()
|
||||
evalTS := time.Now()
|
||||
// sleep random duration to spread group rules evaluation
|
||||
// over maxStartDelay to reduce the load on datasource.
|
||||
// over time to reduce the load on datasource.
|
||||
if !SkipRandSleepOnGroupStart {
|
||||
sleepBeforeStart := g.delayBeforeStart(evalTS, *maxStartDelay)
|
||||
sleepBeforeStart := delayBeforeStart(evalTS, g.GetID(), g.Interval, g.EvalOffset)
|
||||
g.infof("will start in %v", sleepBeforeStart)
|
||||
|
||||
sleepTimer := time.NewTimer(sleepBeforeStart)
|
||||
@@ -369,6 +368,7 @@ func (g *Group) Start(ctx context.Context, rw remotewrite.RWClient, rr datasourc
|
||||
|
||||
e := &executor{
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
}
|
||||
|
||||
@@ -475,31 +475,20 @@ func (g *Group) UpdateWith(newGroup *Group) {
|
||||
g.updateCh <- newGroup
|
||||
}
|
||||
|
||||
// delayBeforeStart returns duration for delaying the evaluation start
|
||||
// based on given ts and Group settings. The delay can't exceed maxDelay.
|
||||
// maxDelay is ignored if g.EvalOffset != nil.
|
||||
//
|
||||
// Delaying is important to smooth out the load on the datasource when all groups start at the same time.
|
||||
// delayBeforeStart calculates delay based on Group ID, so all groups will start at different moments of time.
|
||||
func (g *Group) delayBeforeStart(ts time.Time, maxDelay time.Duration) time.Duration {
|
||||
if g.EvalOffset != nil {
|
||||
// if offset is specified, ignore the maxDelay and return a duration aligned with offset
|
||||
currentOffsetPoint := ts.Truncate(g.Interval).Add(*g.EvalOffset)
|
||||
// if offset is specified, delayBeforeStart returns a duration to help aligning timestamp with offset;
|
||||
// otherwise, it returns a random duration between [0..interval] based on group key.
|
||||
func delayBeforeStart(ts time.Time, key uint64, interval time.Duration, offset *time.Duration) time.Duration {
|
||||
if offset != nil {
|
||||
currentOffsetPoint := ts.Truncate(interval).Add(*offset)
|
||||
if currentOffsetPoint.Before(ts) {
|
||||
// wait until the next offset point
|
||||
return currentOffsetPoint.Add(g.Interval).Sub(ts)
|
||||
return currentOffsetPoint.Add(interval).Sub(ts)
|
||||
}
|
||||
return currentOffsetPoint.Sub(ts)
|
||||
}
|
||||
|
||||
// otherwise, return a random duration between [0..min(interval, maxDelay)] based on group ID
|
||||
interval := g.Interval
|
||||
if interval > maxDelay {
|
||||
// artificially limit interval, so groups with big intervals could start sooner.
|
||||
interval = maxDelay
|
||||
}
|
||||
var randSleep time.Duration
|
||||
randSleep = time.Duration(float64(interval) * (float64(g.GetID()) / (1 << 64)))
|
||||
randSleep = time.Duration(float64(interval) * (float64(key) / (1 << 64)))
|
||||
sleepOffset := time.Duration(ts.UnixNano() % interval.Nanoseconds())
|
||||
if randSleep < sleepOffset {
|
||||
randSleep += interval
|
||||
@@ -561,13 +550,15 @@ func (g *Group) Replay(start, end time.Time, rw remotewrite.RWClient, maxDataPoi
|
||||
if !disableProgressBar {
|
||||
bar = pb.StartNew(iterations * len(g.Rules))
|
||||
}
|
||||
for i := range g.Rules {
|
||||
rule := g.Rules[i]
|
||||
for _, r := range g.Rules {
|
||||
sem <- struct{}{}
|
||||
wg.Go(func() {
|
||||
res <- replayRuleRange(rule, ri, bar, rw, replayRuleRetryAttempts, ruleEvaluationConcurrency)
|
||||
wg.Add(1)
|
||||
go func(r Rule, ri rangeIterator) {
|
||||
// pass ri as a copy, so it can be modified within the replayRuleRange
|
||||
res <- replayRuleRange(r, ri, bar, rw, replayRuleRetryAttempts, ruleEvaluationConcurrency)
|
||||
<-sem
|
||||
})
|
||||
wg.Done()
|
||||
}(r, ri)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
@@ -597,10 +588,10 @@ func replayRuleRange(r Rule, ri rangeIterator, bar *pb.ProgressBar, rw remotewri
|
||||
res := make(chan int, int(ri.end.Sub(ri.start)/ri.step)+1)
|
||||
for ri.next() {
|
||||
sem <- struct{}{}
|
||||
start := ri.s
|
||||
end := ri.e
|
||||
wg.Go(func() {
|
||||
n, err := replayRule(r, start, end, rw, replayRuleRetryAttempts)
|
||||
wg.Add(1)
|
||||
|
||||
go func(s, e time.Time) {
|
||||
n, err := replayRule(r, s, e, rw, replayRuleRetryAttempts)
|
||||
if err != nil {
|
||||
logger.Fatalf("rule %q: %s", r, err)
|
||||
}
|
||||
@@ -609,7 +600,8 @@ func replayRuleRange(r Rule, ri rangeIterator, bar *pb.ProgressBar, rw remotewri
|
||||
}
|
||||
res <- n
|
||||
<-sem
|
||||
})
|
||||
wg.Done()
|
||||
}(ri.s, ri.e)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
@@ -623,9 +615,10 @@ func replayRuleRange(r Rule, ri rangeIterator, bar *pb.ProgressBar, rw remotewri
|
||||
}
|
||||
|
||||
// ExecOnce evaluates all the rules under group for once with given timestamp.
|
||||
func (g *Group) ExecOnce(ctx context.Context, rw remotewrite.RWClient, evalTS time.Time) chan error {
|
||||
func (g *Group) ExecOnce(ctx context.Context, nts func() []notifier.Notifier, rw remotewrite.RWClient, evalTS time.Time) chan error {
|
||||
e := &executor{
|
||||
Rw: rw,
|
||||
Notifiers: nts,
|
||||
notifierHeaders: g.NotifierHeaders,
|
||||
}
|
||||
if len(g.Rules) < 1 {
|
||||
@@ -700,6 +693,7 @@ func (g *Group) getEvalDelay() time.Duration {
|
||||
|
||||
// executor contains group's notify and rw configs
|
||||
type executor struct {
|
||||
Notifiers func() []notifier.Notifier
|
||||
notifierHeaders map[string]string
|
||||
|
||||
Rw remotewrite.RWClient
|
||||
@@ -720,13 +714,14 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, ts time.T
|
||||
sem := make(chan struct{}, concurrency)
|
||||
go func() {
|
||||
wg := sync.WaitGroup{}
|
||||
for i := range rules {
|
||||
rule := rules[i]
|
||||
for _, r := range rules {
|
||||
sem <- struct{}{}
|
||||
wg.Go(func() {
|
||||
res <- e.exec(ctx, rule, ts, resolveDuration, limit)
|
||||
wg.Add(1)
|
||||
go func(r Rule) {
|
||||
res <- e.exec(ctx, r, ts, resolveDuration, limit)
|
||||
<-sem
|
||||
})
|
||||
wg.Done()
|
||||
}(r)
|
||||
}
|
||||
wg.Wait()
|
||||
close(res)
|
||||
@@ -780,6 +775,17 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDurati
|
||||
return nil
|
||||
}
|
||||
|
||||
errGr := notifier.Send(ctx, alerts, e.notifierHeaders)
|
||||
wg := sync.WaitGroup{}
|
||||
errGr := new(vmalertutil.ErrGroup)
|
||||
for _, nt := range e.Notifiers() {
|
||||
wg.Add(1)
|
||||
go func(nt notifier.Notifier) {
|
||||
if err := nt.Send(ctx, alerts, e.notifierHeaders); err != nil {
|
||||
errGr.Add(fmt.Errorf("rule %q: failed to send alerts to addr %q: %w", r, nt.Addr(), err))
|
||||
}
|
||||
wg.Done()
|
||||
}(nt)
|
||||
}
|
||||
wg.Wait()
|
||||
return errGr.Err()
|
||||
}
|
||||
|
||||
@@ -262,7 +262,7 @@ func TestUpdateDuringRandSleep(t *testing.T) {
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
g.Init()
|
||||
go g.Start(context.Background(), nil, nil)
|
||||
go g.Start(context.Background(), nil, nil, nil)
|
||||
|
||||
rule1 := AlertingRule{
|
||||
Name: "jobDown",
|
||||
@@ -346,8 +346,7 @@ func TestGroupStart(t *testing.T) {
|
||||
}
|
||||
|
||||
fs := &datasource.FakeQuerier{}
|
||||
fn, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
fn := &notifier.FakeNotifier{}
|
||||
|
||||
const evalInterval = time.Millisecond
|
||||
g := NewGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"})
|
||||
@@ -396,7 +395,7 @@ func TestGroupStart(t *testing.T) {
|
||||
fs.Add(m2)
|
||||
g.Init()
|
||||
go func() {
|
||||
g.Start(context.Background(), nil, fs)
|
||||
g.Start(context.Background(), func() []notifier.Notifier { return []notifier.Notifier{fn} }, nil, fs)
|
||||
close(finished)
|
||||
}()
|
||||
|
||||
@@ -473,10 +472,15 @@ func TestFaultyNotifier(t *testing.T) {
|
||||
r := newTestAlertingRule("instant", 0)
|
||||
r.q = fq
|
||||
|
||||
fn, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
|
||||
e := &executor{}
|
||||
fn := &notifier.FakeNotifier{}
|
||||
e := &executor{
|
||||
Notifiers: func() []notifier.Notifier {
|
||||
return []notifier.Notifier{
|
||||
&notifier.FaultyNotifier{},
|
||||
fn,
|
||||
}
|
||||
},
|
||||
}
|
||||
delay := 5 * time.Second
|
||||
ctx, cancel := context.WithTimeout(context.Background(), delay)
|
||||
defer cancel()
|
||||
@@ -549,7 +553,7 @@ func TestCloseWithEvalInterruption(t *testing.T) {
|
||||
g := NewGroup(groups[0], fq, evalInterval, nil)
|
||||
g.Init()
|
||||
|
||||
go g.Start(context.Background(), nil, nil)
|
||||
go g.Start(context.Background(), nil, nil, nil)
|
||||
|
||||
time.Sleep(evalInterval * 20)
|
||||
|
||||
@@ -567,10 +571,9 @@ func TestCloseWithEvalInterruption(t *testing.T) {
|
||||
|
||||
func TestGroupStartDelay(t *testing.T) {
|
||||
g := &Group{}
|
||||
g.id = uint64(math.MaxUint64 / 10)
|
||||
// interval of 5min and key generate a static delay of 30s
|
||||
g.Interval = time.Minute * 5
|
||||
maxDelay := time.Minute * 5
|
||||
key := uint64(math.MaxUint64 / 10)
|
||||
|
||||
f := func(atS, expS string) {
|
||||
t.Helper()
|
||||
@@ -582,7 +585,7 @@ func TestGroupStartDelay(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
delay := g.delayBeforeStart(at, maxDelay)
|
||||
delay := delayBeforeStart(at, key, g.Interval, g.EvalOffset)
|
||||
gotStart := at.Add(delay)
|
||||
if expTS != gotStart {
|
||||
t.Fatalf("expected to get %v; got %v instead", expTS, gotStart)
|
||||
@@ -603,15 +606,6 @@ func TestGroupStartDelay(t *testing.T) {
|
||||
f("2023-01-01T00:01:00.000+00:00", "2023-01-01T00:03:00.000+00:00")
|
||||
f("2023-01-01T00:03:30.000+00:00", "2023-01-01T00:08:00.000+00:00")
|
||||
f("2023-01-01T00:08:00.000+00:00", "2023-01-01T00:08:00.000+00:00")
|
||||
|
||||
maxDelay = time.Minute * 1
|
||||
g.EvalOffset = nil
|
||||
|
||||
// test group with maxDelay, and offset disabled
|
||||
f("2023-01-01T00:00:00.000+00:00", "2023-01-01T00:00:06.000+00:00")
|
||||
f("2023-01-01T00:00:01.000+00:00", "2023-01-01T00:00:06.000+00:00")
|
||||
f("2023-01-01T00:00:06.100+00:00", "2023-01-01T00:01:06.000+00:00")
|
||||
f("2023-01-01T00:00:11.000+00:00", "2023-01-01T00:01:06.000+00:00")
|
||||
}
|
||||
|
||||
func TestGetPrometheusReqTimestamp(t *testing.T) {
|
||||
|
||||
@@ -34,12 +34,11 @@ body {
|
||||
padding-top: 4.5rem;
|
||||
}
|
||||
|
||||
.vm-group {
|
||||
.group-items {
|
||||
cursor: pointer;
|
||||
padding: 5px;
|
||||
margin-top: 5px;
|
||||
position: relative;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.btn svg, .dropdown-item svg {
|
||||
@@ -56,22 +55,14 @@ body {
|
||||
height: 38px;
|
||||
}
|
||||
|
||||
.vm-item:not(.vm-found) {
|
||||
display: none;
|
||||
.group-items:not(:has(.sub-item:not(.d-none))) {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
.vm-group:has(.vm-item:is(.vm-found)), .vm-group:is(.vm-found) {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.vm-group:hover {
|
||||
.group-items:hover {
|
||||
background-color: #f8f9fa!important;
|
||||
}
|
||||
|
||||
.vm-group:is(.vm-found) .vm-item {
|
||||
display: table-row;
|
||||
}
|
||||
|
||||
.table {
|
||||
table-layout: fixed;
|
||||
}
|
||||
@@ -120,9 +111,3 @@ textarea.curl-area {
|
||||
.w-60 {
|
||||
width: 60%;
|
||||
}
|
||||
|
||||
.annotations {
|
||||
white-space: pre-wrap;
|
||||
color: gray;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
|
||||
@@ -65,34 +65,32 @@ function getParamURL(key) {
|
||||
return url.searchParams.get(key)
|
||||
}
|
||||
|
||||
function matchText(search, item) {
|
||||
const text = item.innerText.toLowerCase();
|
||||
return text.indexOf(search) >= 0;
|
||||
}
|
||||
|
||||
function filterRules(searchPhrase) {
|
||||
document.querySelectorAll('.vm-group').forEach((group) => {
|
||||
if (!searchPhrase) {
|
||||
group.classList.add('vm-found');
|
||||
return;
|
||||
}
|
||||
for (const item of group.querySelectorAll('.vm-group-search')) {
|
||||
if (matchText(searchPhrase, item)) {
|
||||
group.classList.add('vm-found');
|
||||
return;
|
||||
document.querySelectorAll('.sub-items').forEach((rules) => {
|
||||
let found = false;
|
||||
rules.querySelectorAll('.sub-item').forEach((rule) => {
|
||||
if (searchPhrase) {
|
||||
const ruleName = rule.innerText.toLowerCase();
|
||||
const matches = []
|
||||
const hasValue = ruleName.indexOf(searchPhrase) >= 0;
|
||||
rule.querySelectorAll('.label').forEach((label) => {
|
||||
const text = label.innerText.toLowerCase();
|
||||
if (text.indexOf(searchPhrase) >= 0) {
|
||||
matches.push(text);
|
||||
}
|
||||
});
|
||||
if (!matches.length && !hasValue) {
|
||||
rule.classList.add('d-none');
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
group.classList.remove('vm-found');
|
||||
for (const item of group.querySelectorAll('.vm-item')) {
|
||||
if (matchText(searchPhrase, item)) {
|
||||
item.classList.add('vm-found');
|
||||
continue;
|
||||
}
|
||||
if (Array.from(item.querySelectorAll('.label')).find(l => matchText(searchPhrase, l))) {
|
||||
item.classList.add('vm-found');
|
||||
continue;
|
||||
}
|
||||
item.classList.remove('vm-found');
|
||||
rule.classList.remove('d-none');
|
||||
found = true;
|
||||
});
|
||||
if (found && searchPhrase || !searchPhrase) {
|
||||
rules.classList.remove('d-none');
|
||||
} else {
|
||||
rules.classList.add('d-none');
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@@ -485,12 +485,6 @@ func templateFuncs() textTpl.FuncMap {
|
||||
|
||||
/* Helpers */
|
||||
|
||||
// now returns the Unix timestamp in seconds at the time of the template evaluation.
|
||||
// For example: {{ (now | toTime).Sub $activeAt }} will return the duration the alert has been active.
|
||||
"now": func() float64 {
|
||||
return float64(time.Now().Unix())
|
||||
},
|
||||
|
||||
// Converts a list of objects to a map with keys arg0, arg1 etc.
|
||||
// This is intended to allow multiple arguments to be passed to templates.
|
||||
"args": func(args ...any) map[string]any {
|
||||
|
||||
@@ -114,17 +114,14 @@
|
||||
{%= Controls(prefix, currentIcon, currentText, icons, filters, true) %}
|
||||
{% if len(groups) > 0 %}
|
||||
{% for _, g := range groups %}
|
||||
<div id="group-{%s g.ID %}" class="w-100 border-0 flex-column vm-group{% if g.Unhealthy > 0 %} alert-danger{% endif %}">
|
||||
<div id="group-{%s g.ID %}" class="d-flex w-100 border-0 flex-column group-items{% if g.Unhealthy > 0 %} alert-danger{% endif %}">
|
||||
<span class="d-flex justify-content-between">
|
||||
<a
|
||||
class="vm-group-search"
|
||||
href="#group-{%s g.ID %}"
|
||||
>{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%f.0 g.Interval %}s) #</a>
|
||||
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %} (every {%f.0 g.Interval %}s) #</a>
|
||||
<span
|
||||
class="flex-grow-1 d-flex justify-content-end"
|
||||
role="button"
|
||||
data-bs-toggle="collapse"
|
||||
data-bs-target="#item-{%s g.ID %}"
|
||||
data-bs-target="#sub-{%s g.ID %}"
|
||||
>
|
||||
<span class="d-flex gap-2">
|
||||
{% if g.Unhealthy > 0 %}<span class="badge bg-danger" title="Number of rules with status Error">{%d g.Unhealthy %}</span> {% endif %}
|
||||
@@ -137,9 +134,9 @@
|
||||
class="d-flex flex-column row-gap-2 mb-2"
|
||||
role="button"
|
||||
data-bs-toggle="collapse"
|
||||
data-bs-target="#item-{%s g.ID %}"
|
||||
data-bs-target="#sub-{%s g.ID %}"
|
||||
>
|
||||
<span class="fs-6 text-start vm-group-search w-100 fw-lighter">{%s g.File %}</span>
|
||||
<span class="fs-6 text-start w-100 fw-lighter">{%s g.File %}</span>
|
||||
{% if len(g.Params) > 0 %}
|
||||
<span class="fs-6 text-start w-100 d-flex justify-content-between fw-lighter">
|
||||
<span>Extra params</span>
|
||||
@@ -161,7 +158,7 @@
|
||||
</span>
|
||||
{% endif %}
|
||||
</span>
|
||||
<div class="collapse" id="item-{%s g.ID %}">
|
||||
<div class="collapse sub-items" id="sub-{%s g.ID %}">
|
||||
<table class="table table-striped table-hover table-sm">
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -172,7 +169,7 @@
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for _, r := range g.Rules %}
|
||||
<tr class="vm-item{% if r.LastError != "" %} alert-danger{% endif %}">
|
||||
<tr class="sub-item{% if r.LastError != "" %} alert-danger{% endif %}">
|
||||
<td>
|
||||
<div class="row">
|
||||
<div class="col-12 mb-2">
|
||||
@@ -209,12 +206,7 @@
|
||||
</div>
|
||||
</td>
|
||||
<td class="text-center">{%d r.LastSamples %}</td>
|
||||
<td class="text-center">{% if r.LastEvaluation.IsZero() %}
|
||||
Never
|
||||
{% else %}
|
||||
{%f.3 time.Since(r.LastEvaluation).Seconds() %}s ago
|
||||
{% endif %}
|
||||
</td>
|
||||
<td class="text-center">{%f.3 time.Since(r.LastEvaluation).Seconds() %}s ago</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
@@ -249,14 +241,14 @@
|
||||
}
|
||||
sort.Strings(keys)
|
||||
%}
|
||||
<div class="w-100 flex-column vm-group alert-danger">
|
||||
<div class="d-flex w-100 flex-column group-items alert-danger">
|
||||
<span id="group-{%s g.ID %}" class="d-flex justify-content-between">
|
||||
<a href="#group-{%s g.ID %}">{%s g.Name %}{% if g.Type != "prometheus" %} ({%s g.Type %}){% endif %}</a>
|
||||
<span
|
||||
class="flex-grow-1 d-flex justify-content-end"
|
||||
role="button"
|
||||
data-bs-toggle="collapse"
|
||||
data-bs-target="#item-{%s g.ID %}"
|
||||
data-bs-target="#sub-{%s g.ID %}"
|
||||
>
|
||||
<span class="badge bg-danger" title="Number of active alerts">{%d len(ga.Alerts) %}</span>
|
||||
</span>
|
||||
@@ -266,10 +258,10 @@
|
||||
class="fs-6 text-start w-100 fw-lighter"
|
||||
role="button"
|
||||
data-bs-toggle="collapse"
|
||||
data-bs-target="#item-{%s g.ID %}"
|
||||
data-bs-target="#sub-{%s g.ID %}"
|
||||
>{%s g.File %}</span>
|
||||
</span>
|
||||
<div class="collapse" id="item-{%s g.ID %}">
|
||||
<div class="collapse sub-items" id="sub-{%s g.ID %}">
|
||||
{% for _, ruleID := range keys %}
|
||||
{%code
|
||||
defaultAR := alertsByRule[ruleID][0]
|
||||
@@ -280,7 +272,7 @@
|
||||
sort.Strings(labelKeys)
|
||||
%}
|
||||
<br>
|
||||
<div class="vm-item">
|
||||
<div class="sub-item">
|
||||
<b>alert:</b> {%s defaultAR.Name %} ({%d len(alertsByRule[ruleID]) %})
|
||||
| <span><a target="_blank" href="{%s defaultAR.SourceLink %}">Source</a></span>
|
||||
<br>
|
||||
@@ -345,20 +337,20 @@
|
||||
typeK, ns := keys[i], targets[notifier.TargetType(keys[i])]
|
||||
count := len(ns)
|
||||
%}
|
||||
<div class="w-100 flex-column vm-group">
|
||||
<div class="d-flex w-100 flex-column group-items">
|
||||
<span class="d-flex justify-content-between" id="group-{%s typeK %}">
|
||||
<a href="#group-{%s typeK %}">{%s typeK %} ({%d count %})</a>
|
||||
<span
|
||||
class="flex-grow-1"
|
||||
role="button"
|
||||
data-bs-toggle="collapse"
|
||||
data-bs-target="#item-{%s typeK %}"
|
||||
data-bs-target="#sub-{%s typeK %}"
|
||||
></span>
|
||||
</span>
|
||||
<div id="item-{%s typeK %}" class="collapse show">
|
||||
<div id="sub-{%s typeK %}" class="collapse show sub-items">
|
||||
<table class="table table-striped table-hover table-sm">
|
||||
<thead>
|
||||
<tr class="vm-item">
|
||||
<tr class="sub-item">
|
||||
<th scope="col">Labels</th>
|
||||
<th scope="col">Address</th>
|
||||
</tr>
|
||||
@@ -443,7 +435,7 @@
|
||||
<div class="col">
|
||||
{% for _, k := range annotationKeys %}
|
||||
<b>{%s k %}:</b><br>
|
||||
<p class="annotations">{%s alert.Annotations[k] %}</p>
|
||||
<p>{%s alert.Annotations[k] %}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -557,7 +549,7 @@
|
||||
<div class="col">
|
||||
{% for _, k := range annotationKeys %}
|
||||
<b>{%s k %}:</b><br>
|
||||
<p class="annotations">{%s rule.Annotations[k] %}</p>
|
||||
<p>{%s rule.Annotations[k] %}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,9 +23,6 @@ func TestHandler(t *testing.T) {
|
||||
Timestamps: []int64{0},
|
||||
})
|
||||
m := &manager{groups: map[uint64]*rule.Group{}}
|
||||
_, cleanup := notifier.InitFakeNotifier()
|
||||
defer cleanup()
|
||||
|
||||
var ar *rule.AlertingRule
|
||||
var rr *rule.RecordingRule
|
||||
var groupIDs []uint64
|
||||
@@ -48,7 +45,7 @@ func TestHandler(t *testing.T) {
|
||||
}, fq, 1*time.Minute, nil)
|
||||
ar = g.Rules[0].(*rule.AlertingRule)
|
||||
rr = g.Rules[1].(*rule.RecordingRule)
|
||||
g.ExecOnce(context.Background(), nil, time.Time{})
|
||||
g.ExecOnce(context.Background(), func() []notifier.Notifier { return nil }, nil, time.Time{})
|
||||
id := g.CreateID()
|
||||
m.groups[id] = g
|
||||
groupIDs = append(groupIDs, id)
|
||||
|
||||
@@ -27,9 +27,6 @@ vmauth-linux-ppc64le-prod:
|
||||
vmauth-linux-386-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmauth-linux-s390x-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmauth-darwin-amd64-prod:
|
||||
APP_NAME=vmauth $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -482,34 +482,27 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) *
|
||||
if bu.isBroken() {
|
||||
continue
|
||||
}
|
||||
|
||||
// The Load() in front of CompareAndSwap() avoids CAS overhead for items with values bigger than 0.
|
||||
if bu.concurrentRequests.Load() == 0 && bu.concurrentRequests.CompareAndSwap(0, 1) {
|
||||
atomicCounter.CompareAndSwap(n+1, idx+1)
|
||||
// There is no need to call bu.get(), because bu.concurrentRequests has already been incremented above.
|
||||
if bu.concurrentRequests.Load() == 0 {
|
||||
// Fast path - return the backend with zero concurrently executed requests.
|
||||
// Do not use CompareAndSwap() instead of Load(), since it is much slower on systems with many CPU cores.
|
||||
bu.concurrentRequests.Add(1)
|
||||
return bu
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path - return the backend with the minimum number of concurrently executed requests.
|
||||
buMinIdx := n % uint32(len(bus))
|
||||
minRequests := bus[buMinIdx].concurrentRequests.Load()
|
||||
for i := uint32(0); i < uint32(len(bus)); i++ {
|
||||
idx := (n + i) % uint32(len(bus))
|
||||
bu := bus[idx]
|
||||
buMin := bus[n%uint32(len(bus))]
|
||||
minRequests := buMin.concurrentRequests.Load()
|
||||
for _, bu := range bus {
|
||||
if bu.isBroken() {
|
||||
continue
|
||||
}
|
||||
|
||||
reqs := bu.concurrentRequests.Load()
|
||||
if reqs < minRequests || bus[buMinIdx].isBroken() {
|
||||
buMinIdx = idx
|
||||
minRequests = reqs
|
||||
if n := bu.concurrentRequests.Load(); n < minRequests || buMin.isBroken() {
|
||||
buMin = bu
|
||||
minRequests = n
|
||||
}
|
||||
}
|
||||
buMin := bus[buMinIdx]
|
||||
buMin.get()
|
||||
atomicCounter.CompareAndSwap(n+1, buMinIdx+1)
|
||||
return buMin
|
||||
}
|
||||
|
||||
|
||||
@@ -752,12 +752,10 @@ func TestGetLeastLoadedBackendURL(t *testing.T) {
|
||||
})
|
||||
up.loadBalancingPolicy = "least_loaded"
|
||||
|
||||
pbus := up.bus.Load()
|
||||
bus := *pbus
|
||||
|
||||
fn := func(ns ...int) {
|
||||
t.Helper()
|
||||
|
||||
pbus := up.bus.Load()
|
||||
bus := *pbus
|
||||
for i, b := range bus {
|
||||
got := int(b.concurrentRequests.Load())
|
||||
exp := ns[i]
|
||||
@@ -769,52 +767,45 @@ func TestGetLeastLoadedBackendURL(t *testing.T) {
|
||||
|
||||
up.getBackendURL()
|
||||
fn(1, 0, 0)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(1, 1, 0)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(1, 1, 1)
|
||||
|
||||
bus[1].put()
|
||||
bus[2].put()
|
||||
fn(1, 0, 0)
|
||||
up.getBackendURL()
|
||||
up.getBackendURL()
|
||||
fn(2, 2, 1)
|
||||
|
||||
bus := up.bus.Load()
|
||||
pbus := *bus
|
||||
pbus[0].concurrentRequests.Add(2)
|
||||
pbus[2].concurrentRequests.Add(5)
|
||||
fn(4, 2, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(1, 1, 0)
|
||||
fn(4, 3, 6)
|
||||
|
||||
bus[1].put()
|
||||
up.getBackendURL()
|
||||
fn(1, 0, 1)
|
||||
fn(4, 4, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(4, 5, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(5, 5, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(6, 5, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(6, 6, 6)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(6, 6, 7)
|
||||
|
||||
up.getBackendURL()
|
||||
up.getBackendURL()
|
||||
fn(1, 1, 2)
|
||||
|
||||
bus[0].concurrentRequests.Add(2)
|
||||
bus[2].concurrentRequests.Add(2)
|
||||
fn(3, 1, 4)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(3, 2, 4)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(3, 3, 4)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(4, 3, 4)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(4, 4, 4)
|
||||
|
||||
bus[0].put()
|
||||
bus[2].put()
|
||||
|
||||
up.getBackendURL()
|
||||
fn(3, 4, 4)
|
||||
|
||||
up.getBackendURL()
|
||||
fn(4, 4, 4)
|
||||
fn(7, 7, 7)
|
||||
}
|
||||
|
||||
func TestBrokenBackend(t *testing.T) {
|
||||
|
||||
@@ -310,21 +310,14 @@ func tryProcessingRequest(w http.ResponseWriter, r *http.Request, targetURL *url
|
||||
|
||||
rtb, rtbOK := req.Body.(*readTrackingBody)
|
||||
res, err := ui.rt.RoundTrip(req)
|
||||
|
||||
if ctxErr := r.Context().Err(); ctxErr != nil {
|
||||
// Override the error returned by RoundTrip with the context error if the context error is non-nil.
|
||||
// This ensures proper logging for canceled and timed out requests - log the real cause of the error
|
||||
// instead of the random error, which could be returned from RoundTrip because of canceled or timed out request.
|
||||
err = ctxErr
|
||||
}
|
||||
if err != nil {
|
||||
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
||||
// Do not retry canceled or timed out requests
|
||||
remoteAddr := httpserver.GetQuotedRemoteAddr(r)
|
||||
requestURI := httpserver.GetRequestURI(r)
|
||||
logger.Warnf("remoteAddr: %s; requestURI: %s; error when proxying response body from %s: %s", remoteAddr, requestURI, targetURL, err)
|
||||
if errors.Is(err, context.DeadlineExceeded) {
|
||||
// Timed out requests must be counted as errors, since this usually means that the backend is slow.
|
||||
logger.Warnf("remoteAddr: %s; requestURI: %s; timeout while proxying the response from %s: %s", remoteAddr, requestURI, targetURL, err)
|
||||
ui.backendErrors.Inc()
|
||||
}
|
||||
return false, false
|
||||
|
||||
@@ -31,9 +31,6 @@ vmbackup-linux-ppc64le-prod:
|
||||
vmbackup-linux-386-prod:
|
||||
APP_NAME=vmbackup EXTRA_GO_BUILD_TAGS=$(VMBACKUP_GO_BUILD_TAGS) $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmbackup-linux-s390x-prod:
|
||||
APP_NAME=vmbackup EXTRA_GO_BUILD_TAGS=$(VMBACKUP_GO_BUILD_TAGS) $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmbackup-darwin-amd64-prod:
|
||||
APP_NAME=vmbackup EXTRA_GO_BUILD_TAGS=$(VMBACKUP_GO_BUILD_TAGS) $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -27,9 +27,6 @@ vmctl-linux-ppc64le-prod:
|
||||
vmctl-linux-386-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmctl-linux-s390x-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmctl-darwin-amd64-prod:
|
||||
APP_NAME=vmctl $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
@@ -689,15 +689,15 @@ var (
|
||||
Usage: "The time filter in RFC3339 format to select timeseries with timestamp equal or lower than provided value. E.g. '2020-01-01T20:07:00Z'",
|
||||
Layout: time.RFC3339,
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: remoteReadFilterLabel,
|
||||
Usage: "Prometheus label name to filter timeseries by. E.g. '__name__' will filter timeseries by name.",
|
||||
DefaultText: "__name__",
|
||||
&cli.StringFlag{
|
||||
Name: remoteReadFilterLabel,
|
||||
Usage: "Prometheus label name to filter timeseries by. E.g. '__name__' will filter timeseries by name.",
|
||||
Value: "__name__",
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: remoteReadFilterLabelValue,
|
||||
Usage: fmt.Sprintf("Prometheus regular expression to filter label from %q flag.", remoteReadFilterLabelValue),
|
||||
DefaultText: ".*",
|
||||
&cli.StringFlag{
|
||||
Name: remoteReadFilterLabelValue,
|
||||
Usage: fmt.Sprintf("Prometheus regular expression to filter label from %q flag.", remoteReadFilterLabelValue),
|
||||
Value: ".*",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: remoteRead,
|
||||
|
||||
@@ -192,14 +192,6 @@ func main() {
|
||||
return fmt.Errorf("failed to create transport for -%s=%q: %s", remoteReadSrcAddr, addr, err)
|
||||
}
|
||||
|
||||
// Backwards compatible default values if none provided by user
|
||||
rrLabelNames := c.StringSlice(remoteReadFilterLabel)
|
||||
rrLabelValues := c.StringSlice(remoteReadFilterLabelValue)
|
||||
if len(rrLabelNames) == 0 && len(rrLabelValues) == 0 {
|
||||
rrLabelNames = []string{"__name__"}
|
||||
rrLabelValues = []string{".*"}
|
||||
}
|
||||
|
||||
rr, err := remoteread.NewClient(remoteread.Config{
|
||||
Addr: addr,
|
||||
Transport: tr,
|
||||
@@ -208,8 +200,8 @@ func main() {
|
||||
Timeout: c.Duration(remoteReadHTTPTimeout),
|
||||
UseStream: c.Bool(remoteReadUseStream),
|
||||
Headers: c.String(remoteReadHeaders),
|
||||
LabelNames: rrLabelNames,
|
||||
LabelValues: rrLabelValues,
|
||||
LabelName: c.String(remoteReadFilterLabel),
|
||||
LabelValue: c.String(remoteReadFilterLabelValue),
|
||||
DisablePathAppend: c.Bool(remoteReadDisablePathAppend),
|
||||
})
|
||||
if err != nil {
|
||||
|
||||
@@ -11,15 +11,14 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
"github.com/gogo/protobuf/proto"
|
||||
"github.com/golang/snappy"
|
||||
"github.com/prometheus/prometheus/config"
|
||||
"github.com/prometheus/prometheus/prompb"
|
||||
"github.com/prometheus/prometheus/storage/remote"
|
||||
"github.com/prometheus/prometheus/tsdb/chunkenc"
|
||||
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmctl/vm"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -64,9 +63,9 @@ type Config struct {
|
||||
UseStream bool
|
||||
// Headers optional HTTP headers to send with each request to the corresponding remote storage
|
||||
Headers string
|
||||
// LabelNames, LabelValues stand for label=~value pairs used for read requests.
|
||||
// LabelName, LabelValue stand for the label=~value pair used for read requests.
|
||||
// Is optional.
|
||||
LabelNames, LabelValues []string
|
||||
LabelName, LabelValue string
|
||||
}
|
||||
|
||||
// Filter defines a list of filters applied to requested data
|
||||
@@ -95,22 +94,12 @@ func NewClient(cfg Config) (*Client, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var matchers []*prompb.LabelMatcher
|
||||
if len(cfg.LabelNames) > 0 || len(cfg.LabelValues) > 0 {
|
||||
if len(cfg.LabelNames) != len(cfg.LabelValues) {
|
||||
return nil, fmt.Errorf("the number of label names and label values must be the same")
|
||||
}
|
||||
|
||||
for i := range cfg.LabelNames {
|
||||
if cfg.LabelNames[i] == "" {
|
||||
return nil, fmt.Errorf("label name cannot be empty")
|
||||
}
|
||||
matcher := &prompb.LabelMatcher{
|
||||
Type: prompb.LabelMatcher_RE,
|
||||
Name: cfg.LabelNames[i],
|
||||
Value: cfg.LabelValues[i],
|
||||
}
|
||||
matchers = append(matchers, matcher)
|
||||
var m *prompb.LabelMatcher
|
||||
if cfg.LabelName != "" && cfg.LabelValue != "" {
|
||||
m = &prompb.LabelMatcher{
|
||||
Type: prompb.LabelMatcher_RE,
|
||||
Name: cfg.LabelName,
|
||||
Value: cfg.LabelValue,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,7 +116,7 @@ func NewClient(cfg Config) (*Client, error) {
|
||||
password: cfg.Password,
|
||||
useStream: cfg.UseStream,
|
||||
headers: headers,
|
||||
matchers: matchers,
|
||||
matchers: []*prompb.LabelMatcher{m},
|
||||
}
|
||||
|
||||
return c, nil
|
||||
|
||||
@@ -221,7 +221,7 @@ func (ctx *InsertCtx) FlushBufs() error {
|
||||
}
|
||||
}
|
||||
|
||||
func (ctx *InsertCtx) dropAggregatedRows(matchIdxs []uint32) {
|
||||
func (ctx *InsertCtx) dropAggregatedRows(matchIdxs []byte) {
|
||||
dst := ctx.mrs[:0]
|
||||
src := ctx.mrs
|
||||
if !*streamAggrDropInput {
|
||||
@@ -239,4 +239,4 @@ func (ctx *InsertCtx) dropAggregatedRows(matchIdxs []uint32) {
|
||||
ctx.mrs = dst
|
||||
}
|
||||
|
||||
var matchIdxsPool slicesutil.BufferPool[uint32]
|
||||
var matchIdxsPool bytesutil.ByteBufferPool
|
||||
|
||||
@@ -13,7 +13,6 @@ import (
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompb"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/slicesutil"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/storage"
|
||||
"github.com/VictoriaMetrics/VictoriaMetrics/lib/streamaggr"
|
||||
"github.com/VictoriaMetrics/metrics"
|
||||
@@ -23,11 +22,11 @@ var (
|
||||
streamAggrConfig = flag.String("streamAggr.config", "", "Optional path to file with stream aggregation config. "+
|
||||
"See https://docs.victoriametrics.com/victoriametrics/stream-aggregation/ . "+
|
||||
"See also -streamAggr.keepInput, -streamAggr.dropInput and -streamAggr.dedupInterval")
|
||||
streamAggrKeepInput = flag.Bool("streamAggr.keepInput", false, "Whether to keep input samples that match any rule in -streamAggr.config. "+
|
||||
"By default, matched raw samples are aggregated and dropped, while unmatched samples are written to the remote storage. "+
|
||||
streamAggrKeepInput = flag.Bool("streamAggr.keepInput", false, "Whether to keep all the input samples after the aggregation with -streamAggr.config. "+
|
||||
"By default, only aggregated samples are dropped, while the remaining samples are stored in the database. "+
|
||||
"See also -streamAggr.dropInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrDropInput = flag.Bool("streamAggr.dropInput", false, "Whether to drop input samples that not matching any rule in -streamAggr.config. "+
|
||||
"By default, only matched raw samples are dropped, while unmatched samples are written to the remote storage."+
|
||||
streamAggrDropInput = flag.Bool("streamAggr.dropInput", false, "Whether to drop all the input samples after the aggregation with -streamAggr.config. "+
|
||||
"By default, only aggregated samples are dropped, while the remaining samples are stored in the database. "+
|
||||
"See also -streamAggr.keepInput and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/")
|
||||
streamAggrDedupInterval = flag.Duration("streamAggr.dedupInterval", 0, "Input samples are de-duplicated with this interval before optional aggregation with -streamAggr.config . "+
|
||||
"See also -streamAggr.dropInputLabels and -dedup.minScrapeInterval and https://docs.victoriametrics.com/victoriametrics/stream-aggregation/#deduplication")
|
||||
@@ -190,7 +189,7 @@ func (ctx *streamAggrCtx) Reset() {
|
||||
ctx.buf = ctx.buf[:0]
|
||||
}
|
||||
|
||||
func (ctx *streamAggrCtx) push(mrs []storage.MetricRow, matchIdxs []uint32) []uint32 {
|
||||
func (ctx *streamAggrCtx) push(mrs []storage.MetricRow, matchIdxs []byte) []byte {
|
||||
mn := &ctx.mn
|
||||
tss := ctx.tss
|
||||
labels := ctx.labels
|
||||
@@ -249,7 +248,7 @@ func (ctx *streamAggrCtx) push(mrs []storage.MetricRow, matchIdxs []uint32) []ui
|
||||
if sas.IsEnabled() {
|
||||
matchIdxs = sas.Push(tss, matchIdxs)
|
||||
} else if deduplicator != nil {
|
||||
matchIdxs = slicesutil.SetLength(matchIdxs, len(tss))
|
||||
matchIdxs = bytesutil.ResizeNoCopyMayOverallocate(matchIdxs, len(tss))
|
||||
for i := range matchIdxs {
|
||||
matchIdxs[i] = 1
|
||||
}
|
||||
|
||||
@@ -86,7 +86,7 @@ func loadRelabelConfig() (*promrelabel.ParsedConfigs, error) {
|
||||
if len(*relabelConfig) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
pcs, _, err := promrelabel.LoadRelabelConfigs(*relabelConfig)
|
||||
pcs, err := promrelabel.LoadRelabelConfigs(*relabelConfig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error when reading -relabelConfig=%q: %w", *relabelConfig, err)
|
||||
}
|
||||
|
||||
@@ -31,9 +31,6 @@ vmrestore-linux-ppc64le-prod:
|
||||
vmrestore-linux-386-prod:
|
||||
APP_NAME=vmrestore EXTRA_GO_BUILD_TAGS=$(VMRESTORE_GO_BUILD_TAGS) $(MAKE) app-via-docker-linux-386
|
||||
|
||||
vmrestore-linux-s390x-prod:
|
||||
APP_NAME=vmrestore EXTRA_GO_BUILD_TAGS=$(VMRESTORE_GO_BUILD_TAGS) $(MAKE) app-via-docker-linux-s390x
|
||||
|
||||
vmrestore-darwin-amd64-prod:
|
||||
APP_NAME=vmrestore EXTRA_GO_BUILD_TAGS=$(VMRESTORE_GO_BUILD_TAGS) $(MAKE) app-via-docker-darwin-amd64
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -37,10 +37,10 @@
|
||||
<meta property="og:title" content="UI for VictoriaMetrics">
|
||||
<meta property="og:url" content="https://victoriametrics.com/">
|
||||
<meta property="og:description" content="Explore and troubleshoot your VictoriaMetrics data">
|
||||
<script type="module" crossorigin src="./assets/index-zpalCSif.js"></script>
|
||||
<script type="module" crossorigin src="./assets/index-D13qGB62.js"></script>
|
||||
<link rel="modulepreload" crossorigin href="./assets/vendor-DY9kCvzk.js">
|
||||
<link rel="stylesheet" crossorigin href="./assets/vendor-D1GxaB_c.css">
|
||||
<link rel="stylesheet" crossorigin href="./assets/index-CBxdwuZH.css">
|
||||
<link rel="stylesheet" crossorigin href="./assets/index-I8MVeF75.css">
|
||||
</head>
|
||||
<body>
|
||||
<noscript>You need to enable JavaScript to run this app.</noscript>
|
||||
|
||||
8
app/vmui/packages/vmui/package-lock.json
generated
8
app/vmui/packages/vmui/package-lock.json
generated
@@ -17,7 +17,7 @@
|
||||
"react-input-mask": "^2.0.4",
|
||||
"react-router-dom": "^7.6.3",
|
||||
"uplot": "^1.6.32",
|
||||
"vite": "^7.1.11",
|
||||
"vite": "^7.1.5",
|
||||
"web-vitals": "^5.0.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
@@ -7660,9 +7660,9 @@
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/vite": {
|
||||
"version": "7.1.11",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-7.1.11.tgz",
|
||||
"integrity": "sha512-uzcxnSDVjAopEUjljkWh8EIrg6tlzrjFUfMcR1EVsRDGwf/ccef0qQPRyOrROwhrTDaApueq+ja+KLPlzR/zdg==",
|
||||
"version": "7.1.5",
|
||||
"resolved": "https://registry.npmjs.org/vite/-/vite-7.1.5.tgz",
|
||||
"integrity": "sha512-4cKBO9wR75r0BeIWWWId9XK9Lj6La5X846Zw9dFfzMRw38IlTk2iCcUt6hsyiDRcPidc55ZParFYDXi0nXOeLQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"esbuild": "^0.25.0",
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
"react-input-mask": "^2.0.4",
|
||||
"react-router-dom": "^7.6.3",
|
||||
"uplot": "^1.6.32",
|
||||
"vite": "^7.1.11",
|
||||
"vite": "^7.1.5",
|
||||
"web-vitals": "^5.0.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
@@ -4,10 +4,10 @@ import { Alert as APIAlert } from "../../../types";
|
||||
import { createSearchParams } from "react-router-dom";
|
||||
import Button from "../../Main/Button/Button";
|
||||
import Badges, { BadgeColor } from "../Badges";
|
||||
import { formatEventTime } from "../helpers";
|
||||
import {
|
||||
SearchIcon,
|
||||
} from "../../Main/Icons";
|
||||
import dayjs from "dayjs";
|
||||
import CodeExample from "../../Main/CodeExample/CodeExample";
|
||||
|
||||
interface BaseAlertProps {
|
||||
@@ -66,7 +66,7 @@ const BaseAlert = ({ item }: BaseAlertProps) => {
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Active at</td>
|
||||
<td>{formatEventTime(item.activeAt)}</td>
|
||||
<td>{dayjs(item.activeAt).format("DD MMM YYYY HH:mm:ss")}</td>
|
||||
</tr>
|
||||
{!!Object.keys(alertLabels).length && (
|
||||
<tr>
|
||||
@@ -82,7 +82,7 @@ const BaseAlert = ({ item }: BaseAlertProps) => {
|
||||
</table>
|
||||
{!!Object.keys(item.annotations || {}).length && (
|
||||
<>
|
||||
<span className="vm-alerts-title">Annotations</span>
|
||||
<span className="title">Annotations</span>
|
||||
<table>
|
||||
<colgroup>
|
||||
<col className="vm-col-md"/>
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import { useMemo } from "preact/compat";
|
||||
import "./style.scss";
|
||||
import { Group as APIGroup } from "../../../types";
|
||||
import { formatDuration, formatEventTime } from "../helpers";
|
||||
import dayjs from "dayjs";
|
||||
import { formatDuration } from "../helpers";
|
||||
import Badges, { BadgeColor } from "../Badges";
|
||||
|
||||
interface BaseGroupProps {
|
||||
@@ -47,10 +48,12 @@ const BaseGroup = ({ group }: BaseGroupProps) => {
|
||||
<td>{formatDuration(group.interval)}</td>
|
||||
</tr>
|
||||
)}
|
||||
<tr>
|
||||
<td className="vm-col-md">Last evaluation</td>
|
||||
<td>{formatEventTime(group.lastEvaluation)}</td>
|
||||
</tr>
|
||||
{!!group.lastEvaluation && (
|
||||
<tr>
|
||||
<td className="vm-col-md">Last evaluation</td>
|
||||
<td>{dayjs(group.lastEvaluation).format("DD MMM YYYY HH:mm:ss")}</td>
|
||||
</tr>
|
||||
)}
|
||||
{!!group.eval_offset && (
|
||||
<tr>
|
||||
<td className="vm-col-md">Eval offset</td>
|
||||
|
||||
@@ -6,7 +6,8 @@ import { SearchIcon, DetailsIcon } from "../../Main/Icons";
|
||||
import Button from "../../Main/Button/Button";
|
||||
import Alert from "../../Main/Alert/Alert";
|
||||
import Badges, { BadgeColor } from "../Badges";
|
||||
import { formatDuration, formatEventTime } from "../helpers";
|
||||
import dayjs from "dayjs";
|
||||
import { formatDuration } from "../helpers";
|
||||
import CodeExample from "../../Main/CodeExample/CodeExample";
|
||||
|
||||
interface BaseRuleProps {
|
||||
@@ -79,10 +80,12 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
<td>{formatDuration(item.duration)}</td>
|
||||
</tr>
|
||||
)}
|
||||
<tr>
|
||||
<td>Last evaluation</td>
|
||||
<td>{formatEventTime(item.lastEvaluation)}</td>
|
||||
</tr>
|
||||
{!!item.lastEvaluation && (
|
||||
<tr>
|
||||
<td>Last evaluation</td>
|
||||
<td>{dayjs(item.lastEvaluation).format("DD MMM YYYY HH:mm:ss")}</td>
|
||||
</tr>
|
||||
)}
|
||||
{!!item.lastError && item.health !== "ok" && (
|
||||
<tr>
|
||||
<td>Last error</td>
|
||||
@@ -105,7 +108,7 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
</table>
|
||||
{!!Object.keys(item?.annotations || {}).length && (
|
||||
<>
|
||||
<span className="vm-alerts-title">Annotations</span>
|
||||
<span className="title">Annotations</span>
|
||||
<table>
|
||||
<colgroup>
|
||||
<col className="vm-col-md"/>
|
||||
@@ -124,7 +127,7 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
)}
|
||||
{!!item?.updates?.length && (
|
||||
<>
|
||||
<span className="vm-alerts-title">{`Last updates ${item.updates.length}/${item.max_updates_entries}`}</span>
|
||||
<span className="title">{`Last updates ${item.updates.length}/${item.max_updates_entries}`}</span>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -140,11 +143,11 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
<tr
|
||||
key={update.at}
|
||||
>
|
||||
<td>{formatEventTime(update.time)}</td>
|
||||
<td>{dayjs(update.time).format("DD MMM YYYY HH:mm:ss")}</td>
|
||||
<td>{update.samples}</td>
|
||||
<td>{update.series_fetched}</td>
|
||||
<td>{formatDuration(update.duration / 1e9)}</td>
|
||||
<td>{formatEventTime(update.at)}</td>
|
||||
<td>{dayjs(update.at).format("DD MMM YYYY HH:mm:ss")}</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
@@ -153,7 +156,7 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
)}
|
||||
{!!item?.alerts?.length && (
|
||||
<>
|
||||
<span className="vm-alerts-title">Alerts</span>
|
||||
<span className="title">Alerts</span>
|
||||
<table>
|
||||
<colgroup>
|
||||
<col className="vm-col-sm"/>
|
||||
@@ -167,7 +170,7 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
<th>Active since</th>
|
||||
<th>State</th>
|
||||
<th>Value</th>
|
||||
<th className="vm-alerts-title">Labels</th>
|
||||
<th className="title">Labels</th>
|
||||
<th></th>
|
||||
</tr>
|
||||
</thead>
|
||||
@@ -177,7 +180,9 @@ const BaseRule = ({ item }: BaseRuleProps) => {
|
||||
id={`alert-${alert.id}`}
|
||||
key={alert.id}
|
||||
>
|
||||
<td>{formatEventTime(alert.activeAt)}</td>
|
||||
<td>
|
||||
{dayjs(alert.activeAt).format("DD MMM YYYY HH:mm:ss")}
|
||||
</td>
|
||||
<td>
|
||||
<Badges
|
||||
items={{ [alert.state]: { color: alert.state as BadgeColor } }}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
|
||||
.vm-alerts-title {
|
||||
.title {
|
||||
font-weight: bold;
|
||||
text-align: center;
|
||||
}
|
||||
@@ -48,13 +48,11 @@
|
||||
line-height: 30px;
|
||||
padding: 4px $padding-small;
|
||||
vertical-align: middle;
|
||||
white-space: nowrap;
|
||||
text-align: left;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
th {
|
||||
white-space: nowrap;
|
||||
}
|
||||
td.align-center {
|
||||
text-align: center
|
||||
}
|
||||
|
||||
@@ -13,8 +13,3 @@ export const formatDuration = (raw: number) => {
|
||||
}
|
||||
return duration.format(fmt.join(" "));
|
||||
};
|
||||
|
||||
export const formatEventTime = (raw: string) => {
|
||||
const t = dayjs(raw);
|
||||
return t.year() <= 1 ? "Never" : t.format("DD MMM YYYY HH:mm:ss");
|
||||
}
|
||||
|
||||
@@ -192,7 +192,7 @@ export interface Group {
|
||||
rules: Rule[];
|
||||
interval: number;
|
||||
limit: number;
|
||||
lastEvaluation: string;
|
||||
lastEvaluation: number;
|
||||
evaluationTime: number;
|
||||
type: string;
|
||||
id: string;
|
||||
@@ -216,7 +216,7 @@ export interface Rule {
|
||||
annotations: Record<string, string>;
|
||||
alerts: Alert[];
|
||||
health: string;
|
||||
lastEvaluation: string;
|
||||
lastEvaluation: number;
|
||||
lastError: string;
|
||||
evaluationTime: number;
|
||||
type: string;
|
||||
@@ -247,7 +247,7 @@ export interface Alert {
|
||||
expression: string;
|
||||
labels: Record<string, string>;
|
||||
annotations: Record<string, string>;
|
||||
activeAt: string;
|
||||
activeAt: number;
|
||||
id: string;
|
||||
source: string;
|
||||
restored: boolean;
|
||||
|
||||
@@ -2,10 +2,8 @@ package tests
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -36,29 +34,6 @@ func TestSingleVMAgentReloadConfigs(t *testing.T) {
|
||||
fmt.Sprintf(`-remoteWrite.urlRelabelConfig=%s`, relabelFilePath),
|
||||
}, ``)
|
||||
|
||||
checkResponse := func(query, expResponse string) {
|
||||
t.Helper()
|
||||
resp, err := http.Get(query)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot get response from %s: %s", query, err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
t.Fatalf("unexpected response from %s: %s", query, resp.Status)
|
||||
}
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
t.Fatalf("cannot read response from %s: %s", query, err)
|
||||
}
|
||||
if !strings.Contains(string(body), expResponse) {
|
||||
t.Fatalf("expected to get\n%s\nbut got\n%s", expResponse, string(body))
|
||||
}
|
||||
}
|
||||
|
||||
vmagentAddr := fmt.Sprintf("http://%s", vmagent.HTTPAddr())
|
||||
checkResponse(vmagentAddr+"/remotewrite-url-relabel-config", "replacement: value1")
|
||||
checkResponse(vmagentAddr+"/api/v1/status/remotewrite-url-relabel-config", "replacement: value1")
|
||||
|
||||
vmagent.APIV1ImportPrometheus(t, []string{
|
||||
"foo_bar 1 1652169600000", // 2022-05-10T08:00:00Z
|
||||
}, apptest.QueryOpts{})
|
||||
@@ -88,9 +63,6 @@ func TestSingleVMAgentReloadConfigs(t *testing.T) {
|
||||
|
||||
vmagent.ReloadRelabelConfigs(t)
|
||||
|
||||
checkResponse(vmagentAddr+"/remotewrite-url-relabel-config", "replacement: value2")
|
||||
checkResponse(vmagentAddr+"/api/v1/status/remotewrite-url-relabel-config", "replacement: value2")
|
||||
|
||||
vmagent.APIV1ImportPrometheus(t, []string{
|
||||
"bar_foo 1 1652169600001", // 2022-05-10T08:00:00Z
|
||||
}, apptest.QueryOpts{})
|
||||
|
||||
@@ -156,12 +156,6 @@ func (app *Vmagent) ReloadRelabelConfigs(t *testing.T) {
|
||||
t.Fatalf("relabel configs were not reloaded after SIGHUP signal; previous total: %f, current total: %f", prevTotal, currTotal)
|
||||
}
|
||||
|
||||
// HTTPAddr returns the address at which the vmagent process is listening
|
||||
// for http connections.
|
||||
func (app *Vmagent) HTTPAddr() string {
|
||||
return app.httpListenAddr
|
||||
}
|
||||
|
||||
// sendBlocking sends the data to vmstorage by executing `send` function and
|
||||
// waits until the data is actually sent.
|
||||
//
|
||||
|
||||
@@ -4609,7 +4609,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4671,8 +4670,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
|
||||
@@ -4680,7 +4680,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4753,9 +4752,7 @@
|
||||
],
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "11.5.0",
|
||||
"targets": [
|
||||
|
||||
@@ -1994,7 +1994,7 @@
|
||||
"baseFilters": [],
|
||||
"datasource": {
|
||||
"type": "victoriametrics-metrics-datasource",
|
||||
"uid": "PE8D8DB4BEE4E4B22"
|
||||
"uid": "$ds"
|
||||
},
|
||||
"filters": [],
|
||||
"name": "adhoc",
|
||||
|
||||
@@ -1169,7 +1169,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(controller_runtime_reconcile_time_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (le, controller) )",
|
||||
"expr": "histogram_quantile(0.99,sum(rate(controller_runtime_reconcile_time_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by(le,controller) )",
|
||||
"legendFormat": "q.99 {{controller}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
@@ -1266,7 +1266,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "sum(rate(rest_client_requests_total{job=~\"$job\"}[$__interval])) by (method, code)",
|
||||
"expr": "sum(rate(rest_client_requests_total{job=~\"$job\"}[$__interval])) by (method,code)",
|
||||
"instant": false,
|
||||
"legendFormat": "{{method}} {{code}}",
|
||||
"range": true,
|
||||
@@ -1490,7 +1490,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (job, instance, le))) by (job)",
|
||||
"expr": "max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (job, instance, le))) by(job)",
|
||||
"instant": false,
|
||||
"legendFormat": "__auto",
|
||||
"range": true,
|
||||
@@ -1589,7 +1589,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (le, method, api))",
|
||||
"expr": "histogram_quantile(0.99,sum(rate(rest_client_request_duration_seconds_bucket{job=~\"$job\"})) by(le,method,api) )",
|
||||
"instant": false,
|
||||
"legendFormat": "{{method}} {{api}}",
|
||||
"range": true,
|
||||
|
||||
@@ -4609,7 +4609,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4671,8 +4670,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
@@ -5640,7 +5637,7 @@
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "victoriametrics-metrics-datasource",
|
||||
"type": "victoriametrics-datasource",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"description": "Shows the approx time needed to reach 100% of allowed disk capacity for at least one vmstorage node based on the following params:\n* free disk space (after -storage.minFreeDiskSpaceBytes);\n* row ingestion rate;\n* compression.",
|
||||
@@ -5738,7 +5735,7 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "victoriametrics-metrics-datasource",
|
||||
"type": "victoriametrics-datasource",
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
@@ -10297,7 +10294,7 @@
|
||||
"uid": "$ds"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"}-vm_free_disk_space_limit_bytes{job=~\"$job_storage\", instance=~\"$instance\"}) \n/ \nignoring(path) (\n (rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d]) - \n sum(rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])) without(type)) * \n (\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"}) without(type) /\n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"}) without(type)\n )\n +\n rate(vm_new_timeseries_created_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d]) * \n (\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type=\"indexdb/file\"}) without(type) /\n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type=\"indexdb/file\"}) without(type)\n )\n) > 0",
|
||||
"expr": "(vm_free_disk_space_bytes{job=~\"$job_storage\", instance=~\"$instance\"}-vm_free_disk_space_limit_bytes{job=~\"$job_storage\", instance=~\"$instance\"}) \n/ \nignoring(path) (\n (rate(vm_rows_added_to_storage_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d]) - \n sum(rate(vm_deduplicated_samples_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d])) without(type)) * \n (\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"}) without(type) /\n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type!~\"indexdb.*\"}) without(type)\n )\n +\n rate(vm_new_timeseries_created_total{job=~\"$job_storage\", instance=~\"$instance\"}[1d]) * \n (\n sum(vm_data_size_bytes{job=~\"$job_storage\", instance=~\"$instance\", type=\"indexdb/file\"}) without(type) /\n sum(vm_rows{job=~\"$job_storage\", instance=~\"$instance\", type=\"indexdb/file\"}) without(type) \n )\n) > 0",
|
||||
"format": "time_series",
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
@@ -10649,8 +10646,8 @@
|
||||
"list": [
|
||||
{
|
||||
"current": {
|
||||
"text": "VictoriaMetrics - cluster",
|
||||
"value": "PAF93674D0B4E9963"
|
||||
"text": "victoriametrics-metrics-datasource",
|
||||
"value": "ceuqoq3dxttkwb"
|
||||
},
|
||||
"includeAll": false,
|
||||
"name": "ds",
|
||||
|
||||
@@ -4681,7 +4681,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4754,9 +4753,7 @@
|
||||
],
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "11.5.0",
|
||||
"targets": [
|
||||
|
||||
@@ -4192,7 +4192,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4254,8 +4253,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
|
||||
@@ -2509,7 +2509,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -2571,8 +2570,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
@@ -4241,4 +4238,4 @@
|
||||
"title": "VictoriaMetrics - vmalert (VM)",
|
||||
"uid": "LzldHAVnz_vm",
|
||||
"version": 1
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2238,7 +2238,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -2301,8 +2300,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
@@ -2655,7 +2652,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "victoriametrics-datasource",
|
||||
"uid": "P38648FE0F8C5BEA2"
|
||||
"uid": "$ds"
|
||||
},
|
||||
"filters": [],
|
||||
"hide": 0,
|
||||
|
||||
@@ -4191,7 +4191,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -4253,8 +4252,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
|
||||
@@ -2508,7 +2508,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -2570,8 +2569,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
|
||||
@@ -2237,7 +2237,6 @@
|
||||
"cellOptions": {
|
||||
"type": "auto"
|
||||
},
|
||||
"filterable": true,
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
@@ -2300,8 +2299,6 @@
|
||||
"show": false
|
||||
},
|
||||
"showHeader": true,
|
||||
"showSearch": true,
|
||||
"filterable": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"desc": true,
|
||||
|
||||
@@ -176,9 +176,6 @@ app-via-docker-linux-ppc64le:
|
||||
app-via-docker-linux-386:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-via-docker-goos-goarch
|
||||
|
||||
app-via-docker-linux-s390x:
|
||||
CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-via-docker-goos-goarch
|
||||
|
||||
app-via-docker-darwin-amd64:
|
||||
CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-via-docker-goos-goarch
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ services:
|
||||
# It scrapes targets defined in --promscrape.config
|
||||
# And forwards them to --remoteWrite.url
|
||||
vmagent:
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
depends_on:
|
||||
- "vmauth"
|
||||
ports:
|
||||
@@ -37,14 +37,14 @@ services:
|
||||
# vmstorage shards. Each shard receives 1/N of all metrics sent to vminserts,
|
||||
# where N is number of vmstorages (2 in this case).
|
||||
vmstorage-1:
|
||||
image: victoriametrics/vmstorage:v1.128.0-cluster
|
||||
image: victoriametrics/vmstorage:v1.127.0-cluster
|
||||
volumes:
|
||||
- strgdata-1:/storage
|
||||
command:
|
||||
- "--storageDataPath=/storage"
|
||||
restart: always
|
||||
vmstorage-2:
|
||||
image: victoriametrics/vmstorage:v1.128.0-cluster
|
||||
image: victoriametrics/vmstorage:v1.127.0-cluster
|
||||
volumes:
|
||||
- strgdata-2:/storage
|
||||
command:
|
||||
@@ -54,7 +54,7 @@ services:
|
||||
# vminsert is the ingestion frontend. It receives metrics pushed by vmagent,
|
||||
# pre-processes them and distributes them across configured vmstorage shards.
|
||||
vminsert-1:
|
||||
image: victoriametrics/vminsert:v1.128.0-cluster
|
||||
image: victoriametrics/vminsert:v1.127.0-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -63,7 +63,7 @@ services:
|
||||
- "--storageNode=vmstorage-2:8400"
|
||||
restart: always
|
||||
vminsert-2:
|
||||
image: victoriametrics/vminsert:v1.128.0-cluster
|
||||
image: victoriametrics/vminsert:v1.127.0-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -75,7 +75,7 @@ services:
|
||||
# vmselect is a query frontend. It serves read queries in MetricsQL or PromQL.
|
||||
# vmselect collects results from configured `--storageNode` shards.
|
||||
vmselect-1:
|
||||
image: victoriametrics/vmselect:v1.128.0-cluster
|
||||
image: victoriametrics/vmselect:v1.127.0-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -85,7 +85,7 @@ services:
|
||||
- "--vmalert.proxyURL=http://vmalert:8880"
|
||||
restart: always
|
||||
vmselect-2:
|
||||
image: victoriametrics/vmselect:v1.128.0-cluster
|
||||
image: victoriametrics/vmselect:v1.127.0-cluster
|
||||
depends_on:
|
||||
- "vmstorage-1"
|
||||
- "vmstorage-2"
|
||||
@@ -100,7 +100,7 @@ services:
|
||||
# read requests from Grafana, vmui, vmalert among vmselects.
|
||||
# It can be used as an authentication proxy.
|
||||
vmauth:
|
||||
image: victoriametrics/vmauth:v1.128.0
|
||||
image: victoriametrics/vmauth:v1.127.0
|
||||
depends_on:
|
||||
- "vmselect-1"
|
||||
- "vmselect-2"
|
||||
@@ -114,7 +114,7 @@ services:
|
||||
|
||||
# vmalert executes alerting and recording rules
|
||||
vmalert:
|
||||
image: victoriametrics/vmalert:v1.128.0
|
||||
image: victoriametrics/vmalert:v1.127.0
|
||||
depends_on:
|
||||
- "vmauth"
|
||||
ports:
|
||||
|
||||
@@ -3,7 +3,7 @@ services:
|
||||
# It scrapes targets defined in --promscrape.config
|
||||
# And forwards them to --remoteWrite.url
|
||||
vmagent:
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -18,7 +18,7 @@ services:
|
||||
# VictoriaMetrics instance, a single process responsible for
|
||||
# storing metrics and serving read requests.
|
||||
victoriametrics:
|
||||
image: victoriametrics/victoria-metrics:v1.128.0
|
||||
image: victoriametrics/victoria-metrics:v1.127.0
|
||||
ports:
|
||||
- 8428:8428
|
||||
- 8089:8089
|
||||
@@ -54,7 +54,7 @@ services:
|
||||
|
||||
# vmalert executes alerting and recording rules
|
||||
vmalert:
|
||||
image: victoriametrics/vmalert:v1.128.0
|
||||
image: victoriametrics/vmalert:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
- "alertmanager"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
services:
|
||||
vmagent:
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -14,7 +14,7 @@ services:
|
||||
restart: always
|
||||
|
||||
victoriametrics:
|
||||
image: victoriametrics/victoria-metrics:v1.128.0
|
||||
image: victoriametrics/victoria-metrics:v1.127.0
|
||||
ports:
|
||||
- 8428:8428
|
||||
volumes:
|
||||
@@ -40,7 +40,7 @@ services:
|
||||
restart: always
|
||||
|
||||
vmalert:
|
||||
image: victoriametrics/vmalert:v1.128.0
|
||||
image: victoriametrics/vmalert:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
|
||||
@@ -10,9 +10,9 @@ sitemap:
|
||||
|
||||
- To use *vmanomaly*, part of the enterprise package, a license key is required. Obtain your key [here](https://victoriametrics.com/products/enterprise/trial/) for this tutorial or for enterprise use.
|
||||
- In the tutorial, we'll be using the following VictoriaMetrics components:
|
||||
- [VictoriaMetrics Single-Node](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) (v1.128.0)
|
||||
- [vmalert](https://docs.victoriametrics.com/victoriametrics/vmalert/) (v1.128.0)
|
||||
- [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) (v1.128.0)
|
||||
- [VictoriaMetrics Single-Node](https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/) (v1.127.0)
|
||||
- [vmalert](https://docs.victoriametrics.com/victoriametrics/vmalert/) (v1.127.0)
|
||||
- [vmagent](https://docs.victoriametrics.com/victoriametrics/vmagent/) (v1.127.0)
|
||||
- [Grafana](https://grafana.com/) (v10.2.1)
|
||||
- [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/)
|
||||
- [Node exporter](https://github.com/prometheus/node_exporter#node-exporter) (v1.7.0) and [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) (v0.27.0)
|
||||
@@ -323,7 +323,7 @@ Let's wrap it all up together into the `docker-compose.yml` file.
|
||||
services:
|
||||
vmagent:
|
||||
container_name: vmagent
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
@@ -340,7 +340,7 @@ services:
|
||||
|
||||
victoriametrics:
|
||||
container_name: victoriametrics
|
||||
image: victoriametrics/victoria-metrics:v1.128.0
|
||||
image: victoriametrics/victoria-metrics:v1.127.0
|
||||
ports:
|
||||
- 8428:8428
|
||||
volumes:
|
||||
@@ -373,7 +373,7 @@ services:
|
||||
|
||||
vmalert:
|
||||
container_name: vmalert
|
||||
image: victoriametrics/vmalert:v1.128.0
|
||||
image: victoriametrics/vmalert:v1.127.0
|
||||
depends_on:
|
||||
- "victoriametrics"
|
||||
ports:
|
||||
|
||||
@@ -249,27 +249,27 @@ services:
|
||||
- grafana_data:/var/lib/grafana/
|
||||
|
||||
vmsingle:
|
||||
image: victoriametrics/victoria-metrics:v1.128.0
|
||||
image: victoriametrics/victoria-metrics:v1.127.0
|
||||
command:
|
||||
- -httpListenAddr=0.0.0.0:8429
|
||||
|
||||
vmstorage:
|
||||
image: victoriametrics/vmstorage:v1.128.0-cluster
|
||||
image: victoriametrics/vmstorage:v1.127.0-cluster
|
||||
|
||||
vminsert:
|
||||
image: victoriametrics/vminsert:v1.128.0-cluster
|
||||
image: victoriametrics/vminsert:v1.127.0-cluster
|
||||
command:
|
||||
- -storageNode=vmstorage:8400
|
||||
- -httpListenAddr=0.0.0.0:8480
|
||||
|
||||
vmselect:
|
||||
image: victoriametrics/vmselect:v1.128.0-cluster
|
||||
image: victoriametrics/vmselect:v1.127.0-cluster
|
||||
command:
|
||||
- -storageNode=vmstorage:8401
|
||||
- -httpListenAddr=0.0.0.0:8481
|
||||
|
||||
vmagent:
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
volumes:
|
||||
- ./scrape.yaml:/etc/vmagent/config.yaml
|
||||
command:
|
||||
@@ -278,7 +278,7 @@ services:
|
||||
- -remoteWrite.url=http://vmsingle:8429/api/v1/write
|
||||
|
||||
vmgateway-cluster:
|
||||
image: victoriametrics/vmgateway:v1.128.0-enterprise
|
||||
image: victoriametrics/vmgateway:v1.127.0-enterprise
|
||||
ports:
|
||||
- 8431:8431
|
||||
volumes:
|
||||
@@ -294,7 +294,7 @@ services:
|
||||
- -auth.oidcDiscoveryEndpoints=http://keycloak:8080/realms/master/.well-known/openid-configuration
|
||||
|
||||
vmgateway-single:
|
||||
image: victoriametrics/vmgateway:v1.128.0-enterprise
|
||||
image: victoriametrics/vmgateway:v1.127.0-enterprise
|
||||
ports:
|
||||
- 8432:8431
|
||||
volumes:
|
||||
@@ -405,7 +405,7 @@ Once iDP configuration is done, vmagent configuration needs to be updated to use
|
||||
|
||||
```yaml
|
||||
vmagent:
|
||||
image: victoriametrics/vmagent:v1.128.0
|
||||
image: victoriametrics/vmagent:v1.127.0
|
||||
volumes:
|
||||
- ./scrape.yaml:/etc/vmagent/config.yaml
|
||||
- ./vmagent-client-secret:/etc/vmagent/oauth2-client-secret
|
||||
|
||||
@@ -1,399 +0,0 @@
|
||||
---
|
||||
build:
|
||||
list: never
|
||||
publishResources: false
|
||||
render: never
|
||||
sitemap:
|
||||
disable: true
|
||||
---
|
||||
|
||||
## Matching Architecture to Risk
|
||||
|
||||
The complexity of any monitoring system is not an end in itself. It is a direct response to two questions: what risks are we protecting against, and how much performance do we need? This guide is designed to help you choose an architecture that precisely matches your answers.
|
||||
|
||||
### Availability as a Guarantee Against Risk
|
||||
|
||||
It's a common mistake to think of availability as a simple number. In reality, availability is a guarantee against a specific level of risk. For example, 99.9% ("three nines") availability allows for about 44 minutes of downtime per month. Before chasing higher nines, ask yourself: is this level of downtime acceptable for your defined risks? Remember that each additional 'nine' of availability often comes with an exponential increase in both system complexity and operational cost.
|
||||
|
||||
The scope of the failure you are designing for is your **"blast radius".** Before choosing an architecture, you must first define the blast radius you need to withstand.
|
||||
|
||||
### Resilience and Scalability
|
||||
|
||||
It is also crucial to distinguish between two fundamental goals:
|
||||
|
||||
* **Resilience (or Availability)** is about surviving failures. We achieve it by creating copies ([replicas](https://docs.victoriametrics.com/cluster-victoriametrics/#replication-and-data-safety)) of our components and data.
|
||||
* **Scalability (or Performance)** is about handling load. We achieve it by [adding more components](https://docs.victoriametrics.com/victoriametrics/#scalability-and-cluster-version) on every layer to distribute the work.
|
||||
|
||||
The architectures in this guide are simply different combinations of these two approaches, designed to handle a specific blast radius.
|
||||
|
||||
### Architectures as Answers to Blast Radius
|
||||
|
||||
Each subsequent section of this guide presents an architecture designed to handle a specific blast radius, moving from the most straightforward setup to the most resilient.
|
||||
|
||||
* **[Basic](#basic) (No Resilience).** This architecture is the baseline for non-critical systems. It has no fault tolerance, and its blast radius is the instance itself. Any failure leads to a complete outage.
|
||||
* **[Single AZ Cluster](#single-availability-zone) (Node-Level Resilience).** This architecture protects against the failure of individual servers (nodes) or application instances within a single Availability Zone. However, its blast radius is the entire AZ; it will not survive a datacenter-wide outage.
|
||||
* **[Multi-Cluster and Multi-AZ](#multi-cluster-and-multi-az) (Cluster/AZ/Datacenter-Level Resilience).** Designed as a disaster recovery solution, this architecture can withstand the complete failure of an entire availability zone or data center.
|
||||
* **[Hyperscale](#the-hyperscale-cell-based) (Cell/AZ and Region-Level Resilience).** This advanced architecture is built to survive failures of entire Availability Zones or logical "cells" within a region, often degrading gracefully instead of failing.
|
||||
* **[Logical Layers](#logical-layers) (Logical Resilience).** This is not a physical resilience level but an architectural layer on top of any setup. It addresses the risk of data access conflicts by providing strong logical isolation between different teams or customers.
|
||||
|
||||
### The decision tree
|
||||
|
||||
<p align="center">
|
||||
<img src="decision-tree.webp" alt="Decision Tree" width="80%">
|
||||
</p>
|
||||
|
||||
## Basic
|
||||
|
||||
**Recommended for:** Pet projects, development/test stages, and non-critical systems monitoring.
|
||||
|
||||
Installation guide reference: [https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-single](https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-single)
|
||||
|
||||
**Key characteristics**: Single instance that does everything: stores, retrieves, and provides metrics.
|
||||
|
||||
**Pros**:
|
||||
|
||||
* **Straightforward.** Quick deployment without additional components
|
||||
* **Cost-efficient.** It avoids redundant work, such as writing or transmitting the same data twice, thereby reducing both computational and network expenses. Additionally, there are no extra copies of data.
|
||||
|
||||
**Cons**:
|
||||
|
||||
* **Single point of failure.** No fault tolerance and no availability
|
||||
|
||||
**Schema:**
|
||||
|
||||
<p align="center">
|
||||
<img src="basic-architecture.webp" alt="Basic Architecture" width="40%">
|
||||
</p>
|
||||
|
||||
### Unavailability Scenarios
|
||||
|
||||
In this simplest setup, any single-node failure leads to temporary data unavailability or loss until the instance restarts or storage is restored. There are no built-in redundancy or replication layers.
|
||||
|
||||
In this setup, you can increase availability by utilizing backup and restore mechanisms on various levels: hardware, virtualization, persistent volume management, or application. VictoriaMetrics provides the [backup tools](https://docs.victoriametrics.com/victoriametrics/vmbackup/) to achieve that.
|
||||
|
||||
## Single Availability Zone
|
||||
|
||||
**Recommended for:** Single availability zone hosted systems of any scale
|
||||
|
||||
Installation guide reference: [https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-cluster](https://docs.victoriametrics.com/guides/k8s-monitoring-via-vm-cluster)
|
||||
|
||||
High availability implementation: [https://docs.victoriametrics.com/guides/k8s-ha-monitoring-via-vm-cluster](https://docs.victoriametrics.com/guides/k8s-ha-monitoring-via-vm-cluster)
|
||||
|
||||
**Key characteristics:** This is a complete VictoriaMetrics cluster, commonly running in a single Kubernetes cluster. Each component of the cluster: vminsert, vmselect, and vmstorage has multiple copies (replicas). The data is also copied and sharded between vmstorage nodes using the `--replicationFactor` setting on vminsert. [See the official documentation](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/#replication-and-data-safety) to determine the optimal replication factor for your needs.
|
||||
|
||||
**Pros**:
|
||||
|
||||
* **Reliability.** The system can survive a failure of any component without service disruption. If a single vmstorage node dies, other replicas continue to operate. And it is the same for other components.
|
||||
|
||||
**Cons**:
|
||||
|
||||
* **No disaster recovery.** If the entire Kubernetes cluster, availability zone, or data center fails, you lose the entire monitoring system.
|
||||
* **Increased Cost:** Storage cost grows linearly with the replicationFactor (e.g., RF=2 equals 2x storage, RF=3 equals 3x). Compute components like vminsert or vmselect scale horizontally and increase throughput rather than duplicating data.
|
||||
|
||||
**Schema:**
|
||||
|
||||
<p align="center">
|
||||
<img src="single-az-architecture.webp" alt="Single AZ Architecture" width="60%">
|
||||
</p>
|
||||
|
||||
### Application vs. Storage Replication
|
||||
|
||||
When building a resilient cluster, several replication options are available.
|
||||
|
||||
**Path A: Application-Level Replication.** This approach is enabled [by setting](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/#replication-and-data-safety) the `-replicationFactor=N` flag, where N is an integer representing the desired number of replicas. It makes the cluster components responsible for writing N copies of the data across different vmstorage nodes.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Guaranteed Query Completeness on Node Failure.** The key advantage is that the cluster is aware of its replication. It can survive a complete vmstorage node failure and still guarantee 100% complete query responses from the remaining replicas (as long as the number of failed nodes is less than the replication factor).
|
||||
* **Infrastructure-Independent Logic.** The replication logic is part of the VictoriaMetrics application, ensuring the same predictable behavior whether you run on-premise or on any cloud provider.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Latency sensitivity risks**. A slow or overloaded replica can increase write latency, since inserts must complete on multiple nodes. A larger number of nodes increases the risk of problems with one of them.
|
||||
|
||||
**Path B: Storage-Level Replication (The Cloud Provider Way).** In this model, the VictoriaMetrics replication factor is set to 1, and the vmstorage data is backed by cloud-provided replicated volumes (e.g., AWS EBS replicated within an AZ, Google Zonal PD).
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Offloaded Resource Cost.** The data replication is no longer bound by application CPU and network performance, and is offloaded to the cloud provider's storage infrastructure.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **No read resilience.** Any vmstorage restart (including planned maintenance) or failure makes its data temporarily unavailable for querying.
|
||||
* **Failover duration.** When a node or disk fails, the PVC must be reattached to another node. For zonal volumes (single AZ), this can take seconds to minutes (e.g., 10-60 seconds for clean detach/attach; up to 5 minutes in force-detach cases), making data from that shard temporarily unavailable for querying until reschedule completes.
|
||||
|
||||
### Query Consistency Partial vs. Complete Responses
|
||||
|
||||
In a large, distributed system, partial failures are a common occurrence. A critical choice is how your read path should behave when only partial data can be retrieved.
|
||||
|
||||
**Path A: Allow Partial Responses (Focus on Availability).** By default, if a vmstorage node is down, vmselect will continue getting results from the healthy vmstorage nodes. If the number of vmstorage nodes that fail to respond is greater than or equal to the replicationFactor, the response will have the "isPartial" field set to true.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **High Availability.** Some data, albeit incomplete, would still be available. The system degrades gracefully. It will continue to return available data from the remaining healthy nodes, rather than failing the entire query.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Risk of incomplete data.** Users might not notice the "partial" warning and make decisions based on incomplete and possibly misleading graphs.
|
||||
|
||||
**Path B:** **Deny Partial Responses (Focus on Consistency)** You can configure vmselect with the `-search.denyPartialResponse` flag. If vmselect cannot fetch a complete result from all vmstorage nodes that hold the requested data according to the replication factor value, it will return an error instead of a partial result.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Guaranteed data consistency.** This approach ensures that any successful query returns 100% of the requested data. If vmselect receives only a partial response from its vmstorage nodes, the entire query is marked as failed, preventing any misleading or incomplete results.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Lower Availability.** This approach sacrifices availability to guarantee consistency: if replicationFactor or more vmstorage nodes are unavailable, read queries will start returning errors.
|
||||
|
||||
### Buffering Strategy Trade-off
|
||||
|
||||
Once you have a vmagent sending data to the storage component (vmsingle or cluster), you face your first important trade-off: what should vmagent do when the storage is temporarily unavailable? This choice defines the trade-off between higher availability (by not losing data) and lower resource consumption (by not using disk). By default, vmagent acts as a durable queue: it persists compressed unsent data to the local filesystem. The size of the queue is controlled via `--remoteWrite.maxDiskUsagePerURL` and can be [estimated in advance](https://docs.victoriametrics.com/victoriametrics/vmagent/#calculating-disk-space-for-persistence-queue).
|
||||
|
||||
**Path A: Stateful Mode (Most Reliable).** By default, [the operator uses ephemeral storage](https://docs.victoriametrics.com/operator/resources/vmagent/#statefulmode) for the vmagent queue. In production, we recommend explicitly configuring a PersistentVolumeClaim (PVC) for vmagent to ensure the buffer is stored on a persistent disk and survives pod restarts. See [the documentation](https://docs.victoriametrics.com/victoriametrics/vmagent/#on-disk-persistence) about on-disk persistence for details.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Improved reliability.** Unsent data is safe during vmagent restarts or when remote storage is down (until queue is full).
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Requires additional resources.** Deployment becomes stateful and consumes disk space and I/O. A large queue can put extra pressure on remote storage once it becomes available again.
|
||||
|
||||
For Enterprise users, the queueing can be offloaded to an external message broker, such as **Kafka**. In that case vmagent can [read or write into Kafka](https://docs.victoriametrics.com/victoriametrics/integrations/kafka/).
|
||||
|
||||
**Path B: Ephemeral Buffering (with tmpfs).** For maximum performance, the vmagent buffer directory can be mounted as a tmpfs volume, which is physically stored in the node's RAM. In Kubernetes, this is configured via `emptyDir: { medium: "Memory" }`.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Fast I/O.** Buffering happens at RAM speed. This path safeguards against brief network outages without any loss of performance.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Significant risk of data loss.** Unsent data is lost on vmagent restarts. The queue size is limited by the available memory.
|
||||
|
||||
### Unavailability Scenarios
|
||||
|
||||
**Blast radius:** Cluster
|
||||
|
||||
* **Instance/pod failure:**
|
||||
* Path A (Application-level replication, RF ≥2): no impact; cluster continues with remaining replicas.
|
||||
* Path B (Storage-level replication, RF=1): temporary data unavailability (can be around 1 minute due to PVC detach/attach, depending on the type of replication).
|
||||
* Path A (Buffering Strategy Trade-off, stateful): if vmagent uses a PersistentVolumeClaim, buffered data survives pod restarts and is replayed automatically.
|
||||
* Path B (Buffering Strategy Trade-off, ephemeral): if vmagent uses an in-memory (tmpfs) buffer, all unsent samples are lost on restart.
|
||||
* **Node/server failure:** pods rescheduled; impact depends on replication mode.
|
||||
* **AZ/datacenter failure:** complete outage; no cross-AZ protection.
|
||||
* **vminsert or vmstorage unavailability:**
|
||||
* Path A, Path B (Buffering Strategy Trade-off): buffered data is replayed after reconnecting.
|
||||
|
||||
## Multi-Cluster and Multi-AZ
|
||||
|
||||
**Recommended for:** Large-scale workloads or services with high SLA requirements that must survive the complete failure of a datacenter or an Availability Zone (AZ).
|
||||
|
||||
High availability implementation: [https://docs.victoriametrics.com/guides/multi-regional-setup-dedicated-regions](https://docs.victoriametrics.com/guides/multi-regional-setup-dedicated-regions)
|
||||
|
||||
**Key characteristics:** The core principle of this architecture is to run two or more independent, self-contained VictoriaMetrics clusters (from the [Single AZ](#single-availability-zone) section) in separate failure domains, such as different Availability Zones or geographic regions. A global, stateless layer is responsible for routing write and read traffic to these clusters. Each participating AZ must be provisioned to handle the entire workload if another AZ fails.
|
||||
|
||||
There are no differences in the VictoriaMetrics clusters' topology regarding the multi-AZ approach. It can be Active-Active or Active-Passive - the schema will be the same.
|
||||
|
||||
To ensure reliability, vmagent implements the bulkhead pattern: each destination URL configured via `--remoteWrite.url` is assigned a dedicated data queue and an isolated pool of workers. This isolates the data streams, ensuring that if one storage destination becomes slow or unavailable, it does not impact data delivery to the others.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Disaster Recovery:** The system can survive a complete failure of one cluster's location (AZ or region).
|
||||
* **Isolation:** Incidents, maintenance, or configuration errors in one cluster do not affect the others.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Increased Cost:** You are paying more for the infrastructure (compute, storage, and network). The capacity of vmstorage in each AZ is underutilized, since every AZ must be ready to absorb the full traffic load in case of failure. Overhead is ~100% with 2 AZs (50% utilization). For other components, it is possible to use [VPA](https://kubernetes.io/docs/concepts/workloads/autoscaling) or [HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale).
|
||||
|
||||
**Schema:**
|
||||
|
||||
<p align="center">
|
||||
<img src="multi-az-architecture.webp" alt="Multi-AZ Architecture" width="65%">
|
||||
</p>
|
||||
|
||||
### Unavailability Scenarios
|
||||
|
||||
**Blast radius:** Availability zone
|
||||
|
||||
* **Primary region failure (Active-Passive):** switchover in minutes; stale reads until DNS/load balancer/BGP reroute.
|
||||
|
||||
* **Single AZ/cluster failure (Active-Active):** seamless reroute; read results may temporarily differ between clusters if cross-AZ replication lags.
|
||||
|
||||
* **Cross-region link failure:**
|
||||
|
||||
* Writes: buffered by vmagent.
|
||||
* Reads: may return stale data until the link is restored.
|
||||
|
||||
## The Hyperscale (Cell-based)
|
||||
|
||||
**Recommended for:** Systems that require extra reliability and scalability across multiple regions and zones.
|
||||
|
||||
**Key characteristics:** This architecture is built on two main ideas - cells and the separation of routing and storage paths
|
||||
|
||||
First, we have logical groups of Availability Zones (AZs). Think of these as our data pods. Inside these groups, we deploy our basic clusters. The data within these groups can be distributed in two ways:
|
||||
- **Fully replicated:** An identical copy exists in each AZ.
|
||||
- **Sharded:** Each AZ holds a portion of the data. For example, with replication factor 3 across 4 cells, each cell stores approximately 75% of all metrics.
|
||||
|
||||
Inside each Storage Cell, the VictoriaMetrics cluster is configured with a `-replicationFactor` of 1. High availability is achieved by replicating data across multiple cells by the global routing layer, not within the cell or the cluster.
|
||||
|
||||
Next, we have a separate, stateless layer of routing cells. Their only purpose is to manage traffic. They accept all incoming data and queries and intelligently route them to the correct storage groups. This separation of routing and storage is key to the design.
|
||||
|
||||
For complete disaster recovery, this entire cell-based architecture is duplicated in a second geographic region.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Maximum Fault Tolerance:** The system survives failures of servers, entire storage cells, and even availability zones within a region. It degrades gracefully instead of failing completely.
|
||||
* **Horizontal Scaling:** You can add new storage cells to increase capacity or new routing cells to handle more traffic.
|
||||
|
||||
**Cons / Trade-offs:**
|
||||
|
||||
* **Increased Complexity:** This architecture requires significant expertise and a large amount of automation (a control plane) to manage the routing and data placement.
|
||||
* **High Cost:** The number of components and the data redundancy make this the most expensive option.
|
||||
|
||||
**Schema:**
|
||||
|
||||
A global, stateless layer of routing cells (vmagent, vmauth) sits on top. It routes traffic to several logical groups of storage cells. Each storage group contains multiple AZs, and data is replicated or sharded across them. There are several approaches to implementing it.
|
||||
|
||||
<p align="center">
|
||||
<img src="hyperscale-architecture.webp" alt="Hyperscale Architecture" width="85%">
|
||||
</p>
|
||||
|
||||
### Choosing Your Read Path Strategy
|
||||
|
||||
When you build a system that spans multiple AZs or regions, you face a fundamental choice: how to read the data? The answer to this question will define the trade-offs in your architecture between data completeness, query speed, and cost. Your choice of how to write data directly impacts how you can read it. Let's look at two pairs of write/read strategies.
|
||||
|
||||
### Path A: Prioritize Data Completeness (The Global vmselect model)
|
||||
|
||||
In this model, your primary goal is to obtain as complete and consistent data as possible for every query, even if some storage cells are lagging behind.
|
||||
|
||||
**Write Path:** vmagent [shards data](https://docs.victoriametrics.com/victoriametrics/vmagent/#sharding-among-remote-storages) across your storage cells. Fault tolerance is configured via `-remoteWrite.shardByURL` and `-remoteWrite.shardByURLReplicas` (for example, writing each time series to 3 out of 4 cells). Redundancy is achieved across cells, not within a cell. This provides resilience against cell failures while saving storage compared to full copies.
|
||||
|
||||
**Read Path:** You use a two-level vmselect system. A global vmselect receives user queries. In turn, it queries local vmselects in each of your storage cells and merges the results. Exposing the local vmselects to the global one is necessary because it may not be possible to connect directly to vmstorage in the local cell, especially if it runs in Kubernetes, as there is no HTTP endpoint for querying vmstorage, and using NodePort may not be a good practice for production.
|
||||
|
||||
**Schema:**
|
||||
|
||||
Global vmselect -> Local vmselects (in each cell)
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **High availability of complete data.** The global VMSelect can fill in any gaps from a lagging cell by retrieving data from another replica. The higher the replicationFactor, the more durable it is against storage failures.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **High resource overhead.** The global VMSelect performs a significant amount of redundant work, merging and aggregating data. This requires significant CPU and memory, and increases query latency.
|
||||
|
||||
### Path B: Focus on Read Speed (The vmauth with first_available mode)
|
||||
|
||||
In this model, your primary goal is to provide users with the fastest possible response, accepting certain risks associated with data freshness.
|
||||
|
||||
**Write Path:** This is where you face another choice. To make the `first_available` read path work, every storage cell must contain a full copy of all data. This is achieved by configuring the global vmagent to replicate 100% of the write traffic to every storage cell, i.e. by providing all storage cell URLs in the `-remoteWrite.url` flags. If you list only some of the storage cells, it will affect the completeness of the data on the read path.
|
||||
|
||||
**Read Path:** A global vmauth directs the user to the first available cell.
|
||||
|
||||
**Schema:**
|
||||
|
||||
Global vmauth -> Cell -> vmselect
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Very fast queries.** There is no overhead from merging results from multiple cells.
|
||||
* **Low cross-cell traffic for reads.** This can significantly reduce network costs.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **High storage cost.** You are storing redundant, full copies of data, which is an expensive approach.
|
||||
* **The Freshness Trap.** This is the most significant risk associated with this approach. If the write path to one storage cell slows down, vmagent will start buffering data for it. Internally, vmagent maintains a separate queue for each `-remoteWrite.url` target, so lag in a single cell can cause it to serve stale results under the `first_available` policy. If vmauth sends a user to this cell while its queue is not empty, that user will receive stale data (data that is not 100% fresh). Automation can be used to disable reads from cells that are lagging behind.
|
||||
|
||||
### Alerting Strategy Trade-offs
|
||||
|
||||
Just like the read path, your alerting strategy in a hyperscale setup also involves critical trade-offs.
|
||||
|
||||
**Path A: Local vmalert (Fast Evaluation, High Traffic). In this model, you deploy vmalert inside each storage cell.**
|
||||
|
||||
**How it works:** Each vmalert queries its local vmselect for data. This is very fast and efficient. It then sends its firing alerts to a global Alertmanager cluster, which is likely located in the compute cells.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Low latency for alert evaluation.** Query evaluation is always local and fast.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Inconsistent alerts (if data is sharded).** This approach only works reliably if every storage cell has a full copy of the data (Read Path B from the upper tradeoff of this section). If data is sharded (Read Path A), no single vmalert has a complete picture, so global alerts cannot be evaluated correctly.
|
||||
* **High traffic cost.** Every vmalert instance must send its alerts to **every** Alertmanager instance in the global cluster. If you have many storage cells and Alertmanagers in different AZs or regions, this creates a lot of expensive cross-network traffic. This consideration is especially important for those who want to minimize cross-region traffic.
|
||||
|
||||
**Path B: Global vmalert (Consistent Alerts, Higher Latency). In this model, you move vmalert out of the storage cells and into the global compute cells.**
|
||||
|
||||
**How it works:** The global vmalert instances query the same entry point as users (either the global vmselect or vmauth). This provides them with a comprehensive view of all data. They then send alerts to their local Alertmanager instances in the same compute cell.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Consistent, global view.** Alerts are always evaluated against the complete dataset. This works perfectly with the efficient sharded write path (Read Path A).
|
||||
* **Low alert traffic.** The communication between vmalert and Alertmanager is all local within the compute cell, which significantly reduces cross-AZ/region traffic.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Slower alert evaluation.** Every evaluation now involves a cross-cell query, which has higher latency than a local query. In practice, alerting rules usually generate the majority of the read load.
|
||||
|
||||
### Unavailability Scenarios
|
||||
|
||||
**Blast radius:** Region / Cell
|
||||
|
||||
* **Single node failure within a cell:** degraded performance in that cell; global system continues normally.
|
||||
|
||||
* **Single cell failure:**
|
||||
|
||||
* Path A (Global vmselect): queries still complete but slower (merging from healthy cells).
|
||||
|
||||
* Path B (First-available vmauth): queries are routed to healthy cells; stale data is possible if a write lag exists.
|
||||
|
||||
* **Region outage:** the duplicated architecture in the standby region takes over, resulting in temporary degradation until the reroute is completed.
|
||||
|
||||
## Logical layers
|
||||
|
||||
**Recommended for:** Companies of any scale that need to serve multiple internal teams or external customers with separate data. Each tenant may have different requirements for data isolation and performance.
|
||||
|
||||
The other use case is a different retention across tenants, which is described in this guide: [https://docs.victoriametrics.com/guides/guide-vmcluster-multiple-retention-setup](https://docs.victoriametrics.com/guides/guide-vmcluster-multiple-retention-setup)
|
||||
|
||||
**Key characteristics:** This architecture introduces a logical layer of multitenancy on top of the physical architectures mentioned before.
|
||||
|
||||
* The main goal is to serve multiple tenants (datasets) on the same shared infrastructure while providing strong logical isolation. This solves the problem of ensuring that Team A cannot view data from Team B.
|
||||
* This is achieved using [URL-based multitenancy.](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/#url-format) Each tenant is assigned a unique AccountID.
|
||||
* This AccountID is used in the URL path to create a "virtual slice" or a separate "lane" for that tenant's data, from ingestion at vmagent all the way to querying at vmselect.
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. **At the vmagent:** A vmagent receives data from all sources. It uses relabeling rules to identify which tenant the data belongs to. When vmagent sends the data to vminsert, it attaches the tenant ID as a label (see [docs](https://docs.victoriametrics.com/victoriametrics/cluster-victoriametrics/#multitenancy)).
|
||||
2. **At the vminsert and vmstorage:** These components natively separate data based on the tenant ID. The data from one tenant is logically isolated from another tenant.
|
||||
3. **At the vmauth and vmselect:** When a query comes in, vmauth checks if the user has permission to access the tenant ID in the URL. It only allows valid requests to pass through. vmselect will then only query the data for that specific, authorized tenant.
|
||||
|
||||
### Architectural Models for the isolation
|
||||
|
||||
This multitenancy approach gives us another trade-off in the isolation implementation.
|
||||
|
||||
**Schema:**
|
||||
|
||||
<p align="center">
|
||||
<img src="logical-layers-architecture.webp" alt="Logical Layers Architecture" width="80%">
|
||||
</p>
|
||||
|
||||
**Path A: Shared resources.** We have a single, shared pool of all cluster components.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Resource efficient.** This is the cheapest way to run the ingestion layer.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **Noisy Neighbor Problem.** There is no performance isolation at the entry point. A single tenant sending too much data can slow down ingestion for everyone else.
|
||||
|
||||
**Path B: Dedicated processing layer.** For very important tenants, we can create a separate, dedicated layer of vmagents, vmselect, vminsert, and other components in use.
|
||||
|
||||
**Pros:**
|
||||
|
||||
* **Full performance isolation.** The performance of important tenants is not affected by others.
|
||||
|
||||
**Cons:**
|
||||
|
||||
* **More expensive and complex** to manage multiple service pools.
|
||||
@@ -1,19 +0,0 @@
|
||||
---
|
||||
weight: 14
|
||||
title: VictoriaMetrics topologies
|
||||
menu:
|
||||
docs:
|
||||
parent: "guides"
|
||||
weight: 14
|
||||
tags:
|
||||
- architecture
|
||||
- guide
|
||||
- scalability
|
||||
- kubernetes
|
||||
- high-availability
|
||||
- cell-based infrasructure
|
||||
- reliability
|
||||
aliases:
|
||||
- /guides/vm-architectures.html
|
||||
---
|
||||
{{% content "README.md" %}}
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
Before Width: | Height: | Size: 18 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 151 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 58 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 129 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 62 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 32 KiB |
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user