From e553a41fa0629c167e34504c54286416bbd2f51b Mon Sep 17 00:00:00 2001 From: Nikolay Date: Mon, 30 Jun 2025 17:01:05 +0200 Subject: [PATCH] app: add vlagent component This commit introduces a new component - VictoriaLogs Agent (vlagent). It accepts log data via any data ingestion protocol supported by VictoriaLogs and forwards it to the provided remote storages. Fixes https://github.com/VictoriaMetrics/VictoriaMetrics/issues/8766 --- .gitignore | 1 + Makefile | 2 +- app/vlagent/Makefile | 106 + app/vlagent/README.md | 3 + app/vlagent/deployment/Dockerfile | 8 + app/vlagent/main.go | 97 + app/vlagent/multiarch/Dockerfile | 12 + app/vlagent/remotewrite/client.go | 462 ++ app/vlagent/remotewrite/client_test.go | 99 + app/vlagent/remotewrite/pendinglogrows.go | 158 + app/vlagent/remotewrite/remotewrite.go | 277 ++ apptest/testcase.go | 21 + apptest/tests/vlagent_remotewrite_test.go | 154 + apptest/vlagent.go | 159 + dashboards/vlagent.json | 5138 +++++++++++++++++++++ docs/victorialogs/vlagent.md | 508 ++ lib/logstorage/log_rows.go | 5 + 17 files changed, 7209 insertions(+), 1 deletion(-) create mode 100644 app/vlagent/Makefile create mode 100644 app/vlagent/README.md create mode 100644 app/vlagent/deployment/Dockerfile create mode 100644 app/vlagent/main.go create mode 100644 app/vlagent/multiarch/Dockerfile create mode 100644 app/vlagent/remotewrite/client.go create mode 100644 app/vlagent/remotewrite/client_test.go create mode 100644 app/vlagent/remotewrite/pendinglogrows.go create mode 100644 app/vlagent/remotewrite/remotewrite.go create mode 100644 apptest/tests/vlagent_remotewrite_test.go create mode 100644 apptest/vlagent.go create mode 100644 dashboards/vlagent.json create mode 100644 docs/victorialogs/vlagent.md diff --git a/.gitignore b/.gitignore index 9df3d37d39..33ae56942b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ /victoria-logs-data /victoria-metrics-data /vmagent-remotewrite-data +/vlagent-remotewrite-data /vmstorage-data /vmselect-cache 
/package/temp-deb-* diff --git a/Makefile b/Makefile index 1aadf19822..60eaac4272 100644 --- a/Makefile +++ b/Makefile @@ -545,7 +545,7 @@ test-full: test-full-386: GOEXPERIMENT=synctest GOARCH=386 go test -coverprofile=coverage.txt -covermode=atomic ./lib/... ./app/... -integration-test: victoria-metrics vmagent vmalert vmauth vmctl vmbackup vmrestore victoria-logs +integration-test: victoria-metrics vmagent vmalert vmauth vmctl vmbackup vmrestore victoria-logs vlagent go test ./apptest/... -skip="^TestCluster.*" benchmark: diff --git a/app/vlagent/Makefile b/app/vlagent/Makefile new file mode 100644 index 0000000000..9dc0af315a --- /dev/null +++ b/app/vlagent/Makefile @@ -0,0 +1,106 @@ +# All these commands must run from repository root. + +vlagent: + APP_NAME=vlagent $(MAKE) app-local + +vlagent-race: + APP_NAME=vlagent RACE=-race $(MAKE) app-local + +vlagent-prod: + APP_NAME=vlagent $(MAKE) app-via-docker + +vlagent-pure-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-pure + +vlagent-linux-amd64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-linux-amd64 + +vlagent-linux-arm-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-linux-arm + +vlagent-linux-arm64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-linux-arm64 + +vlagent-linux-ppc64le-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-linux-ppc64le + +vlagent-linux-386-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-linux-386 + +vlagent-darwin-amd64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-darwin-amd64 + +vlagent-darwin-arm64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-darwin-arm64 + +vlagent-freebsd-amd64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-freebsd-amd64 + +vlagent-openbsd-amd64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-openbsd-amd64 + +vlagent-windows-amd64-prod: + APP_NAME=vlagent $(MAKE) app-via-docker-windows-amd64 + +package-vlagent: + APP_NAME=vlagent $(MAKE) package-via-docker + +package-vlagent-pure: + APP_NAME=vlagent $(MAKE) package-via-docker-pure + 
+package-vlagent-amd64: + APP_NAME=vlagent $(MAKE) package-via-docker-amd64 + +package-vlagent-arm: + APP_NAME=vlagent $(MAKE) package-via-docker-arm + +package-vlagent-arm64: + APP_NAME=vlagent $(MAKE) package-via-docker-arm64 + +package-vlagent-ppc64le: + APP_NAME=vlagent $(MAKE) package-via-docker-ppc64le + +package-vlagent-386: + APP_NAME=vlagent $(MAKE) package-via-docker-386 + +publish-vlagent: + APP_NAME=vlagent $(MAKE) publish-via-docker + +vlagent-linux-amd64: + APP_NAME=vlagent CGO_ENABLED=1 GOOS=linux GOARCH=amd64 $(MAKE) app-local-goos-goarch + +vlagent-linux-arm: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=arm $(MAKE) app-local-goos-goarch + +vlagent-linux-arm64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=arm64 $(MAKE) app-local-goos-goarch + +vlagent-linux-ppc64le: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=ppc64le $(MAKE) app-local-goos-goarch + +vlagent-linux-s390x: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=s390x $(MAKE) app-local-goos-goarch + +vlagent-linux-loong64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=loong64 $(MAKE) app-local-goos-goarch + +vlagent-linux-386: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=linux GOARCH=386 $(MAKE) app-local-goos-goarch + +vlagent-darwin-amd64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 $(MAKE) app-local-goos-goarch + +vlagent-darwin-arm64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 $(MAKE) app-local-goos-goarch + +vlagent-freebsd-amd64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=freebsd GOARCH=amd64 $(MAKE) app-local-goos-goarch + +vlagent-openbsd-amd64: + APP_NAME=vlagent CGO_ENABLED=0 GOOS=openbsd GOARCH=amd64 $(MAKE) app-local-goos-goarch + +vlagent-windows-amd64: + GOARCH=amd64 APP_NAME=vlagent $(MAKE) app-local-windows-goarch + +vlagent-pure: + APP_NAME=vlagent $(MAKE) app-local-pure diff --git a/app/vlagent/README.md b/app/vlagent/README.md new file mode 100644 index 0000000000..451e289bd4 --- /dev/null +++ b/app/vlagent/README.md @@ -0,0 +1,3 @@ 
+See vlagent docs [here](https://docs.victoriametrics.com/victorialogs/vlagent/). + +vlagent docs can be edited at [docs/vlagent.md](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/docs/victorialogs/vlagent.md). diff --git a/app/vlagent/deployment/Dockerfile b/app/vlagent/deployment/Dockerfile new file mode 100644 index 0000000000..0a81dc63a9 --- /dev/null +++ b/app/vlagent/deployment/Dockerfile @@ -0,0 +1,8 @@ +ARG base_image=non-existing +FROM $base_image + +EXPOSE 9429 + +ENTRYPOINT ["/vlagent-prod"] +ARG src_binary=non-existing +COPY $src_binary ./vlagent-prod diff --git a/app/vlagent/main.go b/app/vlagent/main.go new file mode 100644 index 0000000000..14eae2a75c --- /dev/null +++ b/app/vlagent/main.go @@ -0,0 +1,97 @@ +package main + +import ( + "flag" + "fmt" + "net/http" + "os" + "time" + + "github.com/VictoriaMetrics/VictoriaMetrics/app/vlagent/remotewrite" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert" + "github.com/VictoriaMetrics/VictoriaMetrics/app/vlinsert/insertutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/buildinfo" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/envflag" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/httpserver" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/pushmetrics" +) + +var ( + httpListenAddrs = flagutil.NewArrayString("httpListenAddr", "TCP address to listen for incoming http requests. "+ + "Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vlagent instances on the same server. "+ + "Note that /targets and /metrics pages aren't available if -httpListenAddr=''. 
See also -tls and -httpListenAddr.useProxyProtocol") + useProxyProtocol = flagutil.NewArrayBool("httpListenAddr.useProxyProtocol", "Whether to use proxy protocol for connections accepted at the corresponding -httpListenAddr . "+ + "See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt . "+ + "With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing") +) + +func main() { + // Write flags and help message to stdout, since it is easier to grep or pipe. + flag.CommandLine.SetOutput(os.Stdout) + flag.Usage = usage + envflag.Parse() + buildinfo.Init() + remotewrite.InitSecretFlags() + logger.Init() + + remotewrite.Init() + vlinsert.Init() + + insertutil.SetLogRowsStorage(&remotewrite.Storage{}) + listenAddrs := *httpListenAddrs + if len(listenAddrs) == 0 { + listenAddrs = []string{":9429"} + } + logger.Infof("starting vlagent at %q...", listenAddrs) + startTime := time.Now() + go httpserver.Serve(listenAddrs, requestHandler, httpserver.ServeOptions{ + UseProxyProtocol: useProxyProtocol, + }) + logger.Infof("started vlagent in %.3f seconds", time.Since(startTime).Seconds()) + + pushmetrics.Init() + sig := procutil.WaitForSigterm() + logger.Infof("received signal %s", sig) + pushmetrics.Stop() + + startTime = time.Now() + logger.Infof("gracefully shutting down webservice at %q", listenAddrs) + if err := httpserver.Stop(listenAddrs); err != nil { + logger.Fatalf("cannot stop the webservice: %s", err) + } + vlinsert.Stop() + remotewrite.Stop() + logger.Infof("successfully shut down the webservice in %.3f seconds", time.Since(startTime).Seconds()) + logger.Infof("successfully stopped vlagent in %.3f seconds", time.Since(startTime).Seconds()) +} + +// RequestHandler handles insert requests for VictoriaLogs +func requestHandler(w http.ResponseWriter, r *http.Request) bool { + if r.URL.Path == "/" { + if r.Method != http.MethodGet { + return false + } + w.Header().Add("Content-Type", "text/html; 
charset=utf-8") + fmt.Fprintf(w, "

vlagent

") + fmt.Fprintf(w, "See docs at https://docs.victoriametrics.com/victorialogs/vlagent/
") + fmt.Fprintf(w, "Useful endpoints:
") + httpserver.WriteAPIHelp(w, [][2]string{ + {"metrics", "available service metrics"}, + {"flags", "command-line flags"}, + }) + return true + } + return vlinsert.RequestHandler(w, r) +} + +func usage() { + const s = ` +vlagent collects logs via popular data ingestion protocols and routes it to VictoriaLogs. + +See the docs at https://docs.victoriametrics.com/victorialogs/vlagent/ . +` + flagutil.Usage(s) +} diff --git a/app/vlagent/multiarch/Dockerfile b/app/vlagent/multiarch/Dockerfile new file mode 100644 index 0000000000..289a0ab018 --- /dev/null +++ b/app/vlagent/multiarch/Dockerfile @@ -0,0 +1,12 @@ +# See https://medium.com/on-docker/use-multi-stage-builds-to-inject-ca-certs-ad1e8f01de1b +ARG certs_image=non-existing +ARG root_image=non-existing +FROM $certs_image AS certs +RUN apk update && apk upgrade && apk --update --no-cache add ca-certificates + +FROM $root_image +COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt +EXPOSE 9429 +ENTRYPOINT ["/vlagent-prod"] +ARG TARGETARCH +COPY vlagent-linux-${TARGETARCH}-prod ./vlagent-prod diff --git a/app/vlagent/remotewrite/client.go b/app/vlagent/remotewrite/client.go new file mode 100644 index 0000000000..4cbbde9d70 --- /dev/null +++ b/app/vlagent/remotewrite/client.go @@ -0,0 +1,462 @@ +package remotewrite + +import ( + "bytes" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "strings" + "sync" + "time" + + "github.com/VictoriaMetrics/metrics" + + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/httputil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/ratelimiter" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/timeutil" +) + +var ( + rateLimit = 
flagutil.NewArrayInt("remoteWrite.rateLimit", 0, "Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. "+ + "By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data ") + sendTimeout = flagutil.NewArrayDuration("remoteWrite.sendTimeout", time.Minute, "Timeout for sending a single block of data to the corresponding -remoteWrite.url") + retryMinInterval = flagutil.NewArrayDuration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts to send a block of data to the corresponding -remoteWrite.url. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime") + retryMaxTime = flagutil.NewArrayDuration("remoteWrite.retryMaxTime", time.Minute, "The max time spent on retry attempts to send a block of data to the corresponding -remoteWrite.url. Change this value if it is expected for -remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval") + proxyURL = flagutil.NewArrayString("remoteWrite.proxyURL", "Optional proxy URL for writing data to the corresponding -remoteWrite.url. "+ + "Supported proxies: http, https, socks5. 
Example: -remoteWrite.proxyURL=socks5://proxy:1234") + + tlsHandshakeTimeout = flagutil.NewArrayDuration("remoteWrite.tlsHandshakeTimeout", 20*time.Second, "The timeout for establishing tls connections to the corresponding -remoteWrite.url") + tlsInsecureSkipVerify = flagutil.NewArrayBool("remoteWrite.tlsInsecureSkipVerify", "Whether to skip tls verification when connecting to the corresponding -remoteWrite.url") + tlsCertFile = flagutil.NewArrayString("remoteWrite.tlsCertFile", "Optional path to client-side TLS certificate file to use when connecting "+ + "to the corresponding -remoteWrite.url") + tlsKeyFile = flagutil.NewArrayString("remoteWrite.tlsKeyFile", "Optional path to client-side TLS certificate key to use when connecting to the corresponding -remoteWrite.url") + tlsCAFile = flagutil.NewArrayString("remoteWrite.tlsCAFile", "Optional path to TLS CA file to use for verifying connections to the corresponding -remoteWrite.url. "+ + "By default, system CA is used") + tlsServerName = flagutil.NewArrayString("remoteWrite.tlsServerName", "Optional TLS server name to use for connections to the corresponding -remoteWrite.url. "+ + "By default, the server name from -remoteWrite.url is used") + + headers = flagutil.NewArrayString("remoteWrite.headers", "Optional HTTP headers to send with each request to the corresponding -remoteWrite.url. "+ + "For example, -remoteWrite.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteWrite.url. 
"+ + "Multiple headers must be delimited by '^^': -remoteWrite.headers='header1:value1^^header2:value2'") + + basicAuthUsername = flagutil.NewArrayString("remoteWrite.basicAuth.username", "Optional basic auth username to use for the corresponding -remoteWrite.url") + basicAuthPassword = flagutil.NewArrayString("remoteWrite.basicAuth.password", "Optional basic auth password to use for the corresponding -remoteWrite.url") + basicAuthPasswordFile = flagutil.NewArrayString("remoteWrite.basicAuth.passwordFile", "Optional path to basic auth password to use for the corresponding -remoteWrite.url. "+ + "The file is re-read every second") + bearerToken = flagutil.NewArrayString("remoteWrite.bearerToken", "Optional bearer auth token to use for the corresponding -remoteWrite.url") + bearerTokenFile = flagutil.NewArrayString("remoteWrite.bearerTokenFile", "Optional path to bearer token file to use for the corresponding -remoteWrite.url. "+ + "The token is re-read from the file every second") + + oauth2ClientID = flagutil.NewArrayString("remoteWrite.oauth2.clientID", "Optional OAuth2 clientID to use for the corresponding -remoteWrite.url") + oauth2ClientSecret = flagutil.NewArrayString("remoteWrite.oauth2.clientSecret", "Optional OAuth2 clientSecret to use for the corresponding -remoteWrite.url") + oauth2ClientSecretFile = flagutil.NewArrayString("remoteWrite.oauth2.clientSecretFile", "Optional OAuth2 clientSecretFile to use for the corresponding -remoteWrite.url") + oauth2EndpointParams = flagutil.NewArrayString("remoteWrite.oauth2.endpointParams", "Optional OAuth2 endpoint parameters to use for the corresponding -remoteWrite.url . 
"+ + `The endpoint parameters must be set in JSON format: {"param1":"value1",...,"paramN":"valueN"}`) + oauth2TokenURL = flagutil.NewArrayString("remoteWrite.oauth2.tokenUrl", "Optional OAuth2 tokenURL to use for the corresponding -remoteWrite.url") + oauth2Scopes = flagutil.NewArrayString("remoteWrite.oauth2.scopes", "Optional OAuth2 scopes to use for the corresponding -remoteWrite.url. Scopes must be delimited by ';'") +) + +type client struct { + sanitizedURL string + remoteWriteURL string + + fq *persistentqueue.FastQueue + hc *http.Client + + retryMinInterval time.Duration + retryMaxTime time.Duration + + sendBlock func(block []byte) bool + authCfg *promauth.Config + + rl *ratelimiter.RateLimiter + + bytesSent *metrics.Counter + blocksSent *metrics.Counter + requestDuration *metrics.Histogram + requestsOKCount *metrics.Counter + errorsCount *metrics.Counter + packetsDropped *metrics.Counter + rateLimit *metrics.Gauge + retriesCount *metrics.Counter + sendDuration *metrics.FloatCounter + + wg sync.WaitGroup + stopCh chan struct{} +} + +func newHTTPClient(argIdx int, remoteWriteURL, sanitizedURL string, fq *persistentqueue.FastQueue, concurrency int) *client { + authCfg, err := getAuthConfig(argIdx) + if err != nil { + logger.Fatalf("cannot initialize auth config for -remoteWrite.url=%q: %s", remoteWriteURL, err) + } + + tr := httputil.NewTransport(false, "vlagent_remotewrite") + tr.TLSHandshakeTimeout = tlsHandshakeTimeout.GetOptionalArg(argIdx) + tr.MaxConnsPerHost = 2 * concurrency + tr.MaxIdleConnsPerHost = 2 * concurrency + tr.IdleConnTimeout = time.Minute + tr.WriteBufferSize = 64 * 1024 + + pURL := proxyURL.GetOptionalArg(argIdx) + if len(pURL) > 0 { + if !strings.Contains(pURL, "://") { + logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: it must start with `http://`, `https://` or `socks5://`", pURL) + } + pu, err := url.Parse(pURL) + if err != nil { + logger.Fatalf("cannot parse -remoteWrite.proxyURL=%q: %s", pURL, err) + } + tr.Proxy = 
http.ProxyURL(pu) + } + hc := &http.Client{ + Transport: authCfg.NewRoundTripper(tr), + Timeout: sendTimeout.GetOptionalArg(argIdx), + } + c := &client{ + sanitizedURL: sanitizedURL, + remoteWriteURL: remoteWriteURL, + authCfg: authCfg, + fq: fq, + hc: hc, + retryMinInterval: retryMinInterval.GetOptionalArg(argIdx), + retryMaxTime: retryMaxTime.GetOptionalArg(argIdx), + stopCh: make(chan struct{}), + } + c.sendBlock = c.sendBlockHTTP + return c +} + +func (c *client) init(argIdx, concurrency int, sanitizedURL string) { + limitReached := metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_rate_limit_reached_total{url=%q}`, c.sanitizedURL)) + if bytesPerSec := rateLimit.GetOptionalArg(argIdx); bytesPerSec > 0 { + logger.Infof("applying %d bytes per second rate limit for -remoteWrite.url=%q", bytesPerSec, sanitizedURL) + c.rl = ratelimiter.New(int64(bytesPerSec), limitReached, c.stopCh) + } + c.bytesSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_bytes_sent_total{url=%q}`, c.sanitizedURL)) + c.blocksSent = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_blocks_sent_total{url=%q}`, c.sanitizedURL)) + c.rateLimit = metrics.GetOrCreateGauge(fmt.Sprintf(`vlagent_remotewrite_rate_limit{url=%q}`, c.sanitizedURL), func() float64 { + return float64(rateLimit.GetOptionalArg(argIdx)) + }) + c.requestDuration = metrics.GetOrCreateHistogram(fmt.Sprintf(`vlagent_remotewrite_duration_seconds{url=%q}`, c.sanitizedURL)) + c.requestsOKCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_requests_total{url=%q, status_code="2XX"}`, c.sanitizedURL)) + c.errorsCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_errors_total{url=%q}`, c.sanitizedURL)) + c.packetsDropped = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_packets_dropped_total{url=%q}`, c.sanitizedURL)) + c.retriesCount = metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_retries_count_total{url=%q}`, c.sanitizedURL)) + c.sendDuration = 
metrics.GetOrCreateFloatCounter(fmt.Sprintf(`vlagent_remotewrite_send_duration_seconds_total{url=%q}`, c.sanitizedURL)) + metrics.GetOrCreateGauge(fmt.Sprintf(`vlagent_remotewrite_queues{url=%q}`, c.sanitizedURL), func() float64 { + return float64(*queues) + }) + for i := 0; i < concurrency; i++ { + c.wg.Add(1) + go func() { + defer c.wg.Done() + c.runWorker() + }() + } + logger.Infof("initialized client for -remoteWrite.url=%q", c.sanitizedURL) +} + +func (c *client) MustStop() { + close(c.stopCh) + c.wg.Wait() + logger.Infof("stopped client for -remoteWrite.url=%q", c.sanitizedURL) +} + +func getAuthConfig(argIdx int) (*promauth.Config, error) { + headersValue := headers.GetOptionalArg(argIdx) + var hdrs []string + if headersValue != "" { + hdrs = strings.Split(headersValue, "^^") + } + username := basicAuthUsername.GetOptionalArg(argIdx) + password := basicAuthPassword.GetOptionalArg(argIdx) + passwordFile := basicAuthPasswordFile.GetOptionalArg(argIdx) + var basicAuthCfg *promauth.BasicAuthConfig + if username != "" || password != "" || passwordFile != "" { + basicAuthCfg = &promauth.BasicAuthConfig{ + Username: username, + Password: promauth.NewSecret(password), + PasswordFile: passwordFile, + } + } + + token := bearerToken.GetOptionalArg(argIdx) + tokenFile := bearerTokenFile.GetOptionalArg(argIdx) + + var oauth2Cfg *promauth.OAuth2Config + clientSecret := oauth2ClientSecret.GetOptionalArg(argIdx) + clientSecretFile := oauth2ClientSecretFile.GetOptionalArg(argIdx) + if clientSecretFile != "" || clientSecret != "" { + endpointParamsJSON := oauth2EndpointParams.GetOptionalArg(argIdx) + endpointParams, err := flagutil.ParseJSONMap(endpointParamsJSON) + if err != nil { + return nil, fmt.Errorf("cannot parse JSON for -remoteWrite.oauth2.endpointParams=%s: %w", endpointParamsJSON, err) + } + oauth2Cfg = &promauth.OAuth2Config{ + ClientID: oauth2ClientID.GetOptionalArg(argIdx), + ClientSecret: promauth.NewSecret(clientSecret), + ClientSecretFile: clientSecretFile, + 
EndpointParams: endpointParams, + TokenURL: oauth2TokenURL.GetOptionalArg(argIdx), + Scopes: strings.Split(oauth2Scopes.GetOptionalArg(argIdx), ";"), + } + } + + tlsCfg := &promauth.TLSConfig{ + CAFile: tlsCAFile.GetOptionalArg(argIdx), + CertFile: tlsCertFile.GetOptionalArg(argIdx), + KeyFile: tlsKeyFile.GetOptionalArg(argIdx), + ServerName: tlsServerName.GetOptionalArg(argIdx), + InsecureSkipVerify: tlsInsecureSkipVerify.GetOptionalArg(argIdx), + } + + opts := &promauth.Options{ + BasicAuth: basicAuthCfg, + BearerToken: token, + BearerTokenFile: tokenFile, + OAuth2: oauth2Cfg, + TLSConfig: tlsCfg, + Headers: hdrs, + } + authCfg, err := opts.NewConfig() + if err != nil { + return nil, fmt.Errorf("cannot populate auth config for remoteWrite idx: %d, err: %w", argIdx, err) + } + return authCfg, nil +} + +func (c *client) runWorker() { + var ok bool + var block []byte + ch := make(chan bool, 1) + for { + block, ok = c.fq.MustReadBlock(block[:0]) + if !ok { + return + } + if len(block) == 0 { + // skip empty data blocks from sending + continue + } + go func() { + startTime := time.Now() + ch <- c.sendBlock(block) + c.sendDuration.Add(time.Since(startTime).Seconds()) + }() + select { + case ok := <-ch: + if ok { + // The block has been sent successfully + continue + } + // Return unsent block to the queue. + c.fq.MustWriteBlockIgnoreDisabledPQ(block) + return + case <-c.stopCh: + // c must be stopped. Wait for a while in the hope the block will be sent. + graceDuration := 5 * time.Second + select { + case ok := <-ch: + if !ok { + // Return unsent block to the queue. + c.fq.MustWriteBlockIgnoreDisabledPQ(block) + } + case <-time.After(graceDuration): + // Return unsent block to the queue. 
+ c.fq.MustWriteBlockIgnoreDisabledPQ(block) + } + return + } + } +} + +func (c *client) doRequest(url string, body []byte) (*http.Response, error) { + req, err := c.newRequest(url, body) + if err != nil { + return nil, err + } + resp, err := c.hc.Do(req) + if err == nil { + return resp, nil + } + if !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) { + return nil, err + } + // It is likely connection became stale or timed out during the first request. + // Make another attempt in hope request will succeed. + // If not, the error should be handled by the caller as usual. + // This should help with https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139 + req, err = c.newRequest(url, body) + if err != nil { + return nil, fmt.Errorf("second attempt: %w", err) + } + resp, err = c.hc.Do(req) + if err != nil { + return nil, fmt.Errorf("second attempt: %w", err) + } + return resp, nil +} + +func (c *client) newRequest(url string, body []byte) (*http.Request, error) { + reqBody := bytes.NewBuffer(body) + req, err := http.NewRequest(http.MethodPost, url, reqBody) + if err != nil { + logger.Panicf("BUG: unexpected error from http.NewRequest(%q): %s", url, err) + } + err = c.authCfg.SetHeaders(req, true) + if err != nil { + return nil, err + } + h := req.Header + h.Set("User-Agent", "vlagent") + h.Set("Content-Encoding", "zstd") + h.Set("Content-Type", "application/octet-stream") + + return req, nil +} + +// sendBlockHTTP sends the given block to c.remoteWriteURL. +// +// The function returns false only if c.stopCh is closed. +// Otherwise, it tries sending the block to remote storage indefinitely. 
+func (c *client) sendBlockHTTP(block []byte) bool { + c.rl.Register(len(block)) + maxRetryDuration := timeutil.AddJitterToDuration(c.retryMaxTime) + retryDuration := timeutil.AddJitterToDuration(c.retryMinInterval) + retriesCount := 0 + +again: + startTime := time.Now() + resp, err := c.doRequest(c.remoteWriteURL, block) + c.requestDuration.UpdateDuration(startTime) + if err != nil { + c.errorsCount.Inc() + retryDuration *= 2 + if retryDuration > maxRetryDuration { + retryDuration = maxRetryDuration + } + remoteWriteRetryLogger.Warnf("couldn't send a block with size %d bytes to %q: %s; re-sending the block in %.3f seconds", + len(block), c.sanitizedURL, err, retryDuration.Seconds()) + t := timerpool.Get(retryDuration) + select { + case <-c.stopCh: + timerpool.Put(t) + return false + case <-t.C: + timerpool.Put(t) + } + c.retriesCount.Inc() + goto again + } + + statusCode := resp.StatusCode + if statusCode/100 == 2 { + _ = resp.Body.Close() + c.requestsOKCount.Inc() + c.bytesSent.Add(len(block)) + c.blocksSent.Inc() + return true + } + + metrics.GetOrCreateCounter(fmt.Sprintf(`vlagent_remotewrite_requests_total{url=%q, status_code="%d"}`, c.sanitizedURL, statusCode)).Inc() + if statusCode == 400 || statusCode == 404 { + logBlockRejected(block, c.sanitizedURL, resp) + _ = resp.Body.Close() + c.packetsDropped.Inc() + return true + } + // Unexpected status code returned + retriesCount++ + retryAfterHeader := parseRetryAfterHeader(resp.Header.Get("Retry-After")) + retryDuration = getRetryDuration(retryAfterHeader, retryDuration, maxRetryDuration) + + // Handle response + body, err := io.ReadAll(resp.Body) + _ = resp.Body.Close() + if err != nil { + logger.Errorf("cannot read response body from %q during retry #%d: %s", c.sanitizedURL, retriesCount, err) + } else { + logger.Errorf("unexpected status code received after sending a block with size %d bytes to %q during retry #%d: %d; response body=%q; "+ + "re-sending the block in %.3f seconds", len(block), c.sanitizedURL, 
retriesCount, statusCode, body, retryDuration.Seconds()) + } + t := timerpool.Get(retryDuration) + select { + case <-c.stopCh: + timerpool.Put(t) + return false + case <-t.C: + timerpool.Put(t) + } + c.retriesCount.Inc() + goto again +} + +var remoteWriteRejectedLogger = logger.WithThrottler("remoteWriteRejected", 5*time.Second) +var remoteWriteRetryLogger = logger.WithThrottler("remoteWriteRetry", 5*time.Second) + +// getRetryDuration returns retry duration. +// retryAfterDuration has the highest priority. +// If retryAfterDuration is not specified, retryDuration gets doubled. +// retryDuration can't exceed maxRetryDuration. +// +// Also see: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6097 +func getRetryDuration(retryAfterDuration, retryDuration, maxRetryDuration time.Duration) time.Duration { + // retryAfterDuration has the highest priority duration + if retryAfterDuration > 0 { + return timeutil.AddJitterToDuration(retryAfterDuration) + } + + // default backoff retry policy + retryDuration *= 2 + if retryDuration > maxRetryDuration { + retryDuration = maxRetryDuration + } + + return retryDuration +} + +func logBlockRejected(block []byte, sanitizedURL string, resp *http.Response) { + body, err := io.ReadAll(resp.Body) + if err != nil { + remoteWriteRejectedLogger.Errorf("sending a block with size %d bytes to %q was rejected (skipping the block): status code %d; "+ + "failed to read response body: %s", + len(block), sanitizedURL, resp.StatusCode, err) + } else { + remoteWriteRejectedLogger.Errorf("sending a block with size %d bytes to %q was rejected (skipping the block): status code %d; response body: %s", + len(block), sanitizedURL, resp.StatusCode, string(body)) + } +} + +// parseRetryAfterHeader parses `Retry-After` value retrieved from HTTP response header. +// retryAfterString should be in either HTTP-date or a number of seconds. +// It will return time.Duration(0) if `retryAfterString` does not follow RFC 7231. 
+func parseRetryAfterHeader(retryAfterString string) (retryAfterDuration time.Duration) { + if retryAfterString == "" { + return retryAfterDuration + } + + defer func() { + v := retryAfterDuration.Seconds() + logger.Infof("'Retry-After: %s' parsed into %.2f second(s)", retryAfterString, v) + }() + + // Retry-After could be in "Mon, 02 Jan 2006 15:04:05 GMT" format. + if parsedTime, err := time.Parse(http.TimeFormat, retryAfterString); err == nil { + return time.Duration(time.Until(parsedTime).Seconds()) * time.Second + } + // Retry-After could be in seconds. + if seconds, err := strconv.Atoi(retryAfterString); err == nil { + return time.Duration(seconds) * time.Second + } + + return 0 +} diff --git a/app/vlagent/remotewrite/client_test.go b/app/vlagent/remotewrite/client_test.go new file mode 100644 index 0000000000..fda56e2f85 --- /dev/null +++ b/app/vlagent/remotewrite/client_test.go @@ -0,0 +1,99 @@ +package remotewrite + +import ( + "math" + "net/http" + "testing" + "time" +) + +func TestCalculateRetryDuration(t *testing.T) { + // `testFunc` call `calculateRetryDuration` for `n` times + // and evaluate if the result of `calculateRetryDuration` is + // 1. >= expectMinDuration + // 2. <= expectMinDuration + 10% (see timeutil.AddJitterToDuration) + f := func(retryAfterDuration, retryDuration time.Duration, n int, expectMinDuration time.Duration) { + t.Helper() + + for i := 0; i < n; i++ { + retryDuration = getRetryDuration(retryAfterDuration, retryDuration, time.Minute) + } + + expectMaxDuration := helper(expectMinDuration) + expectMinDuration = expectMinDuration - (1000 * time.Millisecond) // Avoid edge case when calculating time.Until(now) + + if !(retryDuration >= expectMinDuration && retryDuration <= expectMaxDuration) { + t.Fatalf( + "incorrect retry duration, want (ms): [%d, %d], got (ms): %d", + expectMinDuration.Milliseconds(), expectMaxDuration.Milliseconds(), + retryDuration.Milliseconds(), + ) + } + } + + // Call calculateRetryDuration for 1 time. 
+ { + // default backoff policy + f(0, time.Second, 1, 2*time.Second) + // default backoff policy exceed max limit" + f(0, 10*time.Minute, 1, time.Minute) + + // retry after > default backoff policy + f(10*time.Second, 1*time.Second, 1, 10*time.Second) + // retry after < default backoff policy + f(1*time.Second, 10*time.Second, 1, 1*time.Second) + // retry after invalid and < default backoff policy + f(0, time.Second, 1, 2*time.Second) + + } + + // Call calculateRetryDuration for multiple times. + { + // default backoff policy 2 times + f(0, time.Second, 2, 4*time.Second) + // default backoff policy 3 times + f(0, time.Second, 3, 8*time.Second) + // default backoff policy N times exceed max limit + f(0, time.Second, 10, time.Minute) + + // retry after 120s 1 times + f(120*time.Second, time.Second, 1, 120*time.Second) + // retry after 120s 2 times + f(120*time.Second, time.Second, 2, 120*time.Second) + } +} + +func TestParseRetryAfterHeader(t *testing.T) { + f := func(retryAfterString string, expectResult time.Duration) { + t.Helper() + + result := parseRetryAfterHeader(retryAfterString) + // expect `expectResult == result` when retryAfterString is in seconds or invalid + // expect the difference between result and expectResult to be lower than 10% + if !(expectResult == result || math.Abs(float64(expectResult-result))/float64(expectResult) < 0.10) { + t.Fatalf( + "incorrect retry after duration, want (ms): %d, got (ms): %d", + expectResult.Milliseconds(), result.Milliseconds(), + ) + } + } + + // retry after header in seconds + f("10", 10*time.Second) + // retry after header in date time + f(time.Now().Add(30*time.Second).UTC().Format(http.TimeFormat), 30*time.Second) + // retry after header invalid + f("invalid-retry-after", 0) + // retry after header not in GMT + f(time.Now().Add(10*time.Second).Format("Mon, 02 Jan 2006 15:04:05 FAKETZ"), 0) +} + +// helper calculate the max possible time duration calculated by timeutil.AddJitterToDuration. 
// helper returns the upper bound of the duration which timeutil.AddJitterToDuration
// may produce for d: d plus 10% of d, where the added part is capped at 10 seconds.
func helper(d time.Duration) time.Duration {
	const maxJitter = 10 * time.Second

	jitter := d / 10
	if jitter <= maxJitter {
		return d + jitter
	}
	return d + maxJitter
}
+ fq *persistentqueue.FastQueue + + // mu protects wr + mu sync.Mutex + wr writeRequest + + stopCh chan struct{} + periodicFlusherWG sync.WaitGroup +} + +func newPendingLogs(fq *persistentqueue.FastQueue) *pendingLogs { + pl := &pendingLogs{ + fq: fq, + stopCh: make(chan struct{}), + } + + pl.periodicFlusherWG.Add(1) + go func() { + defer pl.periodicFlusherWG.Done() + pl.periodicFlusher() + }() + + return pl +} + +func (pl *pendingLogs) add(lr *logstorage.LogRows) { + lr.ForEachRow(func(_ uint64, r *logstorage.InsertRow) { + pl.addLogRow(r) + }) +} + +func (pl *pendingLogs) addLogRow(r *logstorage.InsertRow) { + bb := bbPool.Get() + bb.B = r.Marshal(bb.B) + + pl.mu.Lock() + _, _ = pl.wr.pendingData.Write(bb.B) + pl.wr.pendingLogRowsCount++ + if len(pl.wr.pendingData.B) > maxUnpackedBlockSize.IntN() { + pl.mustFlushLocked() + } + pl.mu.Unlock() + bbPool.Put(bb) +} + +func (pl *pendingLogs) mustFlushLocked() { + pl.lastFlushTime.Store(fasttime.UnixTimestamp()) + pl.wr.push(func(b []byte) { + if !pl.fq.TryWriteBlock(b) { + logger.Fatalf("BUG: TryWriteBlock cannot return false") + } + }) + pl.wr.reset() +} + +func (pl *pendingLogs) periodicFlusher() { + flushSeconds := int64(flushInterval.Seconds()) + if flushSeconds <= 0 { + flushSeconds = 1 + } + d := timeutil.AddJitterToDuration(*flushInterval) + ticker := time.NewTicker(d) + defer ticker.Stop() + for { + select { + case <-pl.stopCh: + pl.mu.Lock() + pl.mustFlushOnStop() + pl.mu.Unlock() + return + case <-ticker.C: + if fasttime.UnixTimestamp()-pl.lastFlushTime.Load() < uint64(flushSeconds) { + continue + } + } + pl.mu.Lock() + pl.mustFlushLocked() + pl.mu.Unlock() + } +} + +// mustFlushOnStop force pushes wr data +// +// This is needed in order to properly save in-memory data to persistent queue on graceful shutdown. 
+func (pl *pendingLogs) mustFlushOnStop() { + pl.wr.push(pl.fq.MustWriteBlockIgnoreDisabledPQ) + pl.wr.reset() +} + +func (pl *pendingLogs) mustStop() { + close(pl.stopCh) + pl.periodicFlusherWG.Wait() +} + +type writeRequest struct { + pendingData bytesutil.ByteBuffer + pendingLogRowsCount int64 +} + +func (wr *writeRequest) push(pushBlock func([]byte)) { + if len(wr.pendingData.B) == 0 { + return + } + b := wr.pendingData.B + + zb := compressBufPool.Get() + zb.B = zstd.CompressLevel(zb.B[:0], b, 1) + zbLen := len(zb.B) + pushBlock(zb.B) + compressBufPool.Put(zb) + blockSizeBytes.Update(float64(zbLen)) + blockSizeLogRows.Update(float64(wr.pendingLogRowsCount)) +} + +func (wr *writeRequest) reset() { + wr.pendingData.Reset() + wr.pendingLogRowsCount = 0 +} + +var ( + blockSizeBytes = metrics.NewHistogram(`vlagent_remotewrite_block_size_bytes`) + blockSizeLogRows = metrics.NewHistogram(`vlagent_remotewrite_block_size_rows`) +) + +var ( + compressBufPool bytesutil.ByteBufferPool + bbPool bytesutil.ByteBufferPool +) diff --git a/app/vlagent/remotewrite/remotewrite.go b/app/vlagent/remotewrite/remotewrite.go new file mode 100644 index 0000000000..2ff66db120 --- /dev/null +++ b/app/vlagent/remotewrite/remotewrite.go @@ -0,0 +1,277 @@ +package remotewrite + +import ( + "flag" + "fmt" + "net/url" + "path/filepath" + "sync" + "sync/atomic" + + "github.com/VictoriaMetrics/metrics" + "github.com/cespare/xxhash/v2" + + "github.com/VictoriaMetrics/VictoriaMetrics/app/vlstorage/netinsert" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/fs" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/logstorage" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/persistentqueue" +) + +var ( + remoteWriteURLs = flagutil.NewArrayString("remoteWrite.url", 
"Remote storage URL to write data to. It must support VictoriaLogs native protocol. "+ + "Example url: http://:9428/internal/insert. "+ + "Pass multiple -remoteWrite.url options in order to replicate the collected data to multiple remote storage systems.") + maxPendingBytesPerURL = flagutil.NewArrayBytes("remoteWrite.maxDiskUsagePerURL", 0, "The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath "+ + "for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. "+ + "Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. "+ + "Disk usage is unlimited if the value is set to 0") + + tmpDataPath = flag.String("remoteWrite.tmpDataPath", "vlagent-remotewrite-data", "Path to directory for storing pending data, which isn't sent to the configured -remoteWrite.url . "+ + "See also -remoteWrite.maxDiskUsagePerURL") + queues = flag.Int("remoteWrite.queues", cgroup.AvailableCPUs()*2, "The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues "+ + "isn't enough for sending high volume of collected data to remote storage. "+ + "Default value depends on the number of available CPU cores. It should work fine in most cases since it minimizes resource usage") + + showRemoteWriteURL = flag.Bool("remoteWrite.showURL", false, "Whether to show -remoteWrite.url in the exported metrics. "+ + "It is hidden by default, since it can contain sensitive info such as auth key") +) + +// rwctxsGlobal contains statically populated entries when -remoteWrite.url is specified. 
+var rwctxsGlobal []*remoteWriteCtx + +// Storage implements insertutil.LogRowsStorage interface +type Storage struct{} + +// MustAddRows implements insertutil.LogRowsStorage interface +func (*Storage) MustAddRows(lr *logstorage.LogRows) { + pushToRemoteStorages(lr) +} + +// CanWriteData implements insertutil.LogRowsStorage interface +func (*Storage) CanWriteData() error { + return nil +} + +// maxQueues limits the maximum value for `-remoteWrite.queues`. There is no sense in setting too high value, +// since it may lead to high memory usage due to big number of buffers. +var maxQueues = cgroup.AvailableCPUs() * 16 + +const persistentQueueDirname = "persistent-queue" + +// InitSecretFlags must be called after flag.Parse and before any logging. +func InitSecretFlags() { + if !*showRemoteWriteURL { + // remoteWrite.url can contain authentication codes, so hide it at `/metrics` output. + flagutil.RegisterSecretFlag("remoteWrite.url") + } +} + +// Init initializes remotewrite. +// +// It must be called after flag.Parse(). +// +// Stop must be called for graceful shutdown. +func Init() { + if len(*remoteWriteURLs) == 0 { + logger.Fatalf("at least one `-remoteWrite.url` command-line flag must be set") + } + if *queues > maxQueues { + *queues = maxQueues + } + if *queues <= 0 { + *queues = 1 + } + initRemoteWriteCtxs(*remoteWriteURLs) + dropDanglingQueues() +} + +// Stop stops remotewrite. +// +// It is expected that nobody calls TryPush during and after the call to this func. +func Stop() { + for _, rwctx := range rwctxsGlobal { + rwctx.mustStop() + } + rwctxsGlobal = nil +} + +func dropDanglingQueues() { + // Remove dangling persistent queues, if any. + // This is required for the case when the number of queues has been changed or URL have been changed. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4014 + // + // In case if there were many persistent queues with identical *remoteWriteURLs + // the queue with the last index will be dropped. 
+ // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6140 + existingQueues := make(map[string]struct{}, len(rwctxsGlobal)) + for _, rwctx := range rwctxsGlobal { + existingQueues[rwctx.fq.Dirname()] = struct{}{} + } + + queuesDir := filepath.Join(*tmpDataPath, persistentQueueDirname) + files := fs.MustReadDir(queuesDir) + removed := 0 + for _, f := range files { + dirname := f.Name() + if _, ok := existingQueues[dirname]; !ok { + logger.Infof("removing dangling queue %q", dirname) + fullPath := filepath.Join(queuesDir, dirname) + fs.MustRemoveAll(fullPath) + removed++ + } + } + if removed > 0 { + logger.Infof("removed %d dangling queues from %q, active queues: %d", removed, *tmpDataPath, len(rwctxsGlobal)) + } +} + +func initRemoteWriteCtxs(urls []string) { + if len(urls) == 0 { + logger.Panicf("BUG: urls must be non-empty") + } + + maxInmemoryBlocks := memory.Allowed() / len(urls) / 10000 + if maxInmemoryBlocks / *queues > 100 { + // There is no much sense in keeping higher number of blocks in memory, + // since this means that the producer outperforms consumer and the queue + // will continue growing. It is better storing the queue to file. 
+ maxInmemoryBlocks = 100 * *queues + } + if maxInmemoryBlocks < 2 { + maxInmemoryBlocks = 2 + } + rwctxs := make([]*remoteWriteCtx, len(urls)) + rwctxIdx := make([]int, len(urls)) + for i, remoteWriteURLRaw := range urls { + remoteWriteURL, err := url.Parse(remoteWriteURLRaw) + if err != nil { + logger.Fatalf("invalid -remoteWrite.url=%q: %s", remoteWriteURL, err) + } + sanitizedURL := fmt.Sprintf("%d:secret-url", i+1) + if *showRemoteWriteURL { + sanitizedURL = fmt.Sprintf("%d:%s", i+1, remoteWriteURL) + } + rwctxs[i] = newRemoteWriteCtx(i, remoteWriteURL, maxInmemoryBlocks, sanitizedURL) + rwctxIdx[i] = i + } + + rwctxsGlobal = rwctxs +} + +func pushToRemoteStorages(lr *logstorage.LogRows) { + rwctxs := rwctxsGlobal + if len(rwctxs) == 1 { + // fast path + rwctxs[0].push(lr) + return + } + // Push samples to remote storage systems in parallel in order to reduce + // the time needed for sending the data to multiple remote storage systems. + var wg sync.WaitGroup + for _, rwctx := range rwctxs { + wg.Add(1) + go func(rwctx *remoteWriteCtx) { + defer wg.Done() + rwctx.push(lr) + + }(rwctx) + } + wg.Wait() +} + +type remoteWriteCtx struct { + idx int + fq *persistentqueue.FastQueue + c *client + + pls []*pendingLogs + pssNextIdx atomic.Uint64 +} + +func newRemoteWriteCtx(argIdx int, remoteWriteURL *url.URL, maxInmemoryBlocks int, sanitizedURL string) *remoteWriteCtx { + // protocol version is required by victoria-logs + q := remoteWriteURL.Query() + q.Set("version", netinsert.ProtocolVersion) + remoteWriteURL.RawQuery = q.Encode() + + // strip query params, otherwise changing params resets pq + pqURL := *remoteWriteURL + pqURL.RawQuery = "" + pqURL.Fragment = "" + h := xxhash.Sum64([]byte(pqURL.String())) + queuePath := filepath.Join(*tmpDataPath, persistentQueueDirname, fmt.Sprintf("%d_%016X", argIdx+1, h)) + maxPendingBytes := maxPendingBytesPerURL.GetOptionalArg(argIdx) + if maxPendingBytes != 0 && maxPendingBytes < persistentqueue.DefaultChunkFileSize { + // See 
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4195 + logger.Warnf("rounding the -remoteWrite.maxDiskUsagePerURL=%d to the minimum supported value: %d", maxPendingBytes, persistentqueue.DefaultChunkFileSize) + maxPendingBytes = persistentqueue.DefaultChunkFileSize + } + + fq := persistentqueue.MustOpenFastQueue(queuePath, sanitizedURL, maxInmemoryBlocks, maxPendingBytes, false) + _ = metrics.GetOrCreateGauge(fmt.Sprintf(`vlagent_remotewrite_pending_data_bytes{path=%q, url=%q}`, queuePath, sanitizedURL), func() float64 { + return float64(fq.GetPendingBytes()) + }) + _ = metrics.GetOrCreateGauge(fmt.Sprintf(`vlagent_remotewrite_pending_inmemory_blocks{path=%q, url=%q}`, queuePath, sanitizedURL), func() float64 { + return float64(fq.GetInmemoryQueueLen()) + }) + _ = metrics.GetOrCreateGauge(fmt.Sprintf(`vlagent_remotewrite_queue_blocked{path=%q, url=%q}`, queuePath, sanitizedURL), func() float64 { + if fq.IsWriteBlocked() { + return 1 + } + return 0 + }) + + var c *client + switch remoteWriteURL.Scheme { + case "http", "https": + c = newHTTPClient(argIdx, remoteWriteURL.String(), sanitizedURL, fq, *queues) + default: + logger.Fatalf("unsupported scheme: %s for remoteWriteURL: %s, want `http`, `https`", remoteWriteURL.Scheme, sanitizedURL) + } + c.init(argIdx, *queues, sanitizedURL) + + // Initialize pss + plsLen := *queues + if n := cgroup.AvailableCPUs(); plsLen > n { + // There is no sense in running more than availableCPUs concurrent pendingLogs, + // since every pendingLogs can saturate up to a single CPU. 
+ plsLen = n + } + pls := make([]*pendingLogs, plsLen) + for i := range pls { + pls[i] = newPendingLogs(fq) + } + + rwctx := &remoteWriteCtx{ + idx: argIdx, + fq: fq, + c: c, + pls: pls, + } + + return rwctx +} + +func (rwctx *remoteWriteCtx) push(lr *logstorage.LogRows) { + pls := rwctx.pls + idx := rwctx.pssNextIdx.Add(1) % uint64(len(pls)) + pls[idx].add(lr) +} + +func (rwctx *remoteWriteCtx) mustStop() { + for _, ps := range rwctx.pls { + ps.mustStop() + } + rwctx.idx = 0 + rwctx.pls = nil + rwctx.fq.UnblockAllReaders() + rwctx.c.MustStop() + rwctx.c = nil + + rwctx.fq.MustClose() + rwctx.fq = nil +} diff --git a/apptest/testcase.go b/apptest/testcase.go index 973eedae0f..3cd7a5fd13 100644 --- a/apptest/testcase.go +++ b/apptest/testcase.go @@ -433,3 +433,24 @@ func (tc *TestCase) MustStartVlsingle(instance string, flags []string) *Vlsingle tc.addApp(instance, app) return app } + +// MustStartDefaultVlagent is a test helper function that starts an instance of +// vlagent with defaults suitable for most tests. +func (tc *TestCase) MustStartDefaultVlagent(remoteWriteURLs []string) *Vlagent { + tc.t.Helper() + + return tc.MustStartVlagent("vlagent", remoteWriteURLs, nil) +} + +// MustStartVlagent is a test helper function that starts an instance of +// vlagent and fails the test if the app fails to start. 
+func (tc *TestCase) MustStartVlagent(instance string, remoteWriteURLs []string, flags []string) *Vlagent { + tc.t.Helper() + + app, err := StartVlagent(instance, remoteWriteURLs, flags, tc.cli) + if err != nil { + tc.t.Fatalf("Could not start %s: %v", instance, err) + } + tc.addApp(instance, app) + return app +} diff --git a/apptest/tests/vlagent_remotewrite_test.go b/apptest/tests/vlagent_remotewrite_test.go new file mode 100644 index 0000000000..f805a32e27 --- /dev/null +++ b/apptest/tests/vlagent_remotewrite_test.go @@ -0,0 +1,154 @@ +package tests + +import ( + "fmt" + "os" + "path" + "testing" + "time" + + at "github.com/VictoriaMetrics/VictoriaMetrics/apptest" +) + +// TestSingleVlagentRemoteWrite performs tests for remote write data ingestion +// by vlagent application +func TestSingleVlagentRemoteWrite(t *testing.T) { + os.RemoveAll(t.Name()) + tc := at.NewTestCase(t) + defer tc.Stop() + + // test data ingestion into + const instance = "vlsingle" + const r1Port = "50425" + sutFlags := []string{ + "-httpListenAddr=127.0.0.1:" + r1Port, + "-storageDataPath=" + tc.Dir() + "/" + instance, + "-retentionPeriod=100y", + } + + sut := tc.MustStartVlsingle(instance, sutFlags) + remoteWriteURL := fmt.Sprintf("http://%s/internal/insert", sut.HTTPAddr()) + + vlagent := tc.MustStartDefaultVlagent([]string{remoteWriteURL}) + vlagent.JSONLineWrite(t, []string{ + `{"_msg":"ingest jsonline","_time": "2025-06-05T14:30:19.088007Z", "foo":"bar"}`, + `{"_msg":"ingest jsonline","_time": "2025-06-05T14:30:19.088007Z", "bar":"foo"}`, + }, at.QueryOptsLogs{}) + + sut.ForceFlush(t) + got := sut.LogsQLQuery(t, "ingest jsonline", at.QueryOptsLogs{}) + wantLogLines := []string{ + `{"_msg":"ingest jsonline","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","bar":"foo"}`, + `{"_msg":"ingest jsonline","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","foo":"bar"}`, + } + assertLogsQLResponseEqual(t, got, &at.LogsQLQueryResponse{LogLines: wantLogLines}) + + // stop log storage and 
// TestSingleVlagentRemoteWriteReplication verifies that vlagent replicates the ingested logs
// to two vlsingle instances and that the logs buffered for an unavailable instance
// are delivered after restarts of both vlagent and that instance.
func TestSingleVlagentRemoteWriteReplication(t *testing.T) {
	os.RemoveAll(t.Name())
	tc := at.NewTestCase(t)
	defer tc.Stop()

	const (
		instanceReplica0 = "vlsingle-0"
		vlsinglePortR0   = "53541"
		instanceReplica1 = "vlsingle-1"
		vlsinglePortR1   = "53124"
		vlagentInstance  = "vlagent"
	)
	sutFlagsR0 := []string{
		"-httpListenAddr=127.0.0.1:" + vlsinglePortR0,
		"-storageDataPath=" + path.Join(tc.Dir(), instanceReplica0),
		"-retentionPeriod=100y",
	}
	sutFlagsR1 := []string{
		"-httpListenAddr=127.0.0.1:" + vlsinglePortR1,
		"-storageDataPath=" + path.Join(tc.Dir(), instanceReplica1),
		"-retentionPeriod=100y",
	}

	sutR0 := tc.MustStartVlsingle(instanceReplica0, sutFlagsR0)
	sutR1 := tc.MustStartVlsingle(instanceReplica1, sutFlagsR1)

	vlagentRemoteWriteURLs := []string{
		fmt.Sprintf("http://%s/internal/insert", sutR0.HTTPAddr()),
		fmt.Sprintf("http://%s/internal/insert", sutR1.HTTPAddr()),
	}
	// The same -remoteWrite.tmpDataPath is passed to vlagent on restart below,
	// so the restarted vlagent is pointed at the same queue directory.
	vlagentFlags := []string{
		"-remoteWrite.tmpDataPath=" + fmt.Sprintf("%s/%s-%d", os.TempDir(), vlagentInstance, time.Now().UnixNano()),
	}
	vlagent := tc.MustStartVlagent(vlagentInstance, vlagentRemoteWriteURLs, vlagentFlags)

	// Ingest data and check that it is replicated to both vlsingle instances.
	vlagent.JSONLineWrite(t, []string{
		`{"_msg":"ingest jsonline","_time": "2025-06-05T14:30:19.088007Z", "foo":"bar"}`,
		`{"_msg":"ingest jsonline","_time": "2025-06-05T14:30:19.088007Z", "bar":"foo"}`,
	}, at.QueryOptsLogs{})

	wantLogLines := []string{
		`{"_msg":"ingest jsonline","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","bar":"foo"}`,
		`{"_msg":"ingest jsonline","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","foo":"bar"}`,
	}

	sutR0.ForceFlush(t)
	gotR0 := sutR0.LogsQLQuery(t, "ingest jsonline", at.QueryOptsLogs{})
	assertLogsQLResponseEqual(t, gotR0, &at.LogsQLQueryResponse{LogLines: wantLogLines})

	sutR1.ForceFlush(t)
	gotR1 := sutR1.LogsQLQuery(t, "ingest jsonline", at.QueryOptsLogs{})
	assertLogsQLResponseEqual(t, gotR1, &at.LogsQLQueryResponse{LogLines: wantLogLines})

	// Stop the first log storage and check that vlagent buffers the data for it.
	tc.StopApp(instanceReplica0)

	// Ingest more data; vlagent must keep the copy destined for the stopped replica.
	vlagent.JSONLineWrite(t, []string{
		`{"_msg":"ingest jsonline2","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","bar":"foo"}`,
		`{"_msg":"ingest jsonline2","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","foo":"bar"}`,
	}, at.QueryOptsLogs{})

	// Check that the storage which is still alive has received the data.
	wantLogLines = []string{
		`{"_msg":"ingest jsonline2","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","bar":"foo"}`,
		`{"_msg":"ingest jsonline2","_stream":"{}","_time":"2025-06-05T14:30:19.088007Z","foo":"bar"}`,
	}

	sutR1.ForceFlush(t)
	gotR1 = sutR1.LogsQLQuery(t, "ingest jsonline2", at.QueryOptsLogs{})
	assertLogsQLResponseEqual(t, gotR1, &at.LogsQLQueryResponse{LogLines: wantLogLines})

	// Stop vlagent; it must persist the undelivered data on disk.
	tc.StopApp(vlagentInstance)

	vlagent = tc.MustStartVlagent(vlagentInstance, vlagentRemoteWriteURLs, vlagentFlags)
	vlagent.WaitQueueEmptyAfter(t, func() {
		// Start the stopped storage, so the restarted vlagent can deliver the buffered data.
		sutR0 = tc.MustStartVlsingle(instanceReplica0, sutFlagsR0)
	})

	// The data ingested while the first storage was down must be available there now.
	sutR0.ForceFlush(t)
	gotR0 = sutR0.LogsQLQuery(t, "ingest jsonline2", at.QueryOptsLogs{})
	assertLogsQLResponseEqual(t, gotR0, &at.LogsQLQueryResponse{LogLines: wantLogLines})
}
+// collection of records in json line format by sending a HTTP +// POST request to /insert/jsonline vlagent endpoint. +// +// See https://docs.victoriametrics.com/victorialogs/data-ingestion/#json-stream-api +func (app *Vlagent) JSONLineWrite(t *testing.T, records []string, opts QueryOptsLogs) { + t.Helper() + + data := []byte(strings.Join(records, "\n")) + + url := fmt.Sprintf("http://%s/insert/jsonline", app.httpListenAddr) + uv := opts.asURLValues() + uvs := uv.Encode() + if len(uvs) > 0 { + url += "?" + uvs + } + app.sendBlocking(t, len(records), func() { + _, statusCode := app.cli.Post(t, url, "text/plain", data) + if statusCode != http.StatusOK { + t.Fatalf("unexpected status code: got %d, want %d", statusCode, http.StatusOK) + } + }) +} + +// WaitQueueEmptyAfter checks that persistent queue is empty +// after execution of provided callback +func (app *Vlagent) WaitQueueEmptyAfter(t *testing.T, cb func()) { + t.Helper() + const ( + retries = 70 + period = 100 * time.Millisecond + ) + // vlagent_remotewrite_blocks_sent_total + // take in account data replication + blocksSent := app.remoteWriteBlocksSent(t) + cb() + for range retries { + if app.remoteWriteBlocksSent(t) > blocksSent && app.persistentQueueSize(t) == 0 { + return + } + time.Sleep(period) + } + t.Fatalf("timed out while waiting for inserted logs to be flushed to remote storage") + +} + +// sendBlocking sends the data to remote write url by executing `send` function and +// waits until the data is actually sent. +// +// vlagent does not send the data immediately. It first puts the data into a +// buffer. Then a background goroutine takes the data from the buffer sends it +// to the vmstorage. This happens every 1s by default. +// +// Waiting is implemented a retrieving the value of `vlagent_remotewrite_block_size_rows_sum` +// metric and checking whether it is equal or greater than the wanted value. +// If it is, then the data has been sent to remote storage. 
+// +// Unreliable if the records are inserted concurrently. +func (app *Vlagent) sendBlocking(t *testing.T, numRecordsToSend int, send func()) { + t.Helper() + + send() + + const ( + retries = 50 + period = 100 * time.Millisecond + ) + // take in account data replication + wantRowsSentCount := app.remoteWriteRowsPushed(t) + numRecordsToSend*app.remoteStoragesCount + for range retries { + if app.remoteWriteRowsPushed(t) >= wantRowsSentCount { + return + } + time.Sleep(period) + } + t.Fatalf("timed out while waiting for inserted rows to be sent to remote storage") +} + +func (app *Vlagent) remoteWriteBlocksSent(t *testing.T) int { + total := 0.0 + for _, v := range app.GetMetricsByPrefix(t, "vlagent_remotewrite_blocks_sent_total") { + total += v + } + return int(total) +} + +func (app *Vlagent) remoteWriteRowsPushed(t *testing.T) int { + total := 0.0 + // vlagent_remotewrite_blocks_sent_total + for _, v := range app.GetMetricsByPrefix(t, "vlagent_remotewrite_block_size_rows_sum") { + total += v + } + return int(total) +} + +func (app *Vlagent) persistentQueueSize(t *testing.T) int { + total := 0.0 + for _, v := range app.GetMetricsByPrefix(t, "vlagent_remotewrite_pending_data_bytes") { + total += v + } + for _, v := range app.GetMetricsByPrefix(t, "vlagent_remotewrite_pending_inmemory_blocks") { + total += v + } + return int(total) +} diff --git a/dashboards/vlagent.json b/dashboards/vlagent.json new file mode 100644 index 0000000000..f781163427 --- /dev/null +++ b/dashboards/vlagent.json @@ -0,0 +1,5138 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "enable": true, + "expr": 
"sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"}) by(short_version) unless (sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"} offset $__interval) by(short_version))", + "hide": true, + "iconColor": "dark-blue", + "name": "version", + "textFormat": "{{short_version}}", + "titleFormat": "Version change" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "enable": true, + "expr": "sum(changes(vm_app_start_timestamp{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(job, instance)", + "hide": false, + "iconColor": "dark-yellow", + "name": "restarts", + "textFormat": "{{job}}:{{instance}} restarted" + } + ] + }, + "description": "Overview for VictoriaMetrics vlagent v1.117.0 or higher", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 22, + "links": [ + { + "icon": "doc", + "tags": [], + "targetBlank": true, + "title": "vlagent wiki", + "tooltip": "", + "type": "link", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Found a bug?", + "type": "link", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/issues" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "New releases", + "type": "link", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases" + }, + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "Troubleshooting", + "tooltip": "", + "type": "link", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 105, + "panels": [], + "title": "Stats", + "type": "row" + }, + { + "description": "See [latest releases](https://docs.victoriametrics.com/victorialogs/changelog/).", + 
"gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 162, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "
$version
", + "mode": "markdown" + }, + "pluginVersion": "9.2.3", + "title": "Version", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows number of generated error messages in logs over last 30m. Non-zero value may be a sign of connectivity or misconfiguration errors.", + "fieldConfig": { + "defaults": { + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 16, + "links": [ + { + "targetBlank": true, + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Log errors (30m)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Persistent queue size shows size of pending logs in bytes which hasn't been flushed to remote storage yet. \nIncreasing of value might be a sign of connectivity issues. 
In such cases, vlagent starts to flush pending data on disk with attempt to send it later once connection is restored.", + "fieldConfig": { + "defaults": { + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 10485760 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 56, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(vlagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Persistent queue size", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the cumulative number of log entries ingested. 
\n\nThe size is calculated before compression.", + "fieldConfig": { + "defaults": { + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 102, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vl_bytes_ingested_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Ingested bytes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Total number of available CPUs for selected vlagents. 
", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 152, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Available CPU", + "type": "stat" + }, + { + "columns": [], + "datasource": { + "uid": "$ds" + }, + "fontSize": "100%", + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 164, + "scroll": true, + "showHeader": true, + "sort": { + "col": 3, + "desc": false + }, + "styles": [ + { + "alias": "uptime", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ + "1800", + "3600" + ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [], + "type": "string", + 
"unit": "short" + }, + { + "alias": "", + "align": "auto", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "datasource": { + "uid": "$ds" + }, + "expr": "sort((time() - vm_app_start_timestamp{job=~\"$job\", instance=~\"$instance\"}) or (up{job=~\"$job\", instance=~\"$instance\"}))", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "title": "Uptime", + "transform": "table", + "type": "table-old" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 15, + "x": 0, + "y": 4 + }, + "hiddenSeries": false, + "id": 166, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.2.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "datasource": { + "uid": "$ds" + }, + "expr": "sort(sum(up{job=~\"$job\", instance=~\"$instance\"}) by (job, instance))", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Uptime", + "tooltip": { + "shared": 
true, + "sort": 1, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 2 + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Total size of available memory for selected vlagents.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 4 + }, + "id": 153, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Available memory", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 24, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows ingestion rate in number of log entries and bytes per second.", + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*(bytes)/" + }, + "properties": [ + { + "id": "custom.axisPlacement", + "value": "right" + }, + { + "id": "unit", + "value": "bytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vl_rows_ingested_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (type) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{type}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vl_bytes_ingested_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (type) > 0", + "hide": false, + "instant": false, 
+ "legendFormat": "{{type}} (bytes)", + "range": true, + "refId": "B" + } + ], + "title": "Logs ingestion rate ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the persistent queue size of pending logs in bytes >2MB which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. In such cases, vlagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\nClick on the line and choose Drilldown to show the persistent queue size per instance.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/G7Z9GzMGz?viewPanel=125&var-url=${__field.labels.url}&var-ds=$ds&var-instance=$instance&var-job=$job&${__url_time_range}" + } + ], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 17, + "links": [ + { + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + 
"options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(vlagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}) by (job, url) > 2e6", + "interval": "", + "legendFormat": "{{job}} => {{url}}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent queue size to ($url)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "* `*` - unsupported query path\n* `/insert` - [inserts](https://docs.victoriametrics.com/victorialogs/data-ingestion/)\n* `/select` - [reads](https://docs.victoriametrics.com/victorialogs/querying/)\n* `/metrics` - scraping of system metrics", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + 
"id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vl_http_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (path) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{path}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests rate ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows rate of dropped logs from persistent queue. vlagent drops log lines from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + 
"x": 12, + "y": 17 + }, + "id": 49, + "links": [ + { + "targetBlank": true, + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_persistentqueue_bytes_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (path) > 0", + "interval": "", + "legendFormat": "{{ path }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent queue dropped rate ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of logging the messages by their level. 
Unexpected spike in rate is a good reason to check logs.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 107, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(vm_log_messages_total{job=~\"$job\",instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (job, level) > 0", + "format": "time_series", + "hide": false, + "interval": "5m", + "intervalFactor": 1, + "legendFormat": "{{job}} - {{level}}", + "range": true, + "refId": "A" + } + ], + "title": "Logging rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Errors rate shows rate for multiple metrics that track possible errors in vlagent, such as network or parsing 
errors.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 69, + "links": [ + { + "targetBlank": true, + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_http_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, protocol) > 0", + "interval": "", + "legendFormat": "requests:{{protocol}} ({{job}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_protoparser_read_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, type) > 0", + "interval": "", + "legendFormat": "parse: 
{{type}} ({{job}})", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, type) > 0", + "interval": "", + "legendFormat": "ingest: {{type}} ({{job}})", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_protoparser_unmarshal_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, type) > 0", + "interval": "", + "legendFormat": "unmarshal: {{type}} ({{job}})", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_promscrape_dial_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) > 0", + "interval": "", + "legendFormat": "scrape dial ({{job}})", + "range": true, + "refId": "E" + } + ], + "title": "Errors rate ", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 45, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Percentage of used RSS memory (resident).\nThe RSS memory shows the amount of memory recently accessed by the application. 
It includes anonymous memory and data from recently accessed files (aka page cache).\nThe application's performance will significantly degrade when memory usage is close to 100%.\n\nClick on the line and choose Drilldown to show memory usage per instance", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/G7Z9GzMGz?viewPanel=117&var-job=${__field.labels.job}&var-ds=$ds&var-instance=$instance&${__url_time_range}" + } + ], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 111, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job\", 
instance=~\"$instance\"}\n) by(job)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RSS memory % usage ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/G7Z9GzMGz?viewPanel=119&var-job=${__field.labels.job}&var-ds=$ds&var-instance=$instance&${__url_time_range}" + } + ], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 157, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_cpu_cores_available{job=~\"$job\", 
instance=~\"$instance\"}\n) by(instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU % usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Share of memory allocated by the process itself. When memory usage reaches 100%, the process will likely be OOM-killed.\nSafe memory usage is considered to be below 80%.\n\nClick on the line and choose Drilldown to show memory usage per instance", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 155, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(\n 
max_over_time(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(instance)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RSS anonymous memory % usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows CPU pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).\n\nThe lower the better.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 158, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_cpu_waiting_seconds_total{job=~\"$job\"}[$__rate_interval])) 
by (job, instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - waiting", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_cpu_stalled_seconds_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - stalled", + "range": true, + "refId": "B" + } + ], + "title": "CPU pressure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows memory pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).\n\nThe lower the better.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 156, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + 
"sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_memory_waiting_seconds_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - waiting", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_memory_stalled_seconds_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - stalled", + "range": true, + "refId": "B" + } + ], + "title": "Memory pressure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the number of bytes read from and written to the storage layer when vlagent has to buffer data on disk or read already buffered data.\n\nClick on the line and choose Drilldown to show disk writes/reads per instance", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [ + { + "targetBlank": true, + "title": "Drilldown", + "url": "/d/G7Z9GzMGz?viewPanel=121&var-job=${__field.labels.job}&var-ds=$ds&var-instance=$instance&${__url_time_range}" + } + ], + "mappings": 
[], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "read" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 81, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_io_storage_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) ", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read {{job}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_io_storage_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write {{job}}", + "range": true, + "refId": "B" + } + ], + "title": "Disk writes/reads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(max_over_time(go_goroutines{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Goroutines ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows IO pressure based on [Pressure Stall Information](https://docs.kernel.org/accounting/psi.html).\n\nThe lower the better.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 159, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_io_waiting_seconds_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - waiting", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_pressure_io_stalled_seconds_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} - stalled", + "range": true, + "refId": "B" + } + ], + "title": "IO pressure", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(max_over_time(process_num_threads{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Threads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Network usage shows the bytes rate for data accepted by vlagent and pushed via remotewrite protocol.\nDiscrepancies are possible because of different protocols used for ingesting, scraping and writing data.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_tcplistener_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) * 8 \n+ sum(rate(vm_promscrape_conn_bytes_read_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) * 8", + "interval": "", + "legendFormat": "in {{job}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vlagent_remotewrite_conn_bytes_written_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) * 8", + "interval": "", + "legendFormat": "out {{job}}", + "range": true, + "refId": "B" + } + ], + "title": "Network usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the percent of CPU spent on garbage collection.\n\nIf % is high, then CPU usage can be decreased by changing GOGC to higher values. 
Increasing GOGC value will increase memory usage, and decrease CPU usage.\n\nTry searching for keyword `GOGC` at https://docs.victoriametrics.com/victoriametrics/troubleshooting/ ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 135, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(\n rate(go_gc_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) \n / rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n ) by(job)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU spent on GC ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$ds" + }, + "description": "Shows the time goroutines have spent in runnable state before actually running. The lower is better.\n\nHigh values or values exceeding the threshold is usually a sign of insufficient CPU resources or CPU throttling. \n\nVerify that service has enough CPU resources. Otherwise, the service could work unreliably with delays in processing.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.1 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 149, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket{job=~\"$job\"}[$__rate_interval])) by (job, instance, le))) by(job)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + 
"title": "Go scheduling latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of allocations in memory. Sudden increase in allocations would mean increased pressure on Go Garbage Collector and can saturate CPU resources of the application.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 154, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(go_memstats_alloc_bytes_total{job=~\"$job\"}[$__rate_interval])) by (job, instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Memory allocations rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + 
"description": "Panel shows the percentage of open file descriptors in the OS per instance.\nReaching the limit of open files (100%) can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 5, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 83, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(\n max_over_time(process_open_fds{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n process_max_fds{job=~\"$job\", instance=~\"$instance\"}\n) by(job)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Open FDs usage 
%", + "type": "timeseries" + } + ], + "title": "Resource usage", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 94, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows write saturation of the persistent queue. If the threshold of 0.9sec is reached, then the persistent queue is saturated by more than 90% and vlagent won't be able to keep up with flushing data on disk. In this case, consider to decrease load on the vlagent or improve the disk throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 2, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 98, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"max(rate(vm_persistentqueue_write_duration_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (job)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Persistent queue write saturation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows read saturation of the persistent queue. If the threshold of 0.9sec is reached, then the persistent queue is saturated by more than 90% and vlagent won't be able to keep up with reading data from the disk. In this case, consider to decrease load on the vlagent or improve the disk throughput.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 2, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 99, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max(rate(vm_persistentqueue_read_duration_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (job)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Persistent queue read saturation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of dropped data blocks in cases when remote storage replies with `400 Bad Request` and `409 Conflict` HTTP responses.\n\nSee https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1149", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 79, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(vlagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by(job, url) > 0", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Data blocks dropped to ($url)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of dropped log lines in cases when -remoteWrite.dropSamplesOnOverload or multiple -remoteWrite.disableOnDiskQueue options are set", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 160, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(rate(vlagent_remotewrite_loglines_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by(job, url) > 0", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Log Lines dropped to ($url)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 129, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "job" + } + ] + }, + "pluginVersion": "9.2.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(flag{is_set=\"true\", job=~\"$job\", instance=~\"$instance\"}) by(job, instance, name, value)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Non-default flags", + "transformations": [ + { + "id": "groupBy", + "options": { + "fields": { + "instance": { + "aggregations": [] + }, + "job": { + "aggregations": [], + "operation": "groupby" + }, + "name": { + "aggregations": [], + "operation": "groupby" + }, + "value": { + "aggregations": [], + 
"operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the number of restarts per job. The chart can be useful to identify periodic process restarts and correlate them with potential issues or anomalies. Normally, processes shouldn't restart unless restart was inited by user. The reason of restarts should be figured out by checking the logs of each specific service. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 150, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(changes(vm_app_start_timestamp{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]) > 0) by(job)", + "format": "time_series", + 
"instant": false, + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Restarts ($job)", + "type": "timeseries" + } + ], + "title": "Troubleshooting", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 71, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows how many concurrent inserts are taking place.\n\nIf the number of concurrent inserts hitting the `limit` or is close to the `limit` constantly - it might be a sign of a resource shortage.\n\n If vlagent's CPU usage and remote write connection saturation are at normal level, it might be that `-maxConcurrentInserts` cmd-line flag need to be increased.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 130, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max_over_time(vm_concurrent_insert_current{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{instance}} ({{job}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "min(vm_concurrent_insert_capacity{job=~\"$job\", instance=~\"$instance\"}) by(job)", + "interval": "", + "legendFormat": "limit ({{job}})", + "range": true, + "refId": "B" + } + ], + "title": "Concurrent inserts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of write errors in ingestserver (UDP, TCP connections) and HTTP server.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 77, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.2.6", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": true, + "expr": "sum(rate(vm_ingestserver_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(type, net) > 0", + "interval": "", + "legendFormat": "{{ type }} ({{net}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": true, + "expr": "sum(rate(vlagent_http_request_errors_total{job=~\"$job\", instance=~\"$instance\", protocol!=\"\"}[$__rate_interval])) by(protocol) > 0", + "interval": "", + "legendFormat": "{{ protocol }} (http)", + "refId": "B" + } + ], + "title": "Error rate ", + "type": "timeseries" + } + ], + "title": "Ingestion", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 36 + }, + "id": 58, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of requests to configured remote write endpoints by url and status code.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, 
+ "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 60, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(vlagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by(job, url, status_code) > 0", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Requests rate to ($url)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the global rate for number of written bytes via remote write connections.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 66, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(vlagent_remotewrite_conn_bytes_written_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job) > 0", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Bytes write rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows requests retry rate by url. Number of retries is unlimited but protected with delays up to 1m between attempts.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 61, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(vlagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by(url) > 0", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Retry rate to ($url)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows current number of established connections to remote write endpoints.\n\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 65, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(max_over_time(vlagent_remotewrite_conns{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows saturation of every connection to remote storage. If the threshold of 90% is reached, then the connection is saturated (busy or slow) by more than 90%, so vlagent won't be able to keep up and can start buffering data. \n\nThis usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.\n", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max(\n rate(vlagent_remotewrite_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])\n /\n vlagent_remotewrite_queues{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}\n) by(job, url)", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Remote write connection saturation", + "type": "timeseries" + } + ], + "title": "Remote write", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 113, + "panels": [ + { + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 115, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "Drilldown row is used by other panels on the dashboard to show more detailed metrics per-instance.", + "mode": "markdown" + }, + "pluginVersion": "9.2.3", + "transparent": true, + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + 
"unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 119, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}} ({{job}})", + "range": true, + "refId": "A" + } + ], + "title": "CPU usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the used memory (resident).\nThe application's performance will significantly degrade when memory usage is close to 100%.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 
+ }, + "id": 117, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{instance}} ({{job}})", + "range": true, + "refId": "A" + } + ], + "title": "RSS memory usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the persistent queue size in bytes of pending log data which hasn't been flushed to remote storage yet. \n\nAn increasing value might be a sign of connectivity issues. 
In such cases, vlagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 125, + "links": [ + { + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/victorialogs/vlagent/#troubleshooting" + } + ], + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(vlagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}) by (instance, url)", + "interval": "", + "legendFormat": "{{instance}} => {{url}}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent queue size to ($url)", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the number of bytes read/write from the storage layer when vlagent has to buffer data on disk or read already buffered data.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "read" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 121, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_io_storage_read_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read {{instance}} {{job}}", 
+ "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(process_io_storage_written_bytes_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job,instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write {{instance}} {{job}}", + "range": true, + "refId": "B" + } + ], + "title": "Disk writes/reads", + "type": "timeseries" + } + ], + "title": "Drilldown", + "type": "row" + } + ], + "refresh": "", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "victoriametrics" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "ds", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vm_app_version{version=~\"^vlagent.*\"}, job)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "job", + "options": [], + "query": { + "query": "label_values(vm_app_version{version=~\"^vlagent.*\"}, job)", + "refId": "VictoriaMetrics-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "localhost:9429" + ], + "value": [ + "localhost:9429" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": 
"VictoriaMetrics-instance-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vlagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\"}, url)", + "description": "The remote write URLs", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "url", + "options": [], + "query": { + "query": "label_values(vlagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\"}, url)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${ds}" + }, + "definition": "label_values(vm_app_version{job=~\"$job\", instance=~\"$instance\"},short_version)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "version", + "options": [], + "query": { + "query": "label_values(vm_app_version{job=~\"$job\", instance=~\"$instance\"},short_version)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "baseFilters": [], + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "filters": [], + "hide": 0, + "name": "adhoc", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "VictoriaMetrics - vlagent", + "uid": "Y5Z9GzMGz", + "version": 5, + "weekStart": "" +} diff --git a/docs/victorialogs/vlagent.md b/docs/victorialogs/vlagent.md new file mode 100644 index 0000000000..9e64084383 
--- /dev/null +++ b/docs/victorialogs/vlagent.md @@ -0,0 +1,508 @@ +--- +weight: 3 +menu: + docs: + parent: victorialogs + weight: 3 +title: vlagent +tags: + - logs +aliases: + - /vlagent.html + - /vlagent/index.html + - /vlagent/ +--- + +`vlagent` is a tiny agent which helps you collect logs from various sources +and store them in [VictoriaLogs](https://docs.victoriametrics.com/victorialogs/). +See [Quick Start](#quick-start) for details. + + +## Motivation + +While VictoriaLogs provides an efficient solution to store and observe logs, it lacks replication out of the box. +The previous solution was to configure clients to replicate log streams into multiple VictoriaLogs installations. +`vlagent` is the missing piece of log streams replication. + +## Features + +* Can accept logs from popular log collectors. See [these docs](https://docs.victoriametrics.com/victorialogs/data-ingestion/). +* Can replicate collected logs simultaneously to multiple VictoriaLogs instances - see [these docs](#replication-and-high-availability). +* Works smoothly in environments with unstable connections to remote storage. If the remote storage is unavailable, the collected logs + are buffered at `-remoteWrite.tmpDataPath`. The buffered logs are sent to remote storage as soon as the connection + to the remote storage is repaired. The maximum disk usage for the buffer can be limited with `-remoteWrite.maxDiskUsagePerURL`. 
+ +## Quick Start + +Please download `vlagent` archive from [releases page](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/latest) ( +`vlagent` is also available as docker images at [Docker Hub](https://hub.docker.com/r/victoriametrics/vlagent/tags) and [Quay](https://quay.io/repository/victoriametrics/vlagent?tab=tags)), +unpack it and pass the following flags to the `vlagent` binary in order to start sending the data to the VictoriaLogs remote storage: + +* `-remoteWrite.url` with VictoriaLogs native protocol compatible remote storage endpoint, where to send the data to. + The `-remoteWrite.url` may refer to [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address. See [these docs](#srv-urls) for details. + +Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vlagent) +to [single-node VictoriaLogs](https://docs.victoriametrics.com/victorialogs) located at `victoria-logs-host:9428`: + +```sh +/path/to/vlagent -remoteWrite.url=https://victoria-logs-host:9428/internal/insert +``` + +Pass `-help` to `vlagent` in order to see [the full list of supported command-line flags with their descriptions](#advanced-usage). + +### Replication and high availability + +`vlagent` replicates the collected logs among multiple remote storage instances configured via `-remoteWrite.url` args. +If a single remote storage instance is temporarily out of service, then the collected data remains available in another remote storage instance. +`vlagent` buffers the collected data in files at `-remoteWrite.tmpDataPath` until the remote storage becomes available again, +and then it sends the buffered data to the remote storage in order to prevent data gaps. + +## Monitoring + +`vlagent` exports various metrics in Prometheus exposition format at `http://vlagent-host:9429/metrics` page. 
+We recommend setting up regular scraping of this page either through `vmagent` or by a Prometheus-compatible scraper, +so that the exported metrics may be analyzed later. + +Use the official [Grafana dashboard](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards/vlagent.json) for `vlagent` state overview. +Graphs on this dashboard contain useful hints - hover the `i` icon at the top left corner of each graph in order to read them. +If you have suggestions for improvements or have found a bug - please open an issue on github or add a review to the dashboard. + +## Troubleshooting + +* It is recommended [setting up the official Grafana dashboard](#monitoring) in order to monitor the state of `vlagent`. + +* It is recommended increasing `-remoteWrite.queues` if `vlagent_remotewrite_pending_data_bytes` [metric](#monitoring) + grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` command-line flag in this case. + This can improve data ingestion performance to the configured remote storage systems at the cost of higher memory usage. + +* If you see gaps in the data pushed by `vlagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set, + try increasing `-remoteWrite.queues`. Such gaps may appear because `vlagent` cannot keep up with sending the collected data to remote storage. + Therefore, it starts dropping the buffered data if the on-disk buffer size exceeds `-remoteWrite.maxDiskUsagePerURL`. + +* `vlagent` drops data blocks if remote storage replies with `400 Bad Request` or `404 Not Found` HTTP responses. + The number of dropped blocks can be monitored via `vlagent_remotewrite_packets_dropped_total` metric exported at [/metrics page](#monitoring). + +* `vlagent` buffers collected data at the `-remoteWrite.tmpDataPath` directory until it is sent to `-remoteWrite.url`. 
+ The directory can grow large when remote storage is unavailable for extended periods of time and if the maximum directory size isn't limited + with `-remoteWrite.maxDiskUsagePerURL` command-line flag. + If you don't want to send all the buffered data from the directory to remote storage then simply stop `vlagent` and delete the directory. + +* By default `vlagent` masks `-remoteWrite.url` with `secret-url` values in logs and at `/metrics` page because + the url may contain sensitive information such as auth tokens or passwords. + Pass `-remoteWrite.showURL` command-line flag when starting `vlagent` in order to see all the valid urls. + +See also: + +- [General Troubleshooting](https://docs.victoriametrics.com/victoriametrics/troubleshooting/) + + +## Profiling + +`vlagent` provides handlers for collecting the following [Go profiles](https://blog.golang.org/profiling-go-programs): + +* Memory profile can be collected with the following command (replace `0.0.0.0` with hostname if needed): + + +```sh +curl http://0.0.0.0:9429/debug/pprof/heap > mem.pprof +``` + + +* CPU profile can be collected with the following command (replace `0.0.0.0` with hostname if needed): + + +```sh +curl http://0.0.0.0:9429/debug/pprof/profile > cpu.pprof +``` + + +The command for collecting CPU profile waits for 30 seconds before returning. + +The collected profiles may be analyzed with [go tool pprof](https://github.com/google/pprof). + +It is safe sharing the collected profiles from security point of view, since they do not contain sensitive information. + +## Advanced usage + +`vlagent` can be fine-tuned with various command-line flags. Run `./vlagent -help` in order to see the full list of these flags with their descriptions and default values: + +```bash +vlagent collects logs via popular data ingestion protocols and routes it to VictoriaLogs. + +See the docs at https://docs.victoriametrics.com/victorialogs/vlagent/ . 
+ + -blockcache.missesBeforeCaching int + The number of cache misses before putting the block into cache. Higher values may reduce indexdb/dataBlocks cache size at the cost of higher CPU and disk read usage (default 2) + -datadog.ignoreFields array + Comma-separated list of fields to ignore for logs ingested via DataDog protocol. See https://docs.victoriametrics.com/victorialogs/data-ingestion/datadog-agent/#dropping-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -datadog.maxRequestSize size + The maximum size in bytes of a single DataDog request + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864) + -datadog.streamFields array + Comma-separated list of fields to use as log stream fields for logs ingested via DataDog protocol. See https://docs.victoriametrics.com/victorialogs/data-ingestion/datadog-agent/#stream-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -defaultMsgValue string + Default value for _msg field if the ingested log entry doesn't contain it; see https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field (default "missing _msg field; see https://docs.victoriametrics.com/victorialogs/keyconcepts/#message-field") + -elasticsearch.version string + Elasticsearch version to report to client (default "8.9.0") + -enableTCP6 + Whether to enable IPv6 for listening and dialing. By default, only IPv4 TCP and UDP are used + -envflag.enable + Whether to enable reading flags from environment variables in addition to the command line. Command line flag values have priority over values from environment vars. Flags are read only from the command line if this flag isn't set. 
See https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#environment-variables for more details + -envflag.prefix string + Prefix for environment variables if -envflag.enable is set + -filestream.disableFadvise + Whether to disable fadvise() syscall when reading large data files. The fadvise() syscall prevents from eviction of recently accessed data from OS page cache during background merges and backups. In some rare cases it is better to disable the syscall if it uses too much CPU + -flagsAuthKey value + Auth key for /flags endpoint. It must be passed via authKey query arg. It overrides -httpAuth.* + Flag value can be read from the given file when using -flagsAuthKey=file:///abs/path/to/file or -flagsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -flagsAuthKey=http://host/path or -flagsAuthKey=https://host/path + -fs.disableMmap + Whether to use pread() instead of mmap() for reading data files. By default, mmap() is used for 64-bit arches and pread() is used for 32-bit arches, since they cannot read data files bigger than 2^32 bytes in memory. mmap() is usually faster for reading small data chunks than pread() + -http.connTimeout duration + Incoming connections to -httpListenAddr are closed after the configured timeout. This may help evenly spreading load among a cluster of services behind TCP-level load balancer. Zero value disables closing of incoming connections (default 2m0s) + -http.disableCORS + Disable CORS for all origins (*) + -http.disableKeepAlive + Whether to disable HTTP keep-alive for incoming connections at -httpListenAddr + -http.disableResponseCompression + Disable compression of HTTP responses to save CPU resources. 
By default, compression is enabled to save network bandwidth + -http.header.csp string + Value for 'Content-Security-Policy' header, recommended: "default-src 'self'" + -http.header.frameOptions string + Value for 'X-Frame-Options' header + -http.header.hsts string + Value for 'Strict-Transport-Security' header, recommended: 'max-age=31536000; includeSubDomains' + -http.idleConnTimeout duration + Timeout for incoming idle http connections (default 1m0s) + -http.maxGracefulShutdownDuration duration + The maximum duration for a graceful shutdown of the HTTP server. A highly loaded server may require increased value for a graceful shutdown (default 7s) + -http.pathPrefix string + An optional prefix to add to all the paths handled by http server. For example, if '-http.pathPrefix=/foo/bar' is set, then all the http requests will be handled on '/foo/bar/*' paths. This may be useful for proxied requests. See https://www.robustperception.io/using-external-urls-and-proxies-with-prometheus + -http.shutdownDelay duration + Optional delay before http server shutdown. During this delay, the server returns non-OK responses from /health page, so load balancers can route new requests to other servers + -httpAuth.password value + Password for HTTP server's Basic Auth. The authentication is disabled if -httpAuth.username is empty + Flag value can be read from the given file when using -httpAuth.password=file:///abs/path/to/file or -httpAuth.password=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -httpAuth.password=http://host/path or -httpAuth.password=https://host/path + -httpAuth.username string + Username for HTTP server's Basic Auth. The authentication is disabled if empty. See also -httpAuth.password + -httpListenAddr array + TCP address to listen for incoming http requests. Set this flag to empty value in order to disable listening on any port. This mode may be useful for running multiple vlagent instances on the same server. 
Note that /targets and /metrics pages aren't available if -httpListenAddr=''. See also -tls and -httpListenAddr.useProxyProtocol + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -httpListenAddr.useProxyProtocol array + Whether to use proxy protocol for connections accepted at the corresponding -httpListenAddr . See https://www.haproxy.org/download/1.8/doc/proxy-protocol.txt . With enabled proxy protocol http server cannot serve regular /metrics endpoint. Use -pushmetrics.url for metrics pushing + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. + -insert.disable + Whether to disable /insert/* HTTP endpoints + -insert.maxFieldsPerLine int + The maximum number of log fields per line, which can be read by /insert/* handlers; see https://docs.victoriametrics.com/victorialogs/faq/#how-many-fields-a-single-log-entry-may-contain (default 1000) + -insert.maxLineSizeBytes size + The maximum size of a single line, which can be read by /insert/* handlers; see https://docs.victoriametrics.com/victorialogs/faq/#what-length-a-log-record-is-expected-to-have + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 262144) + -insert.maxQueueDuration duration + The maximum duration to wait in the queue when -maxConcurrentInserts concurrent insert requests are executed (default 1m0s) + -internStringCacheExpireDuration duration + The expiry duration for caches for interned strings. See https://en.wikipedia.org/wiki/String_interning . See also -internStringMaxLen and -internStringDisableCache (default 6m0s) + -internStringDisableCache + Whether to disable caches for interned strings. This may reduce memory usage at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . 
See also -internStringCacheExpireDuration and -internStringMaxLen + -internStringMaxLen int + The maximum length for strings to intern. A lower limit may save memory at the cost of higher CPU usage. See https://en.wikipedia.org/wiki/String_interning . See also -internStringDisableCache and -internStringCacheExpireDuration (default 500) + -internalinsert.disable + Whether to disable /internal/insert HTTP endpoint + -internalinsert.maxRequestSize size + The maximum size in bytes of a single request, which can be accepted at /internal/insert HTTP endpoint + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864) + -journald.ignoreFields array + Comma-separated list of fields to ignore for logs ingested over journald protocol. See https://docs.victoriametrics.com/victorialogs/data-ingestion/journald/#dropping-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -journald.includeEntryMetadata + Include Journald fields with double underscore prefixes + -journald.streamFields array + Comma-separated list of fields to use as log stream fields for logs ingested over journald protocol. See https://docs.victoriametrics.com/victorialogs/data-ingestion/journald/#stream-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -journald.tenantID string + TenantID for logs ingested via the Journald endpoint. See https://docs.victoriametrics.com/victorialogs/data-ingestion/journald/#multitenancy (default "0:0") + -journald.timeField string + Field to use as a log timestamp for logs ingested via journald protocol. 
See https://docs.victoriametrics.com/victorialogs/data-ingestion/journald/#time-field (default "__REALTIME_TIMESTAMP") + -loggerDisableTimestamps + Whether to disable writing timestamps in logs + -loggerErrorsPerSecondLimit int + Per-second limit on the number of ERROR messages. If more than the given number of errors are emitted per second, the remaining errors are suppressed. Zero values disable the rate limit + -loggerFormat string + Format for logs. Possible values: default, json (default "default") + -loggerJSONFields string + Allows renaming fields in JSON formatted logs. Example: "ts:timestamp,msg:message" renames "ts" to "timestamp" and "msg" to "message". Supported fields: ts, level, caller, msg + -loggerLevel string + Minimum level of errors to log. Possible values: INFO, WARN, ERROR, FATAL, PANIC (default "INFO") + -loggerMaxArgLen int + The maximum length of a single logged argument. Longer arguments are replaced with 'arg_start..arg_end', where 'arg_start' and 'arg_end' is prefix and suffix of the arg with the length not exceeding -loggerMaxArgLen / 2 (default 5000) + -loggerOutput string + Output for the logs. Supported values: stderr, stdout (default "stderr") + -loggerTimezone string + Timezone to use for timestamps in logs. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 or Local (default "UTC") + -loggerWarnsPerSecondLimit int + Per-second limit on the number of WARN messages. If more than the given number of warns are emitted per second, then the remaining warns are suppressed. 
Zero values disable the rate limit + -loki.disableMessageParsing + Whether to disable automatic parsing of JSON-encoded log fields inside Loki log message into distinct log fields + -loki.maxRequestSize size + The maximum size in bytes of a single Loki request + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864) + -maxConcurrentInserts int + The maximum number of concurrent insert requests. Set higher value when clients send data over slow networks. Default value depends on the number of available CPU cores. It should work fine in most cases since it minimizes resource usage. See also -insert.maxQueueDuration (default 20) + -memory.allowedBytes size + Allowed size of system memory VictoriaMetrics caches may occupy. This option overrides -memory.allowedPercent if set to a non-zero value. Too low a value may increase the cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache resulting in higher disk IO usage + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 0) + -memory.allowedPercent float + Allowed percent of system memory VictoriaMetrics caches may occupy. See also -memory.allowedBytes. Too low a value may increase cache miss rate usually resulting in higher CPU and disk IO usage. Too high a value may evict too much data from the OS page cache which will result in higher disk IO usage (default 60) + -metrics.exposeMetadata + Whether to expose TYPE and HELP metadata at the /metrics page, which is exposed at -httpListenAddr . The metadata may be needed when the /metrics page is consumed by systems, which require this information. For example, Managed Prometheus in Google Cloud - https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type + -metricsAuthKey value + Auth key for /metrics endpoint. It must be passed via authKey query arg. 
It overrides -httpAuth.* + Flag value can be read from the given file when using -metricsAuthKey=file:///abs/path/to/file or -metricsAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -metricsAuthKey=http://host/path or -metricsAuthKey=https://host/path + -opentelemetry.maxRequestSize size + The maximum size in bytes of a single OpenTelemetry request + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 67108864) + -pprofAuthKey value + Auth key for /debug/pprof/* endpoints. It must be passed via authKey query arg. It overrides -httpAuth.* + Flag value can be read from the given file when using -pprofAuthKey=file:///abs/path/to/file or -pprofAuthKey=file://./relative/path/to/file . Flag value can be read from the given http/https url when using -pprofAuthKey=http://host/path or -pprofAuthKey=https://host/path + -pushmetrics.disableCompression + Whether to disable request body compression when pushing metrics to every -pushmetrics.url + -pushmetrics.extraLabel array + Optional labels to add to metrics pushed to every -pushmetrics.url . For example, -pushmetrics.extraLabel='instance="foo"' adds instance="foo" label to all the metrics pushed to every -pushmetrics.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -pushmetrics.header array + Optional HTTP request header to send to every -pushmetrics.url . For example, -pushmetrics.header='Authorization: Basic foobar' adds 'Authorization: Basic foobar' header to every request to every -pushmetrics.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -pushmetrics.interval duration + Interval for pushing metrics to every -pushmetrics.url (default 10s) + -pushmetrics.url array + Optional URL to push metrics exposed at /metrics page. See https://docs.victoriametrics.com/victoriametrics/single-server-victoriametrics/#push-metrics . By default, metrics exposed at /metrics page aren't pushed to any remote storage + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.basicAuth.password array + Optional basic auth password to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.basicAuth.passwordFile array + Optional path to basic auth password to use for the corresponding -remoteWrite.url. The file is re-read every second + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.basicAuth.username array + Optional basic auth username to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.bearerToken array + Optional bearer auth token to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.bearerTokenFile array + Optional path to bearer token file to use for the corresponding -remoteWrite.url. 
The token is re-read from the file every second + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.flushInterval duration + Interval for flushing the data to remote storage. This option takes effect only when less than 2MB of data per second are pushed to -remoteWrite.url (default 1s) + -remoteWrite.headers array + Optional HTTP headers to send with each request to the corresponding -remoteWrite.url. For example, -remoteWrite.headers='My-Auth:foobar' would send 'My-Auth: foobar' HTTP header with every request to the corresponding -remoteWrite.url. Multiple headers must be delimited by '^^': -remoteWrite.headers='header1:value1^^header2:value2' + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.maxBlockSize size + The maximum block size to send to remote storage. Bigger blocks may improve performance at the cost of the increased memory usage. + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB (default 8388608) + -remoteWrite.maxDiskUsagePerURL array + The maximum file-based buffer size in bytes at -remoteWrite.tmpDataPath for each -remoteWrite.url. When buffer size reaches the configured maximum, then old data is dropped when adding new data to the buffer. Buffered data is stored in ~500MB chunks. It is recommended to set the value for this flag to a multiple of the block size 500MB. Disk usage is unlimited if the value is set to 0 + Supports the following optional suffixes for size values: KB, MB, GB, TB, KiB, MiB, GiB, TiB. (default 0) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. 
+ -remoteWrite.oauth2.clientID array + Optional OAuth2 clientID to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.oauth2.clientSecret array + Optional OAuth2 clientSecret to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.oauth2.clientSecretFile array + Optional OAuth2 clientSecretFile to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.oauth2.endpointParams array + Optional OAuth2 endpoint parameters to use for the corresponding -remoteWrite.url . The endpoint parameters must be set in JSON format: {"param1":"value1",...,"paramN":"valueN"} + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.oauth2.scopes array + Optional OAuth2 scopes to use for the corresponding -remoteWrite.url. Scopes must be delimited by ';' + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.oauth2.tokenUrl array + Optional OAuth2 tokenURL to use for the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.proxyURL array + Optional proxy URL for writing data to the corresponding -remoteWrite.url. 
Supported proxies: http, https, socks5. Example: -remoteWrite.proxyURL=socks5://proxy:1234 + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.queues int + The number of concurrent queues to each -remoteWrite.url. Set more queues if default number of queues isn't enough for sending high volume of collected data to remote storage. Default value depends on the number of available CPU cores. It should work fine in most cases since it minimizes resource usage (default 20) + -remoteWrite.rateLimit array + Optional rate limit in bytes per second for data sent to the corresponding -remoteWrite.url. By default, the rate limit is disabled. It can be useful for limiting load on remote storage when big amounts of buffered data are sent after temporary unavailability of the remote storage (default 0) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. + -remoteWrite.retryMaxTime array + The max time spent on retry attempts to send a block of data to the corresponding -remoteWrite.url. Change this value if it is expected for -remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 1m0s) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. + -remoteWrite.retryMinInterval array + The minimum delay between retry attempts to send a block of data to the corresponding -remoteWrite.url. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxTime (default 1s) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. 
+ -remoteWrite.sendTimeout array + Timeout for sending a single block of data to the corresponding -remoteWrite.url (default 1m0s) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. + -remoteWrite.showURL + Whether to show -remoteWrite.url in the exported metrics. It is hidden by default, since it can contain sensitive info such as auth key + -remoteWrite.tlsCAFile array + Optional path to TLS CA file to use for verifying connections to the corresponding -remoteWrite.url. By default, system CA is used + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.tlsCertFile array + Optional path to client-side TLS certificate file to use when connecting to the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.tlsHandshakeTimeout array + The timeout for establishing tls connections to the corresponding -remoteWrite.url (default 20s) + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to default value. + -remoteWrite.tlsInsecureSkipVerify array + Whether to skip tls verification when connecting to the corresponding -remoteWrite.url + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. + -remoteWrite.tlsKeyFile array + Optional path to client-side TLS certificate key to use when connecting to the corresponding -remoteWrite.url + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -remoteWrite.tlsServerName array + Optional TLS server name to use for connections to the corresponding -remoteWrite.url. By default, the server name from -remoteWrite.url is used + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -remoteWrite.tmpDataPath string + Path to directory for storing pending data, which isn't sent to the configured -remoteWrite.url . See also -remoteWrite.maxDiskUsagePerURL (default "vlagent-remotewrite-data") + -remoteWrite.url array + Remote storage URL to write data to. It must support VictoriaLogs native protocol. Example url: http://<victoria-logs-host>:9428/internal/insert. Pass multiple -remoteWrite.url options in order to replicate the collected data to multiple remote storage systems. + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.compressMethod.tcp array + Compression method for syslog messages received at the corresponding -syslog.listenAddr.tcp. Supported values: none, gzip, deflate. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#compression + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.compressMethod.udp array + Compression method for syslog messages received at the corresponding -syslog.listenAddr.udp. Supported values: none, gzip, deflate. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#compression + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -syslog.decolorizeFields.tcp array + Fields to remove ANSI color codes across logs ingested via the corresponding -syslog.listenAddr.tcp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#decolorizing-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.decolorizeFields.udp array + Fields to remove ANSI color codes across logs ingested via the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#decolorizing-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.extraFields.tcp array + Fields to add to logs ingested via the corresponding -syslog.listenAddr.tcp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#adding-extra-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.extraFields.udp array + Fields to add to logs ingested via the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#adding-extra-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.ignoreFields.tcp array + Fields to ignore at logs ingested via the corresponding -syslog.listenAddr.tcp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#dropping-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -syslog.ignoreFields.udp array + Fields to ignore at logs ingested via the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#dropping-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.listenAddr.tcp array + Comma-separated list of TCP addresses to listen to for Syslog messages. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/ + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.listenAddr.udp array + Comma-separated list of UDP addresses to listen to for Syslog messages. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/ + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.streamFields.tcp array + Fields to use as log stream labels for logs ingested via the corresponding -syslog.listenAddr.tcp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#stream-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.streamFields.udp array + Fields to use as log stream labels for logs ingested via the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#stream-fields + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.tenantID.tcp array + TenantID for logs ingested via the corresponding -syslog.listenAddr.tcp. 
See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#multitenancy + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.tenantID.udp array + TenantID for logs ingested via the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#multitenancy + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.timezone string + Timezone to use when parsing timestamps in RFC3164 syslog messages. Timezone must be a valid IANA Time Zone. For example: America/New_York, Europe/Berlin, Etc/GMT+3 . See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/ (default "Local") + -syslog.tls array + Whether to enable TLS for receiving syslog messages at the corresponding -syslog.listenAddr.tcp. The corresponding -syslog.tlsCertFile and -syslog.tlsKeyFile must be set if -syslog.tls is set. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#security + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. + -syslog.tlsCertFile array + Path to file with TLS certificate for the corresponding -syslog.listenAddr.tcp if the corresponding -syslog.tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower. The provided certificate file is automatically re-read every second, so it can be dynamically updated. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#security + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -syslog.tlsCipherSuites array + Optional list of TLS cipher suites for -syslog.listenAddr.tcp if -syslog.tls is set. See the list of supported cipher suites at https://pkg.go.dev/crypto/tls#pkg-constants . See also https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#security + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.tlsKeyFile array + Path to file with TLS key for the corresponding -syslog.listenAddr.tcp if the corresponding -syslog.tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#security + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -syslog.tlsMinVersion string + The minimum TLS version to use for -syslog.listenAddr.tcp if -syslog.tls is set. Supported values: TLS10, TLS11, TLS12, TLS13. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#security (default "TLS13") + -syslog.useLocalTimestamp.tcp array + Whether to use local timestamp instead of the original timestamp for the ingested syslog messages at the corresponding -syslog.listenAddr.tcp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#log-timestamps + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. + -syslog.useLocalTimestamp.udp array + Whether to use local timestamp instead of the original timestamp for the ingested syslog messages at the corresponding -syslog.listenAddr.udp. See https://docs.victoriametrics.com/victorialogs/data-ingestion/syslog/#log-timestamps + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. 
+ -tls array + Whether to enable TLS for incoming HTTP requests at the given -httpListenAddr (aka https). -tlsCertFile and -tlsKeyFile must be set if -tls is set. See also -mtls + Supports array of values separated by comma or specified via multiple flags. + Empty values are set to false. + -tlsCertFile array + Path to file with TLS certificate for the corresponding -httpListenAddr if -tls is set. Prefer ECDSA certs instead of RSA certs as RSA certs are slower. The provided certificate file is automatically re-read every second, so it can be dynamically updated. See also -tlsAutocertHosts + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -tlsCipherSuites array + Optional list of TLS cipher suites for incoming requests over HTTPS if -tls is set. See the list of supported cipher suites at https://pkg.go.dev/crypto/tls#pkg-constants + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -tlsKeyFile array + Path to file with TLS key for the corresponding -httpListenAddr if -tls is set. The provided key file is automatically re-read every second, so it can be dynamically updated. See also -tlsAutocertHosts + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. + -tlsMinVersion array + Optional minimum TLS version to use for the corresponding -httpListenAddr if -tls is set. Supported values: TLS10, TLS11, TLS12, TLS13 + Supports an array of values separated by comma or specified via multiple flags. + Value can contain comma inside single-quoted or double-quoted string, {}, [] and () braces. 
+ -version + Show VictoriaMetrics version +``` diff --git a/lib/logstorage/log_rows.go b/lib/logstorage/log_rows.go index 3767801e7d..dc46f697b2 100644 --- a/lib/logstorage/log_rows.go +++ b/lib/logstorage/log_rows.go @@ -266,6 +266,11 @@ func (lr *LogRows) Reset() { lr.defaultMsgValue = "" } +// RowsCount returns current log rows count +func (lr *LogRows) RowsCount() int { + return len(lr.rows) +} + // ResetKeepSettings resets rows stored in lr, while keeping its settings passed to GetLogRows(). func (lr *LogRows) ResetKeepSettings() { lr.a.reset()